Apply clang-format to the repository

Code is formatted according to a revised clang-format configuration
file (not part of this delivery). clang-format version 14.0.6 is used.

Exclusion list:
- files with the .cl extension
- files that are not strictly C/C++ (e.g. Android.bp, SConscript ...)
and the following directories:
- compute_kernel_writer/validation/
- tests/
- include/
- src/core/NEON/kernels/convolution/
- src/core/NEON/kernels/arm_gemm/
- src/core/NEON/kernels/arm_conv/
- data/

There will be a follow-up for formatting of the .cl files and the
files under tests/ and compute_kernel_writer/validation/.
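
For reference, the style visible in the hunks below (a space after
control-statement keywords, no inner spaces in braced initializer
lists, a ~120-column limit, aligned consecutive assignments and
macros) is consistent with a .clang-format along these lines. This is
only an illustrative sketch inferred from the diff; the actual revised
configuration file is not part of this delivery and may differ:

    # Illustrative sketch only: the real revised .clang-format is not
    # part of this delivery; values are inferred from the hunks below.
    Language:             Cpp
    ColumnLimit:          120
    IndentWidth:          4
    UseTab:               Never
    BreakBeforeBraces:    Allman            # opening braces on their own line
    SpaceBeforeParens:    ControlStatements # if (...), switch (...), while (...)
    Cpp11BracedListStyle: true              # {AclCpu, AclGpuOcl} without inner spaces
    PointerAlignment:     Right             # IContext *ctx
    AlignConsecutiveAssignments: true       # e.g. the defaulted members in AllocatorWrapper.h
    AlignConsecutiveDeclarations: true      # e.g. the Header members in Object.h
    AlignConsecutiveMacros: true            # e.g. the HWCAP #define blocks
    BinPackParameters:    false             # one parameter per line when a signature wraps
    SortIncludes:         CaseSensitive     # include blocks regrouped and separated

With such a file at the repository root, the reformat itself reduces
to running clang-format -i over the affected sources (for example via
git clang-format or a find sweep), skipping the paths listed above.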

Signed-off-by: Felix Thomasmathibalan <felixjohnny.thomasmathibalan@arm.com>
Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391
Benchmark: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
diff --git a/src/c/AclContext.cpp b/src/c/AclContext.cpp
index 9b8ffea..c6c0820 100644
--- a/src/c/AclContext.cpp
+++ b/src/c/AclContext.cpp
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "arm_compute/AclEntrypoints.h"
-
 #include "arm_compute/core/Error.h"
 
 #include "src/common/IContext.h"
@@ -42,25 +41,25 @@
 template <typename ContextType>
 arm_compute::IContext *create_backend_ctx(const AclContextOptions *options)
 {
-    return new(std::nothrow) ContextType(options);
+    return new (std::nothrow) ContextType(options);
 }
 
 bool is_target_valid(AclTarget target)
 {
-    return arm_compute::utils::is_in(target, { AclCpu, AclGpuOcl });
+    return arm_compute::utils::is_in(target, {AclCpu, AclGpuOcl});
 }
 
 bool are_context_options_valid(const AclContextOptions *options)
 {
     ARM_COMPUTE_ASSERT_NOT_NULLPTR(options);
-    return arm_compute::utils::is_in(options->mode, { AclPreferFastRerun, AclPreferFastStart });
+    return arm_compute::utils::is_in(options->mode, {AclPreferFastRerun, AclPreferFastStart});
 }
 
 arm_compute::IContext *create_context(AclTarget target, const AclContextOptions *options)
 {
     ARM_COMPUTE_UNUSED(options);
 
-    switch(target)
+    switch (target)
     {
 #ifdef ARM_COMPUTE_CPU_ENABLED
         case AclCpu:
@@ -77,24 +76,22 @@
 }
 } // namespace
 
-extern "C" AclStatus AclCreateContext(AclContext              *external_ctx,
-                                      AclTarget                target,
-                                      const AclContextOptions *options)
+extern "C" AclStatus AclCreateContext(AclContext *external_ctx, AclTarget target, const AclContextOptions *options)
 {
-    if(!is_target_valid(target))
+    if (!is_target_valid(target))
     {
         ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Target is invalid!");
         return AclUnsupportedTarget;
     }
 
-    if(options != nullptr && !are_context_options_valid(options))
+    if (options != nullptr && !are_context_options_valid(options))
     {
         ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Context options are invalid!");
         return AclInvalidArgument;
     }
 
     auto ctx = create_context(target, options);
-    if(ctx == nullptr)
+    if (ctx == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Couldn't allocate internal resources for context creation!");
         return AclOutOfMemory;
@@ -113,7 +110,7 @@
     StatusCode status = detail::validate_internal_context(ctx);
     ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
 
-    if(ctx->refcount() != 0)
+    if (ctx->refcount() != 0)
     {
         ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Context has references on it that haven't been released!");
         // TODO: Fix the refcount with callback when reaches 0
diff --git a/src/c/AclQueue.cpp b/src/c/AclQueue.cpp
index 020c6ed..c3e867b 100644
--- a/src/c/AclQueue.cpp
+++ b/src/c/AclQueue.cpp
@@ -38,7 +38,7 @@
 bool is_mode_valid(const AclQueueOptions *options)
 {
     ARM_COMPUTE_ASSERT_NOT_NULLPTR(options);
-    return arm_compute::utils::is_in(options->mode, { AclTuningModeNone, AclRapid, AclNormal, AclExhaustive });
+    return arm_compute::utils::is_in(options->mode, {AclTuningModeNone, AclRapid, AclNormal, AclExhaustive});
 }
 } // namespace
 
@@ -51,14 +51,14 @@
     StatusCode status = detail::validate_internal_context(ctx);
     ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
 
-    if(options != nullptr && !is_mode_valid(options))
+    if (options != nullptr && !is_mode_valid(options))
     {
         ARM_COMPUTE_LOG_ERROR_ACL("Queue options are invalid");
         return AclInvalidArgument;
     }
 
     auto queue = ctx->create_queue(options);
-    if(queue == nullptr)
+    if (queue == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
         return AclOutOfMemory;
diff --git a/src/c/AclTensor.cpp b/src/c/AclTensor.cpp
index 5b18469..c4cd08a 100644
--- a/src/c/AclTensor.cpp
+++ b/src/c/AclTensor.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/AclEntrypoints.h"
 #include "arm_compute/AclUtils.h"
 #include "arm_compute/core/Error.h"
+
 #include "src/common/ITensorV2.h"
 #include "src/common/utils/Macros.h"
 
@@ -41,17 +42,17 @@
  */
 bool is_desc_valid(const AclTensorDescriptor &desc)
 {
-    if(desc.data_type > AclFloat32 || desc.data_type <= AclDataTypeUnknown)
+    if (desc.data_type > AclFloat32 || desc.data_type <= AclDataTypeUnknown)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Unknown data type!");
         return false;
     }
-    if(desc.ndims > max_allowed_dims)
+    if (desc.ndims > max_allowed_dims)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Dimensions surpass the maximum allowed value!");
         return false;
     }
-    if(desc.ndims > 0 && desc.shape == nullptr)
+    if (desc.ndims > 0 && desc.shape == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Dimensions values are empty while dimensionality is > 0!");
         return false;
@@ -66,10 +67,8 @@
 }
 } // namespace
 
-extern "C" AclStatus AclCreateTensor(AclTensor                 *external_tensor,
-                                     AclContext                 external_ctx,
-                                     const AclTensorDescriptor *desc,
-                                     bool                       allocate)
+extern "C" AclStatus
+AclCreateTensor(AclTensor *external_tensor, AclContext external_ctx, const AclTensorDescriptor *desc, bool allocate)
 {
     using namespace arm_compute;
 
@@ -78,14 +77,14 @@
     StatusCode status = detail::validate_internal_context(ctx);
     ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
 
-    if(desc == nullptr || !is_desc_valid(*desc))
+    if (desc == nullptr || !is_desc_valid(*desc))
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Descriptor is invalid!");
         return AclInvalidArgument;
     }
 
     auto tensor = ctx->create_tensor(*desc, allocate);
-    if(tensor == nullptr)
+    if (tensor == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[AclCreateTensor]: Couldn't allocate internal resources for tensor creation!");
         return AclOutOfMemory;
@@ -103,7 +102,7 @@
     StatusCode status = detail::validate_internal_tensor(tensor);
     ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
 
-    if(handle == nullptr)
+    if (handle == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[AclMapTensor]: Handle object is nullptr!");
         return AclInvalidArgument;
@@ -160,12 +159,12 @@
 {
     using namespace arm_compute;
 
-    if(size == nullptr)
+    if (size == nullptr)
     {
         return AclStatus::AclInvalidArgument;
     }
 
-    ITensorV2 *internal_tensor{ nullptr };
+    ITensorV2 *internal_tensor{nullptr};
     auto       status = convert_and_validate_tensor(tensor, &internal_tensor);
     ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
 
@@ -177,15 +176,15 @@
 {
     using namespace arm_compute;
 
-    if(desc == nullptr)
+    if (desc == nullptr)
     {
         return AclStatus::AclInvalidArgument;
     }
 
-    ITensorV2 *internal_tensor{ nullptr };
+    ITensorV2 *internal_tensor{nullptr};
     const auto status = convert_and_validate_tensor(tensor, &internal_tensor);
     ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
 
     *desc = internal_tensor->get_descriptor();
     return utils::as_cenum<AclStatus>(status);
-}
\ No newline at end of file
+}
diff --git a/src/c/AclTensorPack.cpp b/src/c/AclTensorPack.cpp
index 6202524..daf1be4 100644
--- a/src/c/AclTensorPack.cpp
+++ b/src/c/AclTensorPack.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/AclEntrypoints.h"
+
 #include "src/common/ITensorV2.h"
 #include "src/common/TensorPack.h"
 #include "src/common/utils/Macros.h"
@@ -36,7 +37,7 @@
 
     status = detail::validate_internal_tensor(tensor);
 
-    if(status != StatusCode::Success)
+    if (status != StatusCode::Success)
     {
         return status;
     }
@@ -57,7 +58,7 @@
     ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status);
 
     auto pack = new TensorPack(ctx);
-    if(pack == nullptr)
+    if (pack == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_WITH_FUNCNAME_ACL("Couldn't allocate internal resources!");
         return AclOutOfMemory;
@@ -77,14 +78,15 @@
     return AclStatus::AclSuccess;
 }
 
-extern "C" AclStatus AclPackTensors(AclTensorPack external_pack, AclTensor *external_tensors, int32_t *slot_ids, size_t num_tensors)
+extern "C" AclStatus
+AclPackTensors(AclTensorPack external_pack, AclTensor *external_tensors, int32_t *slot_ids, size_t num_tensors)
 {
     using namespace arm_compute;
 
     auto pack = get_internal(external_pack);
     ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(detail::validate_internal_pack(pack));
 
-    for(unsigned i = 0; i < num_tensors; ++i)
+    for (unsigned i = 0; i < num_tensors; ++i)
     {
         ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(PackTensorInternal(*pack, external_tensors[i], slot_ids[i]));
     }
diff --git a/src/c/AclVersion.cpp b/src/c/AclVersion.cpp
index 971189a..a659e90 100644
--- a/src/c/AclVersion.cpp
+++ b/src/c/AclVersion.cpp
@@ -25,8 +25,7 @@
 
 namespace
 {
-constexpr AclVersion version_info
-{
+constexpr AclVersion version_info{
     ARM_COMPUTE_LIBRARY_VERSION_MAJOR,
     ARM_COMPUTE_LIBRARY_VERSION_MINOR,
     ARM_COMPUTE_LIBRARY_VERSION_PATCH,
diff --git a/src/c/cl/AclOpenClExt.cpp b/src/c/cl/AclOpenClExt.cpp
index e72babc..8e42cf5 100644
--- a/src/c/cl/AclOpenClExt.cpp
+++ b/src/c/cl/AclOpenClExt.cpp
@@ -23,13 +23,12 @@
  */
 #include "arm_compute/AclOpenClExt.h"
 
+#include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/common/ITensorV2.h"
 #include "src/common/Types.h"
 #include "src/gpu/cl/ClContext.h"
 #include "src/gpu/cl/ClQueue.h"
-
-#include "arm_compute/core/CL/ICLTensor.h"
-
 #include "support/Cast.h"
 
 extern "C" AclStatus AclGetClContext(AclContext external_ctx, cl_context *opencl_context)
@@ -37,17 +36,17 @@
     using namespace arm_compute;
     IContext *ctx = get_internal(external_ctx);
 
-    if(detail::validate_internal_context(ctx) != StatusCode::Success)
+    if (detail::validate_internal_context(ctx) != StatusCode::Success)
     {
         return AclStatus::AclInvalidArgument;
     }
 
-    if(ctx->type() != Target::GpuOcl)
+    if (ctx->type() != Target::GpuOcl)
     {
         return AclStatus::AclInvalidTarget;
     }
 
-    if(opencl_context == nullptr)
+    if (opencl_context == nullptr)
     {
         return AclStatus::AclInvalidArgument;
     }
@@ -62,23 +61,23 @@
     using namespace arm_compute;
     IContext *ctx = get_internal(external_ctx);
 
-    if(detail::validate_internal_context(ctx) != StatusCode::Success)
+    if (detail::validate_internal_context(ctx) != StatusCode::Success)
     {
         return AclStatus::AclInvalidArgument;
     }
 
-    if(ctx->type() != Target::GpuOcl)
+    if (ctx->type() != Target::GpuOcl)
     {
         return AclStatus::AclInvalidTarget;
     }
 
-    if(ctx->refcount() != 0)
+    if (ctx->refcount() != 0)
     {
         return AclStatus::AclUnsupportedConfig;
     }
 
     auto cl_ctx = utils::cast::polymorphic_downcast<arm_compute::gpu::opencl::ClContext *>(ctx);
-    if(!cl_ctx->set_cl_ctx(::cl::Context(opencl_context)))
+    if (!cl_ctx->set_cl_ctx(::cl::Context(opencl_context)))
     {
         return AclStatus::AclRuntimeError;
     }
@@ -91,17 +90,17 @@
     using namespace arm_compute;
     IContext *ctx = get_internal(external_ctx);
 
-    if(detail::validate_internal_context(ctx) != StatusCode::Success)
+    if (detail::validate_internal_context(ctx) != StatusCode::Success)
     {
         return AclStatus::AclInvalidArgument;
     }
 
-    if(ctx->type() != Target::GpuOcl)
+    if (ctx->type() != Target::GpuOcl)
     {
         return AclStatus::AclInvalidTarget;
     }
 
-    if(opencl_device == nullptr)
+    if (opencl_device == nullptr)
     {
         return AclStatus::AclInvalidArgument;
     }
@@ -116,17 +115,17 @@
     using namespace arm_compute;
     IQueue *queue = get_internal(external_queue);
 
-    if(detail::validate_internal_queue(queue) != StatusCode::Success)
+    if (detail::validate_internal_queue(queue) != StatusCode::Success)
     {
         return AclStatus::AclInvalidArgument;
     }
 
-    if(queue->header.ctx->type() != Target::GpuOcl)
+    if (queue->header.ctx->type() != Target::GpuOcl)
     {
         return AclStatus::AclInvalidTarget;
     }
 
-    if(opencl_queue == nullptr)
+    if (opencl_queue == nullptr)
     {
         return AclStatus::AclInvalidArgument;
     }
@@ -141,18 +140,18 @@
     using namespace arm_compute;
     IQueue *queue = get_internal(external_queue);
 
-    if(detail::validate_internal_queue(queue) != StatusCode::Success)
+    if (detail::validate_internal_queue(queue) != StatusCode::Success)
     {
         return AclStatus::AclInvalidArgument;
     }
 
-    if(queue->header.ctx->type() != Target::GpuOcl)
+    if (queue->header.ctx->type() != Target::GpuOcl)
     {
         return AclStatus::AclInvalidTarget;
     }
 
     auto cl_queue = utils::cast::polymorphic_downcast<arm_compute::gpu::opencl::ClQueue *>(queue);
-    if(!cl_queue->set_cl_queue(::cl::CommandQueue(opencl_queue)))
+    if (!cl_queue->set_cl_queue(::cl::CommandQueue(opencl_queue)))
     {
         return AclStatus::AclRuntimeError;
     }
@@ -165,17 +164,17 @@
     using namespace arm_compute;
     ITensorV2 *tensor = get_internal(external_tensor);
 
-    if(detail::validate_internal_tensor(tensor) != StatusCode::Success)
+    if (detail::validate_internal_tensor(tensor) != StatusCode::Success)
     {
         return AclStatus::AclInvalidArgument;
     }
 
-    if(tensor->header.ctx->type() != Target::GpuOcl)
+    if (tensor->header.ctx->type() != Target::GpuOcl)
     {
         return AclStatus::AclInvalidTarget;
     }
 
-    if(opencl_mem == nullptr)
+    if (opencl_mem == nullptr)
     {
         return AclStatus::AclInvalidArgument;
     }
@@ -184,4 +183,4 @@
     *opencl_mem    = cl_tensor->cl_buffer().get();
 
     return AclStatus::AclSuccess;
-}
\ No newline at end of file
+}
diff --git a/src/common/AllocatorWrapper.cpp b/src/common/AllocatorWrapper.cpp
index 7b5bb34..28d81a9 100644
--- a/src/common/AllocatorWrapper.cpp
+++ b/src/common/AllocatorWrapper.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "src/common/AllocatorWrapper.h"
+
 #include "arm_compute/core/Error.h"
 
 namespace arm_compute
@@ -57,7 +58,7 @@
 
 void AllocatorWrapper::set_user_data(void *user_data)
 {
-    if(user_data != nullptr)
+    if (user_data != nullptr)
     {
         _backing_allocator.user_data = user_data;
     }
diff --git a/src/common/AllocatorWrapper.h b/src/common/AllocatorWrapper.h
index 5e1f138..bbf70a2 100644
--- a/src/common/AllocatorWrapper.h
+++ b/src/common/AllocatorWrapper.h
@@ -37,8 +37,8 @@
      * @param[in] backing_allocator Backing memory allocator to be used
      */
     AllocatorWrapper(const AclAllocator &backing_allocator) noexcept;
-    AllocatorWrapper(const AllocatorWrapper &) noexcept = default;
-    AllocatorWrapper(AllocatorWrapper &&) noexcept      = default;
+    AllocatorWrapper(const AllocatorWrapper &) noexcept            = default;
+    AllocatorWrapper(AllocatorWrapper &&) noexcept                 = default;
     AllocatorWrapper &operator=(const AllocatorWrapper &) noexcept = delete;
     AllocatorWrapper &operator=(AllocatorWrapper &&other) noexcept = default;
     /** Allocate a chunk of memory of a given size in bytes
@@ -78,4 +78,4 @@
 };
 } // namespace arm_compute
 
-#endif /* SRC_COMMON_ALLOCATORWRAPPER_H */
\ No newline at end of file
+#endif /* SRC_COMMON_ALLOCATORWRAPPER_H */
diff --git a/src/common/IContext.h b/src/common/IContext.h
index 65bb767..a221e5d 100644
--- a/src/common/IContext.h
+++ b/src/common/IContext.h
@@ -33,7 +33,7 @@
 
 struct AclContext_
 {
-    arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Context, nullptr };
+    arm_compute::detail::Header header{arm_compute::detail::ObjectType::Context, nullptr};
 
 protected:
     AclContext_()  = default;
@@ -51,8 +51,7 @@
 class IContext : public AclContext_
 {
 public:
-    IContext(Target target)
-        : AclContext_(), _target(target), _refcount(0)
+    IContext(Target target) : AclContext_(), _target(target), _refcount(0)
     {
     }
     /** Virtual Destructor */
@@ -108,11 +107,11 @@
      *
      * @return A pointer to the created queue object
      */
-    virtual IQueue *create_queue(const AclQueueOptions *options) = 0;
-    virtual std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
+    virtual IQueue                             *create_queue(const AclQueueOptions *options) = 0;
+    virtual std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor     &src,
                                                                   const AclTensorDescriptor     &dst,
                                                                   const AclActivationDescriptor &act,
-                                                                  bool                           is_validate) = 0;
+                                                                  bool                           is_validate)          = 0;
 
 private:
     Target                   _target;   /**< Target type of context */
@@ -140,7 +139,7 @@
  */
 inline StatusCode validate_internal_context(const IContext *ctx)
 {
-    if(ctx == nullptr || !ctx->is_valid())
+    if (ctx == nullptr || !ctx->is_valid())
     {
         ARM_COMPUTE_LOG_ERROR_ACL("Invalid context object");
         return StatusCode::InvalidArgument;
diff --git a/src/common/IOperator.cpp b/src/common/IOperator.cpp
index b56f0e9..90e3473 100644
--- a/src/common/IOperator.cpp
+++ b/src/common/IOperator.cpp
@@ -22,13 +22,13 @@
  * SOFTWARE.
  */
 #include "src/common/IOperator.h"
+
 #include "src/common/utils/Validate.h"
 
 namespace arm_compute
 {
 #ifndef DOXYGEN_SKIP_THIS
-IOperator::IOperator(IContext *ctx)
-    : AclOperator_()
+IOperator::IOperator(IContext *ctx) : AclOperator_()
 {
     ARM_COMPUTE_ASSERT_NOT_NULLPTR(ctx);
     this->header.ctx = ctx;
diff --git a/src/common/IOperator.h b/src/common/IOperator.h
index 1b65a09..e86e11f 100644
--- a/src/common/IOperator.h
+++ b/src/common/IOperator.h
@@ -30,13 +30,14 @@
 // TODO: Remove when all functions have been ported
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/IOperator.h"
+
 #include "src/common/utils/Validate.h"
 
 #include <vector>
 
 struct AclOperator_
 {
-    arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Operator, nullptr };
+    arm_compute::detail::Header header{arm_compute::detail::ObjectType::Operator, nullptr};
 
 protected:
     AclOperator_()  = default;
@@ -100,7 +101,7 @@
     }
 
 private:
-    std::unique_ptr<experimental::IOperator> _op{ nullptr };
+    std::unique_ptr<experimental::IOperator> _op{nullptr};
 };
 
 /** Extract internal representation of an Operator
@@ -124,7 +125,7 @@
  */
 inline StatusCode validate_internal_operator(const IOperator *op)
 {
-    if(op == nullptr || !op->is_valid())
+    if (op == nullptr || !op->is_valid())
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[IOperator]: Invalid operator object");
         return StatusCode::InvalidArgument;
diff --git a/src/common/IQueue.h b/src/common/IQueue.h
index 6a0cbc7..60745d2 100644
--- a/src/common/IQueue.h
+++ b/src/common/IQueue.h
@@ -28,7 +28,7 @@
 
 struct AclQueue_
 {
-    arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Queue, nullptr };
+    arm_compute::detail::Header header{arm_compute::detail::ObjectType::Queue, nullptr};
 
 protected:
     AclQueue_()  = default;
@@ -88,7 +88,7 @@
  */
 inline StatusCode validate_internal_queue(const IQueue *queue)
 {
-    if(queue == nullptr || !queue->is_valid())
+    if (queue == nullptr || !queue->is_valid())
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[IQueue]: Invalid queue object");
         return StatusCode::InvalidArgument;
diff --git a/src/common/ITensorV2.cpp b/src/common/ITensorV2.cpp
index 39bf1c6..bf3d963 100644
--- a/src/common/ITensorV2.cpp
+++ b/src/common/ITensorV2.cpp
@@ -22,7 +22,9 @@
  * SOFTWARE.
  */
 #include "src/common/ITensorV2.h"
+
 #include "arm_compute/core/TensorInfo.h"
+
 #include "src/common/utils/LegacySupport.h"
 
 namespace arm_compute
@@ -36,4 +38,4 @@
 {
     return detail::convert_to_descriptor(*tensor()->info());
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/common/ITensorV2.h b/src/common/ITensorV2.h
index 965aace..903bfad 100644
--- a/src/common/ITensorV2.h
+++ b/src/common/ITensorV2.h
@@ -29,7 +29,7 @@
 
 struct AclTensor_
 {
-    arm_compute::detail::Header header{ arm_compute::detail::ObjectType::Tensor, nullptr };
+    arm_compute::detail::Header header{arm_compute::detail::ObjectType::Tensor, nullptr};
 
 protected:
     AclTensor_()  = default;
@@ -49,8 +49,7 @@
      *
      * @param[in] ctx Context to be used by the operator
      */
-    explicit ITensorV2(IContext *ctx)
-        : AclTensor_()
+    explicit ITensorV2(IContext *ctx) : AclTensor_()
     {
         ARM_COMPUTE_ASSERT_NOT_NULLPTR(ctx);
         this->header.ctx = ctx;
@@ -128,7 +127,7 @@
  */
 inline StatusCode validate_internal_tensor(const ITensorV2 *tensor)
 {
-    if(tensor == nullptr || !tensor->is_valid())
+    if (tensor == nullptr || !tensor->is_valid())
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[ITensorV2]: Invalid tensor object");
         return StatusCode::InvalidArgument;
diff --git a/src/common/TensorPack.cpp b/src/common/TensorPack.cpp
index 6c2c7f9..b51fc0b 100644
--- a/src/common/TensorPack.cpp
+++ b/src/common/TensorPack.cpp
@@ -22,13 +22,13 @@
  * SOFTWARE.
  */
 #include "src/common/TensorPack.h"
+
 #include "src/common/ITensorV2.h"
 #include "src/common/utils/Validate.h"
 
 namespace arm_compute
 {
-TensorPack::TensorPack(IContext *ctx)
-    : AclTensorPack_(), _pack()
+TensorPack::TensorPack(IContext *ctx) : AclTensorPack_(), _pack()
 {
     ARM_COMPUTE_ASSERT_NOT_NULLPTR(ctx);
     this->header.ctx = ctx;
diff --git a/src/common/TensorPack.h b/src/common/TensorPack.h
index f330eee..b3d1624 100644
--- a/src/common/TensorPack.h
+++ b/src/common/TensorPack.h
@@ -25,11 +25,12 @@
 #define SRC_COMMON_ITENSORPACK_H_
 
 #include "arm_compute/core/ITensorPack.h"
+
 #include "src/common/IContext.h"
 
 struct AclTensorPack_
 {
-    arm_compute::detail::Header header{ arm_compute::detail::ObjectType::TensorPack, nullptr };
+    arm_compute::detail::Header header{arm_compute::detail::ObjectType::TensorPack, nullptr};
 
 protected:
     AclTensorPack_()  = default;
@@ -118,7 +119,7 @@
  */
 inline StatusCode validate_internal_pack(const TensorPack *pack)
 {
-    if(pack == nullptr || !pack->is_valid())
+    if (pack == nullptr || !pack->is_valid())
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[TensorPack]: Invalid tensor pack object");
         return StatusCode::InvalidArgument;
diff --git a/src/common/cpuinfo/CpuInfo.cpp b/src/common/cpuinfo/CpuInfo.cpp
index cdcdea9..23a4773 100644
--- a/src/common/cpuinfo/CpuInfo.cpp
+++ b/src/common/cpuinfo/CpuInfo.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Log.h"
+
 #include "support/StringSupport.h"
 #include "support/ToolchainSupport.h"
 
@@ -53,16 +54,16 @@
 #endif /* defined(__APPLE__) && defined(__aarch64__)) */
 #endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
 
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID (1 << 11)
-#define ARM_COMPUTE_GET_FEATURE_REG(var, freg) __asm __volatile("MRS %0, " #freg \
-                                                                : "=r"(var))
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID    (1 << 11)
+#define ARM_COMPUTE_GET_FEATURE_REG(var, freg) __asm __volatile("MRS %0, " #freg : "=r"(var))
 namespace arm_compute
 {
 namespace cpuinfo
 {
 namespace
 {
-#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__))
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__))
 /** Extract MIDR using CPUID information that are exposed to user-space
  *
  * @param[in] max_num_cpus Maximum number of possible CPUs
@@ -72,15 +73,15 @@
 std::vector<uint32_t> midr_from_cpuid(uint32_t max_num_cpus)
 {
     std::vector<uint32_t> cpus;
-    for(unsigned int i = 0; i < max_num_cpus; ++i)
+    for (unsigned int i = 0; i < max_num_cpus; ++i)
     {
         std::stringstream str;
         str << "/sys/devices/system/cpu/cpu" << i << "/regs/identification/midr_el1";
         std::ifstream file(str.str(), std::ios::in);
-        if(file.is_open())
+        if (file.is_open())
         {
             std::string line;
-            if(bool(getline(file, line)))
+            if (bool(getline(file, line)))
             {
                 cpus.emplace_back(support::cpp11::stoul(line, nullptr, support::cpp11::NumericBase::BASE_16));
             }
@@ -122,34 +123,35 @@
     ARM_COMPUTE_ERROR_ON_MSG(ret_status != 0, "Regex compilation failed.");
 
     std::ifstream file("/proc/cpuinfo", std::ios::in);
-    if(file.is_open())
+    if (file.is_open())
     {
         std::string line;
         int         midr   = 0;
         int         curcpu = -1;
 
-        while(bool(getline(file, line)))
+        while (bool(getline(file, line)))
         {
             std::array<regmatch_t, 2> match;
             ret_status = regexec(&proc_regex, line.c_str(), 2, match.data(), 0);
-            if(ret_status == 0)
+            if (ret_status == 0)
             {
                 std::string id     = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
                 int         newcpu = support::cpp11::stoi(id, nullptr);
 
-                if(curcpu >= 0 && midr == 0)
+                if (curcpu >= 0 && midr == 0)
                 {
                     // Matched a new CPU ID without any description of the previous one - looks like old format.
                     return {};
                 }
 
-                if(curcpu >= 0 && curcpu < max_num_cpus)
+                if (curcpu >= 0 && curcpu < max_num_cpus)
                 {
                     cpus.emplace_back(midr);
                 }
                 else
                 {
-                    ARM_COMPUTE_LOG_INFO_MSG_CORE("Trying to populate a core id with id greater than the expected number of cores!");
+                    ARM_COMPUTE_LOG_INFO_MSG_CORE(
+                        "Trying to populate a core id with id greater than the expected number of cores!");
                 }
 
                 midr   = 0;
@@ -159,7 +161,7 @@
             }
 
             ret_status = regexec(&imp_regex, line.c_str(), 2, match.data(), 0);
-            if(ret_status == 0)
+            if (ret_status == 0)
             {
                 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
                 int         impv   = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
@@ -169,7 +171,7 @@
             }
 
             ret_status = regexec(&var_regex, line.c_str(), 2, match.data(), 0);
-            if(ret_status == 0)
+            if (ret_status == 0)
             {
                 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
                 int         varv   = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
@@ -179,7 +181,7 @@
             }
 
             ret_status = regexec(&part_regex, line.c_str(), 2, match.data(), 0);
-            if(ret_status == 0)
+            if (ret_status == 0)
             {
                 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
                 int         partv  = support::cpp11::stoi(subexp, nullptr, support::cpp11::NumericBase::BASE_16);
@@ -189,7 +191,7 @@
             }
 
             ret_status = regexec(&rev_regex, line.c_str(), 2, match.data(), 0);
-            if(ret_status == 0)
+            if (ret_status == 0)
             {
                 std::string subexp = line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so));
                 int         regv   = support::cpp11::stoi(subexp, nullptr);
@@ -200,13 +202,14 @@
             }
         }
 
-        if(curcpu >= 0 && curcpu < max_num_cpus)
+        if (curcpu >= 0 && curcpu < max_num_cpus)
         {
             cpus.emplace_back(midr);
         }
         else
         {
-            ARM_COMPUTE_LOG_INFO_MSG_CORE("Trying to populate a core id with id greater than the expected number of cores!");
+            ARM_COMPUTE_LOG_INFO_MSG_CORE(
+                "Trying to populate a core id with id greater than the expected number of cores!");
         }
     }
 
@@ -231,11 +234,11 @@
     CPUspresent.open("/sys/devices/system/cpu/present", std::ios::in);
     bool success = false;
 
-    if(CPUspresent.is_open())
+    if (CPUspresent.is_open())
     {
         std::string line;
 
-        if(bool(getline(CPUspresent, line)))
+        if (bool(getline(CPUspresent, line)))
         {
             /* The content of this file is a list of ranges or single values, e.g.
                  * 0-5, or 1-3,5,7 or similar.  As we are interested in the
@@ -244,9 +247,9 @@
                  */
             auto startfrom = line.begin();
 
-            for(auto i = line.begin(); i < line.end(); ++i)
+            for (auto i = line.begin(); i < line.end(); ++i)
             {
-                if(*i == '-' || *i == ',')
+                if (*i == '-' || *i == ',')
                 {
                     startfrom = i + 1;
                 }
@@ -260,13 +263,14 @@
     }
 
     // Return std::thread::hardware_concurrency() as a fallback.
-    if(!success)
+    if (!success)
     {
         max_cpus = std::thread::hardware_concurrency();
     }
     return max_cpus;
 }
-#elif defined(__aarch64__) && defined(__APPLE__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */
+#elif defined(__aarch64__) && \
+    defined(__APPLE__) /* !defined(BARE_METAL) && !defined(__APPLE__) && (defined(__arm__) || defined(__aarch64__)) */
 /** Query features through sysctlbyname
   *
   * @return int value queried
@@ -278,46 +282,45 @@
     sysctlbyname(cap.c_str(), &result, &size, NULL, 0);
     return result;
 }
-#endif                                           /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
+#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
 
 #if defined(BARE_METAL) && defined(__aarch64__)
 uint64_t get_sve_feature_reg()
 {
     uint64_t svefr0 = 0;
-    __asm __volatile(
-        ".inst 0xd5380483 // mrs x3, ID_AA64ZFR0_EL1\n"
-        "MOV  %0, X3"
-        : "=r"(svefr0)
-        :
-        : "x3");
+    __asm __volatile(".inst 0xd5380483 // mrs x3, ID_AA64ZFR0_EL1\n"
+                     "MOV  %0, X3"
+                     : "=r"(svefr0)
+                     :
+                     : "x3");
     return svefr0;
 }
 #endif /* defined(BARE_METAL) && defined(__aarch64__) */
 } // namespace
 
-CpuInfo::CpuInfo(CpuIsaInfo isa, std::vector<CpuModel> cpus)
-    : _isa(std::move(isa)), _cpus(std::move(cpus))
+CpuInfo::CpuInfo(CpuIsaInfo isa, std::vector<CpuModel> cpus) : _isa(std::move(isa)), _cpus(std::move(cpus))
 {
 }
 
 CpuInfo CpuInfo::build()
 {
-#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__))
+#if !defined(_WIN64) && !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && \
+    (defined(__arm__) || defined(__aarch64__))
     const uint32_t hwcaps   = getauxval(AT_HWCAP);
     const uint32_t hwcaps2  = getauxval(AT_HWCAP2);
     const uint32_t max_cpus = get_max_cpus();
 
     // Populate midr values
     std::vector<uint32_t> cpus_midr;
-    if(hwcaps & ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID)
+    if (hwcaps & ARM_COMPUTE_CPU_FEATURE_HWCAP_CPUID)
     {
         cpus_midr = midr_from_cpuid(max_cpus);
     }
-    if(cpus_midr.empty())
+    if (cpus_midr.empty())
     {
         cpus_midr = midr_from_proc_cpuinfo(max_cpus);
     }
-    if(cpus_midr.empty())
+    if (cpus_midr.empty())
     {
         cpus_midr.resize(max_cpus, 0);
     }
@@ -333,7 +336,9 @@
     CpuInfo info(isa, cpus_model);
     return info;
 
-#elif(BARE_METAL) && defined(__aarch64__)        /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
+#elif (BARE_METAL) && \
+    defined(          \
+        __aarch64__) /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
 
     // Assume single CPU in bare metal mode.  Just read the ID register and feature bits directly.
     uint64_t isar0 = 0, isar1 = 0, pfr0 = 0, pfr1 = 0, svefr0 = 0, midr = 0;
@@ -342,7 +347,7 @@
     ARM_COMPUTE_GET_FEATURE_REG(pfr0, ID_AA64PFR0_EL1);
     ARM_COMPUTE_GET_FEATURE_REG(pfr1, ID_AA64PFR1_EL1);
     ARM_COMPUTE_GET_FEATURE_REG(midr, MIDR_EL1);
-    if((pfr0 >> 32) & 0xf)
+    if ((pfr0 >> 32) & 0xf)
     {
         svefr0 = get_sve_feature_reg();
     }
@@ -361,14 +366,14 @@
     CpuInfo info(isainfo, cpus_model);
     return info;
 #else                                            /* #elif defined(__aarch64__) && defined(__APPLE__) */
-    CpuInfo info(CpuIsaInfo(), { CpuModel::GENERIC });
+    CpuInfo info(CpuIsaInfo(), {CpuModel::GENERIC});
     return info;
-#endif                                           /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
+#endif /* !defined(BARE_METAL) && !defined(__APPLE__) && !defined(__OpenBSD__) && (defined(__arm__) || defined(__aarch64__)) */
 }
 
 CpuModel CpuInfo::cpu_model(uint32_t cpuid) const
 {
-    if(cpuid < _cpus.size())
+    if (cpuid < _cpus.size())
     {
         return _cpus[cpuid];
     }
@@ -377,9 +382,10 @@
 
 CpuModel CpuInfo::cpu_model() const
 {
-#if defined(_WIN64) || defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__))
+#if defined(_WIN64) || defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || \
+    (!defined(__arm__) && !defined(__aarch64__))
     return cpu_model(0);
-#else  /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */
+#else /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */
     return cpu_model(sched_getcpu());
 #endif /* defined(BARE_METAL) || defined(__APPLE__) || defined(__OpenBSD__) || (!defined(__arm__) && !defined(__aarch64__)) */
 }
@@ -406,13 +412,13 @@
 
     // Read cpuinfo and get occurrence of each core
     std::ifstream cpuinfo_file("/proc/cpuinfo", std::ios::in);
-    if(cpuinfo_file.is_open())
+    if (cpuinfo_file.is_open())
     {
         std::string line;
-        while(bool(getline(cpuinfo_file, line)))
+        while (bool(getline(cpuinfo_file, line)))
         {
             std::array<regmatch_t, 2> match;
-            if(regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0) == 0)
+            if (regexec(&cpu_part_rgx, line.c_str(), 2, match.data(), 0) == 0)
             {
                 cpus.emplace_back(line.substr(match[1].rm_so, (match[1].rm_eo - match[1].rm_so)));
             }
@@ -425,13 +431,13 @@
     auto least_frequent_cpu_occurences = [](const std::vector<std::string> &cpus) -> uint32_t
     {
         std::unordered_map<std::string, uint32_t> cpus_freq;
-        for(const auto &cpu : cpus)
+        for (const auto &cpu : cpus)
         {
             cpus_freq[cpu]++;
         }
 
         uint32_t vmin = cpus.size() + 1;
-        for(const auto &cpu_freq : cpus_freq)
+        for (const auto &cpu_freq : cpus_freq)
         {
             vmin = std::min(vmin, cpu_freq.second);
         }
diff --git a/src/common/cpuinfo/CpuIsaInfo.cpp b/src/common/cpuinfo/CpuIsaInfo.cpp
index 23da54a..5977685 100644
--- a/src/common/cpuinfo/CpuIsaInfo.cpp
+++ b/src/common/cpuinfo/CpuIsaInfo.cpp
@@ -24,6 +24,7 @@
 #include "src/common/cpuinfo/CpuIsaInfo.h"
 
 #include "arm_compute/core/Error.h"
+
 #include "src/common/cpuinfo/CpuModel.h"
 
 /* Arm Feature flags */
@@ -31,18 +32,18 @@
 #define ARM_COMPUTE_CPU_FEATURE_HWCAP_NEON (1 << 12)
 
 /* Arm64 Feature flags */
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMD (1 << 1)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP (1 << 9)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP (1 << 10)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDDP (1 << 20)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP_SVE (1 << 22)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVE2 (1 << 1)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEI8MM (1 << 9)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMD     (1 << 1)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP      (1 << 9)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP   (1 << 10)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDDP   (1 << 20)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP_SVE       (1 << 22)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVE2     (1 << 1)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEI8MM  (1 << 9)
 #define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEF32MM (1 << 10)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEBF16 (1 << 12)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_I8MM (1 << 13)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16 (1 << 14)
-#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME (1 << 23)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEBF16  (1 << 12)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_I8MM     (1 << 13)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16     (1 << 14)
+#define ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME      (1 << 23)
 
 namespace arm_compute
 {
@@ -71,12 +72,12 @@
     isa.sve2 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVE2);
 
     // Detection of SME from type HWCAP2 in the auxillary vector
-    isa.sme   = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME);
-    isa.sme2  = isa.sme; // Needs to be set properly
+    isa.sme  = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SME);
+    isa.sme2 = isa.sme; // Needs to be set properly
 
     // Data-type support
-    isa.fp16    = is_feature_supported(hwcaps, ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP | ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP);
-    isa.bf16    = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16);
+    isa.fp16 = is_feature_supported(hwcaps, ARM_COMPUTE_CPU_FEATURE_HWCAP_FPHP | ARM_COMPUTE_CPU_FEATURE_HWCAP_ASIMDHP);
+    isa.bf16 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_BF16);
     isa.svebf16 = is_feature_supported(hwcaps2, ARM_COMPUTE_CPU_FEATURE_HWCAP2_SVEBF16);
 
     // Instruction extensions
@@ -92,12 +93,15 @@
 }
 #endif /* defined(__aarch64__) */
 
-void decode_regs(CpuIsaInfo &isa, const uint64_t isar0, const uint64_t isar1, const uint64_t pfr0, const uint64_t pfr1, const uint64_t svefr0)
+void decode_regs(CpuIsaInfo    &isa,
+                 const uint64_t isar0,
+                 const uint64_t isar1,
+                 const uint64_t pfr0,
+                 const uint64_t pfr1,
+                 const uint64_t svefr0)
 {
     auto is_supported = [](uint64_t feature_reg, uint8_t feature_pos) -> bool
-    {
-        return ((feature_reg >> feature_pos) & 0xf);
-    };
+    { return ((feature_reg >> feature_pos) & 0xf); };
 
     // High-level SIMD support
     isa.sve  = is_supported(pfr0, 32);
@@ -124,11 +128,11 @@
  */
 void allowlisted_model_features(CpuIsaInfo &isa, CpuModel model)
 {
-    if(isa.dot == false)
+    if (isa.dot == false)
     {
         isa.dot = model_supports_dot(model);
     }
-    if(isa.fp16 == false)
+    if (isa.fp16 == false)
     {
         isa.fp16 = model_supports_fp16(model);
     }
@@ -147,7 +151,8 @@
     return isa;
 }
 
-CpuIsaInfo init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr)
+CpuIsaInfo
+init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr)
 {
     CpuIsaInfo isa;
 
diff --git a/src/common/cpuinfo/CpuIsaInfo.h b/src/common/cpuinfo/CpuIsaInfo.h
index b92b653..9d6bc07 100644
--- a/src/common/cpuinfo/CpuIsaInfo.h
+++ b/src/common/cpuinfo/CpuIsaInfo.h
@@ -37,22 +37,22 @@
 struct CpuIsaInfo
 {
     /* SIMD extension support */
-    bool neon{ false };
-    bool sve{ false };
-    bool sve2{ false };
-    bool sme{ false };
-    bool sme2{ false };
+    bool neon{false};
+    bool sve{false};
+    bool sve2{false};
+    bool sme{false};
+    bool sme2{false};
 
     /* Data-type extensions support */
-    bool fp16{ false };
-    bool bf16{ false };
-    bool svebf16{ false };
+    bool fp16{false};
+    bool bf16{false};
+    bool svebf16{false};
 
     /* Instruction support */
-    bool dot{ false };
-    bool i8mm{ false };
-    bool svei8mm{ false };
-    bool svef32mm{ false };
+    bool dot{false};
+    bool i8mm{false};
+    bool svei8mm{false};
+    bool svef32mm{false};
 };
 
 /** Identify ISA related information through system information
@@ -76,7 +76,8 @@
  *
  * @return CpuIsaInfo A populated ISA feature structure
  */
-CpuIsaInfo init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr);
+CpuIsaInfo
+init_cpu_isa_from_regs(uint64_t isar0, uint64_t isar1, uint64_t pfr0, uint64_t pfr1, uint64_t svefr0, uint64_t midr);
 } // namespace cpuinfo
 } // namespace arm_compute
 
diff --git a/src/common/cpuinfo/CpuModel.cpp b/src/common/cpuinfo/CpuModel.cpp
index d6d91df..0455670 100644
--- a/src/common/cpuinfo/CpuModel.cpp
+++ b/src/common/cpuinfo/CpuModel.cpp
@@ -29,12 +29,12 @@
 {
 std::string cpu_model_to_string(CpuModel model)
 {
-    switch(model)
+    switch (model)
     {
 #define X(MODEL)          \
-case CpuModel::MODEL: \
-    return #MODEL;
-            ARM_COMPUTE_CPU_MODEL_LIST
+    case CpuModel::MODEL: \
+        return #MODEL;
+        ARM_COMPUTE_CPU_MODEL_LIST
 #undef X
         default:
         {
@@ -45,7 +45,7 @@
 
 bool model_supports_fp16(CpuModel model)
 {
-    switch(model)
+    switch (model)
     {
         case CpuModel::GENERIC_FP16:
         case CpuModel::GENERIC_FP16_DOT:
@@ -63,7 +63,7 @@
 
 bool model_supports_dot(CpuModel model)
 {
-    switch(model)
+    switch (model)
     {
         case CpuModel::GENERIC_FP16_DOT:
         case CpuModel::A55r1:
@@ -87,16 +87,16 @@
     const int cpunum      = (midr >> 4) & 0xFFF;
 
     // Only CPUs we have code paths for are detected.  All other CPUs can be safely classed as "GENERIC"
-    if(implementer == 0x41) // Arm CPUs
+    if (implementer == 0x41) // Arm CPUs
     {
-        switch(cpunum)
+        switch (cpunum)
         {
             case 0xd03: // A53
             case 0xd04: // A35
                 model = CpuModel::A53;
                 break;
             case 0xd05: // A55
-                if(variant != 0)
+                if (variant != 0)
                 {
                     model = CpuModel::A55r1;
                 }
@@ -109,7 +109,7 @@
                 model = CpuModel::A73;
                 break;
             case 0xd0a: // A75
-                if(variant != 0)
+                if (variant != 0)
                 {
                     model = CpuModel::GENERIC_FP16_DOT;
                 }
@@ -144,9 +144,9 @@
                 break;
         }
     }
-    else if(implementer == 0x46)
+    else if (implementer == 0x46)
     {
-        switch(cpunum)
+        switch (cpunum)
         {
             case 0x001: // A64FX
                 model = CpuModel::A64FX;
@@ -156,9 +156,9 @@
                 break;
         }
     }
-    else if(implementer == 0x48)
+    else if (implementer == 0x48)
     {
-        switch(cpunum)
+        switch (cpunum)
         {
             case 0xd40: // A76
                 model = CpuModel::GENERIC_FP16_DOT;
@@ -168,9 +168,9 @@
                 break;
         }
     }
-    else if(implementer == 0x51)
+    else if (implementer == 0x51)
     {
-        switch(cpunum)
+        switch (cpunum)
         {
             case 0x800: // A73
                 model = CpuModel::A73;
@@ -196,4 +196,4 @@
     return model;
 }
 } // namespace cpuinfo
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/common/cpuinfo/CpuModel.h b/src/common/cpuinfo/CpuModel.h
index 4fe6c29..3b9d9e3 100644
--- a/src/common/cpuinfo/CpuModel.h
+++ b/src/common/cpuinfo/CpuModel.h
@@ -24,11 +24,11 @@
 #ifndef SRC_COMMON_CPUINFO_CPUMODEL_H
 #define SRC_COMMON_CPUINFO_CPUMODEL_H
 
+#include "arm_compute/core/CPP/CPPTypes.h"
+
 #include <cstdint>
 #include <string>
 
-#include "arm_compute/core/CPP/CPPTypes.h"
-
 namespace arm_compute
 {
 namespace cpuinfo
diff --git a/src/common/utils/LegacySupport.cpp b/src/common/utils/LegacySupport.cpp
index 06b1693..1026442 100644
--- a/src/common/utils/LegacySupport.cpp
+++ b/src/common/utils/LegacySupport.cpp
@@ -33,7 +33,7 @@
 {
 DataType convert_to_legacy_data_type(AclDataType data_type)
 {
-    switch(data_type)
+    switch (data_type)
     {
         case AclDataType::AclFloat32:
             return DataType::F32;
@@ -48,7 +48,7 @@
 
 AclDataType convert_to_c_data_type(DataType data_type)
 {
-    switch(data_type)
+    switch (data_type)
     {
         case DataType::F32:
             return AclDataType::AclFloat32;
@@ -64,7 +64,7 @@
 TensorShape create_legacy_tensor_shape(int32_t ndims, int32_t *shape)
 {
     TensorShape legacy_shape{};
-    for(int32_t d = 0; d < ndims; ++d)
+    for (int32_t d = 0; d < ndims; ++d)
     {
         legacy_shape.set(d, shape[d], false);
     }
@@ -73,14 +73,14 @@
 int32_t *create_tensor_shape_array(const TensorInfo &info)
 {
     const auto num_dims = info.num_dimensions();
-    if(num_dims <= 0)
+    if (num_dims <= 0)
     {
         return nullptr;
     }
 
     int32_t *shape_array = new int32_t[num_dims];
 
-    for(size_t d = 0; d < num_dims; ++d)
+    for (size_t d = 0; d < num_dims; ++d)
     {
         shape_array[d] = info.tensor_shape()[d];
     }
@@ -92,28 +92,23 @@
 TensorInfo convert_to_legacy_tensor_info(const AclTensorDescriptor &desc)
 {
     TensorInfo legacy_desc;
-    legacy_desc.init(create_legacy_tensor_shape(desc.ndims, desc.shape), 1, convert_to_legacy_data_type(desc.data_type));
+    legacy_desc.init(create_legacy_tensor_shape(desc.ndims, desc.shape), 1,
+                     convert_to_legacy_data_type(desc.data_type));
     return legacy_desc;
 }
 
 AclTensorDescriptor convert_to_descriptor(const TensorInfo &info)
 {
     const auto          num_dims = info.num_dimensions();
-    AclTensorDescriptor desc
-    {
-        static_cast<int32_t>(num_dims),
-        create_tensor_shape_array(info),
-        convert_to_c_data_type(info.data_type()),
-        nullptr,
-        0
-    };
+    AclTensorDescriptor desc{static_cast<int32_t>(num_dims), create_tensor_shape_array(info),
+                             convert_to_c_data_type(info.data_type()), nullptr, 0};
     return desc;
 }
 
 ActivationLayerInfo convert_to_activation_info(const AclActivationDescriptor &desc)
 {
     ActivationLayerInfo::ActivationFunction act;
-    switch(desc.type)
+    switch (desc.type)
     {
         case AclActivationType::AclIdentity:
             act = ActivationLayerInfo::ActivationFunction::IDENTITY;
diff --git a/src/common/utils/Log.h b/src/common/utils/Log.h
index bbfe1ce..6ebfed3 100644
--- a/src/common/utils/Log.h
+++ b/src/common/utils/Log.h
@@ -38,20 +38,22 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/logging/Macros.h"
+
 #include "utils/TypePrinter.h"
 
 /** Create a logger
  *
  * @note It will eventually create all default loggers in don't exist
  */
-#define ARM_COMPUTE_CREATE_ACL_LOGGER()                                                                                        \
-    do                                                                                                                         \
-    {                                                                                                                          \
-        if(arm_compute::logging::LoggerRegistry::get().logger("ComputeLibrary") == nullptr)                                    \
-        {                                                                                                                      \
-            arm_compute::logging::LoggerRegistry::get().create_logger("ComputeLibrary", arm_compute::logging::LogLevel::INFO); \
-        }                                                                                                                      \
-    } while(false)
+#define ARM_COMPUTE_CREATE_ACL_LOGGER()                                                                      \
+    do                                                                                                       \
+    {                                                                                                        \
+        if (arm_compute::logging::LoggerRegistry::get().logger("ComputeLibrary") == nullptr)                 \
+        {                                                                                                    \
+            arm_compute::logging::LoggerRegistry::get().create_logger("ComputeLibrary",                      \
+                                                                      arm_compute::logging::LogLevel::INFO); \
+        }                                                                                                    \
+    } while (false)
 
 /** Log a message to the logger
  *
@@ -63,7 +65,7 @@
     {                                                          \
         ARM_COMPUTE_CREATE_ACL_LOGGER();                       \
         ARM_COMPUTE_LOG_MSG("ComputeLibrary", log_level, msg); \
-    } while(false)
+    } while (false)
 
 /** Log a message with format to the logger
  *
@@ -76,7 +78,7 @@
     {                                                                                   \
         ARM_COMPUTE_CREATE_ACL_LOGGER();                                                \
         ARM_COMPUTE_LOG_MSG_WITH_FORMAT("ComputeLibrary", log_level, fmt, __VA_ARGS__); \
-    } while(false)
+    } while (false)
 
 /** Log an error message to the logger
  *
@@ -87,7 +89,7 @@
     {                                                                                      \
         ARM_COMPUTE_CREATE_ACL_LOGGER();                                                   \
         ARM_COMPUTE_LOG_MSG("ComputeLibrary", arm_compute::logging::LogLevel::ERROR, msg); \
-    } while(false)
+    } while (false)
 
 /** Log an error message to the logger with function name before the message
  *
@@ -98,7 +100,7 @@
     {                                                                                                    \
         ARM_COMPUTE_CREATE_ACL_LOGGER();                                                                 \
         ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME("ComputeLibrary", arm_compute::logging::LogLevel::ERROR, msg); \
-    } while(false)
+    } while (false)
 
 /** Log an information message to the logger with function name before the message
  *
@@ -109,7 +111,7 @@
     {                                                                                                   \
         ARM_COMPUTE_CREATE_ACL_LOGGER();                                                                \
         ARM_COMPUTE_LOG_MSG_WITH_FUNCNAME("ComputeLibrary", arm_compute::logging::LogLevel::INFO, msg); \
-    } while(false)
+    } while (false)
 
 /** Function template specialization for the out of bound element at index = tuple_size
  *
@@ -131,12 +133,13 @@
  * @param[in]     in_params_tuple Constant reference to a tuple of different input data types
  */
 template <std::size_t Index, typename... Tp>
-inline typename std::enable_if < Index<sizeof...(Tp), void>::type
-logParamsImpl(std::vector<std::string> &data_registry, const std::tuple<Tp...> &in_params_tuple)
+    inline typename std::enable_if <
+    Index<sizeof...(Tp), void>::type logParamsImpl(std::vector<std::string> &data_registry,
+                                                   const std::tuple<Tp...>  &in_params_tuple)
 {
     data_registry.push_back(arm_compute::to_string(std::get<Index>(in_params_tuple)));
     // Unfold the next tuple element
-    logParamsImpl < Index + 1, Tp... > (data_registry, in_params_tuple);
+    logParamsImpl<Index + 1, Tp...>(data_registry, in_params_tuple);
 }
 
 /** Function Template with variable number of inputs to collect all the passed parameters from
@@ -149,10 +152,10 @@
  * @return  Vector of the parameters' data in a string format
  */
 template <typename... Ts>
-const std::vector<std::string> logParams(Ts &&... ins)
+const std::vector<std::string> logParams(Ts &&...ins)
 {
     std::vector<std::string> data_registry{};
-    std::tuple<Ts...>        in_params_tuple{ ins... };
+    std::tuple<Ts...>        in_params_tuple{ins...};
 
     // Start logging the tuple elements, starting from 0 to tuple_size-1
     logParamsImpl<0>(data_registry, in_params_tuple);
@@ -178,11 +181,11 @@
 
     // Usually the input parameters string would be name of parameters separated
     // by ',' e.g. "src0, src1, policy"
-    while(std::getline(ss, temp, ','))
+    while (std::getline(ss, temp, ','))
     {
         names.push_back(temp);
     }
-    for(auto &name : names)
+    for (auto &name : names)
     {
         // Totally get rid of white space characters
         name.erase(std::remove(name.begin(), name.end(), ' '), name.end());
@@ -205,7 +208,7 @@
 {
     std::string dataLog = "\n ";
     ARM_COMPUTE_ERROR_ON(params_names.size() != data_registry.size());
-    for(uint8_t i = 0; i < params_names.size(); ++i)
+    for (uint8_t i = 0; i < params_names.size(); ++i)
     {
         dataLog += params_names[i] + ": " + data_registry.at(i) + "\n ";
     }
@@ -220,11 +223,11 @@
  *
  * @param[in] ... Input parameters
  */
-#define ARM_COMPUTE_LOG_PARAMS(...)                                                           \
-    do                                                                                        \
-    {                                                                                         \
-        ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL(constructDataLog(getParamsNames(#__VA_ARGS__), \
-                                                                logParams(__VA_ARGS__)));     \
-    } while(false)
+#define ARM_COMPUTE_LOG_PARAMS(...)                                                  \
+    do                                                                               \
+    {                                                                                \
+        ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL(                                      \
+            constructDataLog(getParamsNames(#__VA_ARGS__), logParams(__VA_ARGS__))); \
+    } while (false)
 #endif /* ARM_COMPUTE_LOGGING_ENABLED */
 #endif /* SRC_COMMON_LOG_H */
diff --git a/src/common/utils/Macros.h b/src/common/utils/Macros.h
index 2e44ea5..35f7e75 100644
--- a/src/common/utils/Macros.h
+++ b/src/common/utils/Macros.h
@@ -28,7 +28,7 @@
 
 #define ARM_COMPUTE_RETURN_CENUM_ON_FAILURE(status)                 \
     {                                                               \
-        if(status != arm_compute::StatusCode::Success)              \
+        if (status != arm_compute::StatusCode::Success)             \
         {                                                           \
             return arm_compute::utils::as_cenum<AclStatus>(status); \
         }                                                           \
diff --git a/src/common/utils/Object.h b/src/common/utils/Object.h
index 1f19473..b73de8e 100644
--- a/src/common/utils/Object.h
+++ b/src/common/utils/Object.h
@@ -52,14 +52,12 @@
      * @param[in] type_ Object identification type
      * @param[in] ctx_  Context to reference
      */
-    Header(ObjectType type_, IContext *ctx_) noexcept
-        : type(type_),
-          ctx(ctx_)
+    Header(ObjectType type_, IContext *ctx_) noexcept : type(type_), ctx(ctx_)
     {
     }
 
-    ObjectType type{ ObjectType::Invalid };
-    IContext *ctx{ nullptr };
+    ObjectType type{ObjectType::Invalid};
+    IContext  *ctx{nullptr};
 };
 } // namespace detail
 } // namespace arm_compute
diff --git a/src/common/utils/Utils.h b/src/common/utils/Utils.h
index 1bd1c7e..33fe6c0 100644
--- a/src/common/utils/Utils.h
+++ b/src/common/utils/Utils.h
@@ -74,10 +74,7 @@
 template <typename E>
 bool is_in(E check, std::initializer_list<E> list)
 {
-    return std::any_of(list.begin(), list.end(), [&check](E e)
-    {
-        return check == e;
-    });
+    return std::any_of(list.begin(), list.end(), [&check](E e) { return check == e; });
 }
 } // namespace utils
 } // namespace arm_compute
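The hunk above only collapses the short lambda onto one line; is_in() behaves exactly as before. A self-contained usage sketch, with an illustrative enum:

    // Illustrative enum; any equality-comparable type works.
    enum class Mode { Fast, Safe, Debug };

    bool is_mode_allowed(Mode m)
    {
        // true for Mode::Fast or Mode::Safe, false for Mode::Debug
        return arm_compute::utils::is_in(m, {Mode::Fast, Mode::Safe});
    }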
diff --git a/src/common/utils/Validate.h b/src/common/utils/Validate.h
index 4e88072..97819c6 100644
--- a/src/common/utils/Validate.h
+++ b/src/common/utils/Validate.h
@@ -29,7 +29,7 @@
 
 #include <cassert>
 
-#define ARM_COMPUTE_ASSERT(cond) assert(cond)
+#define ARM_COMPUTE_ASSERT(cond)            assert(cond)
 #define ARM_COMPUTE_ASSERT_NOT_NULLPTR(ptr) assert((ptr) != nullptr)
 
 #else /* defined(ARM_COMPUTE_ASSERTS_ENABLED) */
diff --git a/src/core/AccessWindowAutoPadding.cpp b/src/core/AccessWindowAutoPadding.cpp
index ca2f7d2..52be699 100644
--- a/src/core/AccessWindowAutoPadding.cpp
+++ b/src/core/AccessWindowAutoPadding.cpp
@@ -28,12 +28,14 @@
 
 using namespace arm_compute;
 
-AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info)
-    : _info(info)
+AccessWindowAutoPadding::AccessWindowAutoPadding(ITensorInfo *info) : _info(info)
 {
 }
 
-ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowAutoPadding::compute_valid_region(const Window &window,
+                                                          ValidRegion   input_valid_region,
+                                                          bool          border_undefined,
+                                                          BorderSize    border_size) const
 {
     ARM_COMPUTE_UNUSED(window);
     ARM_COMPUTE_UNUSED(input_valid_region);
@@ -45,17 +47,17 @@
 
 ValidRegion AccessWindowAutoPadding::compute_valid_region() const
 {
-    if(_info == nullptr)
+    if (_info == nullptr)
     {
         return ValidRegion{};
     }
 
-    return ValidRegion{ Coordinates(), _info->tensor_shape() };
+    return ValidRegion{Coordinates(), _info->tensor_shape()};
 }
 
 void AccessWindowAutoPadding::set_valid_region()
 {
-    if(_info == nullptr)
+    if (_info == nullptr)
     {
         return;
     }
@@ -75,7 +77,7 @@
     ARM_COMPUTE_UNUSED(window);
 
     // Only update the padding if the tensor allows it
-    if(_info == nullptr || !_info->is_resizable())
+    if (_info == nullptr || !_info->is_resizable())
     {
         return false;
     }
diff --git a/src/core/AccessWindowAutoPadding.h b/src/core/AccessWindowAutoPadding.h
index b8d1508..406bdba 100644
--- a/src/core/AccessWindowAutoPadding.h
+++ b/src/core/AccessWindowAutoPadding.h
@@ -74,9 +74,12 @@
     ValidRegion compute_valid_region() const;
 
     // Inherited methods overridden:
-    bool update_window_if_needed(Window &window) const override;
-    bool update_padding_if_needed(const Window &window) override;
-    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+    bool        update_window_if_needed(Window &window) const override;
+    bool        update_padding_if_needed(const Window &window) override;
+    ValidRegion compute_valid_region(const Window &window,
+                                     ValidRegion   input_valid_region,
+                                     bool          border_undefined,
+                                     BorderSize    border_size) const override;
 
 private:
     ITensorInfo *_info;
diff --git a/src/core/AccessWindowStatic.cpp b/src/core/AccessWindowStatic.cpp
index 0607011..98182b1 100644
--- a/src/core/AccessWindowStatic.cpp
+++ b/src/core/AccessWindowStatic.cpp
@@ -34,7 +34,10 @@
 {
 }
 
-ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowStatic::compute_valid_region(const Window &window,
+                                                     ValidRegion   input_valid_region,
+                                                     bool          border_undefined,
+                                                     BorderSize    border_size) const
 {
     ARM_COMPUTE_UNUSED(border_undefined);
     ARM_COMPUTE_UNUSED(border_size);
@@ -44,7 +47,7 @@
 
 ValidRegion AccessWindowStatic::compute_valid_region(const Window &window, ValidRegion input_valid_region) const
 {
-    if(_info == nullptr)
+    if (_info == nullptr)
     {
         return input_valid_region;
     }
@@ -57,7 +60,7 @@
     // Start of the valid region is equal to the start of the static access but
     // never outside of the tensor.
     anchor.set(0, std::max<int>(0, _start_x));
-    if(_info->num_dimensions() > 1)
+    if (_info->num_dimensions() > 1)
     {
         anchor.set(1, std::max<int>(0, _start_y));
     }
@@ -65,7 +68,7 @@
     // End of the valid region is equal to the end of the static access but
     // never outside of the tensor.
     shape.set(0, std::min<int>(_end_x, _info->tensor_shape()[0]));
-    if(_info->num_dimensions() > 1)
+    if (_info->num_dimensions() > 1)
     {
         shape.set(1, std::min<int>(_end_y, _info->tensor_shape()[1]));
     }
@@ -75,7 +78,7 @@
 
 void AccessWindowStatic::set_valid_region(const Window &window, const ValidRegion &input_valid_region)
 {
-    if(_info != nullptr)
+    if (_info != nullptr)
     {
         _info->set_valid_region(compute_valid_region(window, input_valid_region));
     }
@@ -84,7 +87,7 @@
 bool AccessWindowStatic::update_window_if_needed(Window &window) const
 {
     // If the padding is not enough and the tensor is not resizable, shrink the window to size 0
-    if(_info == nullptr || _info->is_resizable())
+    if (_info == nullptr || _info->is_resizable())
     {
         return false;
     }
@@ -96,48 +99,50 @@
     bool window_modified = false;
 
     // Calculate if padding is enough
-    if(_start_y < 0)
+    if (_start_y < 0)
     {
         const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
 
-        if(_start_y < front_pad_y_available)
+        if (_start_y < front_pad_y_available)
         {
             window_modified = true;
         }
     }
 
-    if(!window_modified)
+    if (!window_modified)
     {
-        if(_end_y > static_cast<int>(shape[1]))
+        if (_end_y > static_cast<int>(shape[1]))
         {
             const int stride_z             = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
             const int tail_pad_y_available = (stride_z / strides[1]) - shape[1];
 
-            if(static_cast<int>(shape[1]) + tail_pad_y_available < _end_y)
+            if (static_cast<int>(shape[1]) + tail_pad_y_available < _end_y)
             {
                 window_modified = true;
             }
         }
 
-        if(!window_modified)
+        if (!window_modified)
         {
             const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
 
-            if(_start_x < 0)
+            if (_start_x < 0)
             {
-                const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element), stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+                const int front_pad_x_available =
+                    -std::min<int>(static_cast<int>(offset_first_element), stride_y - shape[0] * strides[0]) /
+                    static_cast<int>(strides[0]);
 
-                if(_start_x < front_pad_x_available)
+                if (_start_x < front_pad_x_available)
                 {
                     window_modified = true;
                 }
             }
 
-            if(!window_modified && _end_x > static_cast<int>(shape[0]))
+            if (!window_modified && _end_x > static_cast<int>(shape[0]))
             {
                 const int tail_pad_x_available = (stride_y / strides[0]) - shape[0];
 
-                if(static_cast<int>(shape[0]) + tail_pad_x_available < _end_x)
+                if (static_cast<int>(shape[0]) + tail_pad_x_available < _end_x)
                 {
                     window_modified = true;
                 }
@@ -146,9 +151,9 @@
     }
 
     // If padding is not enough
-    if(window_modified)
+    if (window_modified)
     {
-        for(size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
+        for (size_t i = 0; i < Coordinates::num_max_dimensions; ++i)
         {
             window.set(i, Window::Dimension(0, 0, 1));
         }
@@ -162,7 +167,7 @@
     ARM_COMPUTE_UNUSED(window);
 
     // Only update the padding if the tensor allows it
-    if(_info == nullptr || !_info->is_resizable())
+    if (_info == nullptr || !_info->is_resizable())
     {
         return false;
     }
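The re-braced padding checks in update_window_if_needed() above are easier to follow with concrete numbers. A worked trace of the front-padding test, using illustrative values that do not appear in the patch:

    // Illustrative numbers only.
    constexpr int offset_first_element = 64; // bytes to the first element
    constexpr int row_stride           = 32; // strides[1], bytes per row
    // Rows available above the tensor: -(64 / 32) = -2, i.e. a static access
    // may reach up to y = -2 using existing padding.
    constexpr int front_pad_y_available = -(offset_first_element / row_stride);
    // A static access with _start_y == -3 needs three rows of top padding but
    // only two exist, so _start_y < front_pad_y_available holds and the whole
    // window is shrunk to Window::Dimension(0, 0, 1) in every dimension.
    static_assert(-3 < front_pad_y_available, "padding is insufficient");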
diff --git a/src/core/AccessWindowStatic.h b/src/core/AccessWindowStatic.h
index f7d43cb..5c6d2c7 100644
--- a/src/core/AccessWindowStatic.h
+++ b/src/core/AccessWindowStatic.h
@@ -86,9 +86,12 @@
     ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region) const;
 
     // Inherited methods overridden:
-    bool update_window_if_needed(Window &window) const override;
-    bool update_padding_if_needed(const Window &window) override;
-    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+    bool        update_window_if_needed(Window &window) const override;
+    bool        update_padding_if_needed(const Window &window) override;
+    ValidRegion compute_valid_region(const Window &window,
+                                     ValidRegion   input_valid_region,
+                                     bool          border_undefined,
+                                     BorderSize    border_size) const override;
 
 private:
     ITensorInfo *_info;
diff --git a/src/core/AccessWindowTranspose.cpp b/src/core/AccessWindowTranspose.cpp
index d8bd4c4..42f0081 100644
--- a/src/core/AccessWindowTranspose.cpp
+++ b/src/core/AccessWindowTranspose.cpp
@@ -29,9 +29,12 @@
 
 using namespace arm_compute;
 
-ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowTranspose::compute_valid_region(const Window &window,
+                                                        ValidRegion   input_valid_region,
+                                                        bool          border_undefined,
+                                                        BorderSize    border_size) const
 {
-    if(_info == nullptr)
+    if (_info == nullptr)
     {
         return input_valid_region;
     }
@@ -41,7 +44,7 @@
     Coordinates  old_anchor(anchor);
     TensorShape  old_shape(shape);
 
-    if(!border_undefined)
+    if (!border_undefined)
     {
         border_size = BorderSize(0);
     }
@@ -53,7 +56,7 @@
     // the kernel to write back output values.
     // As the relation between input and output is transposed window.y() is
     // used for x anchor and window.x() for y anchor.
-    if(_info->dimension(0) > 1)
+    if (_info->dimension(0) > 1)
     {
         anchor.set(0, std::max<int>(window.y().start() * _scale_x, anchor[1] + border_size.top) + _x);
     }
@@ -69,15 +72,19 @@
     // a size of the region.
     // As the relation between input and output is transposed window.y() is
     // used for x shape and window.x() for y shape.
-    if(_info->dimension(0) > 1)
+    if (_info->dimension(0) > 1)
     {
-        shape.set(0, std::min<int>((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right, (window.y().end() - window.y().step()) * _scale_x + _width) - anchor[0]);
+        shape.set(0, std::min<int>((old_anchor[1] + old_shape[0]) * _scale_x - border_size.right,
+                                   (window.y().end() - window.y().step()) * _scale_x + _width) -
+                         anchor[0]);
     }
-    shape.set(1, std::min<int>((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom, (window.x().end() - window.x().step()) * _scale_y + _height) - anchor[1]);
+    shape.set(1, std::min<int>((old_anchor[0] + old_shape[1]) * _scale_y - border_size.bottom,
+                               (window.x().end() - window.x().step()) * _scale_y + _height) -
+                     anchor[1]);
 
     // For higher dimensions use the intersection of the window size and the
     // valid region of the input
-    for(size_t d = 2; d < _info->num_dimensions(); ++d)
+    for (size_t d = 2; d < _info->num_dimensions(); ++d)
     {
         anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
         shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
@@ -89,7 +96,7 @@
 bool AccessWindowTranspose::update_window_if_needed(Window &window) const
 {
     // Only update the window size if we can't use padding
-    if(_info == nullptr || _info->is_resizable())
+    if (_info == nullptr || _info->is_resizable())
     {
         return false;
     }
@@ -107,12 +114,12 @@
     const int max_y = window.x().end() * _scale_y + _y;
 
     // Adjust window start for output's Y dimension (so X in (input) window)
-    if(min_y < 0)
+    if (min_y < 0)
     {
         // Calculate rows available above the tensor
         const int front_pad_y_available = -offset_first_element / strides[1];
 
-        if(min_y < front_pad_y_available)
+        if (min_y < front_pad_y_available)
         {
             // Not enough padding available, need to shrink the window
             const int start = adjust_up(min_y, front_pad_y_available, window.x().step() * _scale_y) - _y;
@@ -126,17 +133,18 @@
     }
 
     // Adjust window end for Y dimension
-    if(max_y > static_cast<int>(shape[1]))
+    if (max_y > static_cast<int>(shape[1]))
     {
         const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
 
         // Calculate rows available below the tensor
         const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
 
-        if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
+        if (static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
         {
             // Not enough padding available, need to shrink the window
-            const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) + window.x().step() * _scale_y - _y - _height;
+            const int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.x().step() * _scale_y) +
+                            window.x().step() * _scale_y - _y - _height;
             window.set(0, Window::Dimension(window.x().start(), end / _scale_y, window.x().step()));
             window_modified = true;
         }
@@ -151,11 +159,14 @@
     const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
 
     // Adjust window start for X dimension
-    if(min_x < 0)
+    if (min_x < 0)
     {
-        const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+        const int front_pad_x_available =
+            -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1],
+                           stride_y - shape[0] * strides[0]) /
+            static_cast<int>(strides[0]);
 
-        if(min_x < front_pad_x_available)
+        if (min_x < front_pad_x_available)
         {
             // Not enough padding available, need to shrink the window
             const int start = adjust_up(min_x, front_pad_x_available, window.y().step() * _scale_x) - _x;
@@ -168,14 +179,15 @@
     }
 
     // Adjust window end for X dimension
-    if(max_x > static_cast<int>(shape[0]))
+    if (max_x > static_cast<int>(shape[0]))
     {
         const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
 
-        if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
+        if (static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
         {
             // Not enough padding available, need to shrink the window
-            const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) + window.y().step() * _scale_x - _x - _width;
+            const int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.y().step() * _scale_x) +
+                            window.y().step() * _scale_x - _x - _width;
             window.set(1, Window::Dimension(window.y().start(), end / _scale_x, window.y().step()));
             window_modified = true;
         }
@@ -189,7 +201,7 @@
 bool AccessWindowTranspose::update_padding_if_needed(const Window &window)
 {
     // Only update the padding if the tensor allows it
-    if(_info == nullptr || !_info->is_resizable())
+    if (_info == nullptr || !_info->is_resizable())
     {
         return false;
     }
diff --git a/src/core/AccessWindowTranspose.h b/src/core/AccessWindowTranspose.h
index 0306076..12bb9a5 100644
--- a/src/core/AccessWindowTranspose.h
+++ b/src/core/AccessWindowTranspose.h
@@ -42,7 +42,10 @@
     bool update_window_if_needed(Window &window) const override;
     bool update_padding_if_needed(const Window &window) override;
     using AccessWindowRectangle::compute_valid_region;
-    ValidRegion compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const override;
+    ValidRegion compute_valid_region(const Window &window,
+                                     ValidRegion   input_valid_region,
+                                     bool          border_undefined,
+                                     BorderSize    border_size) const override;
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_IACCESS_WINDOW_TRANSPOSE_H*/
diff --git a/src/core/CL/CLCommandBuffer.cpp b/src/core/CL/CLCommandBuffer.cpp
index 7fcfdf2..d094dcd 100644
--- a/src/core/CL/CLCommandBuffer.cpp
+++ b/src/core/CL/CLCommandBuffer.cpp
@@ -38,7 +38,7 @@
     const auto &cl_device            = CLKernelLibrary::get().get_device();
     const auto  has_mutable_dispatch = command_buffer_mutable_dispatch_supported(cl_device);
 
-    if(has_mutable_dispatch)
+    if (has_mutable_dispatch)
     {
         return std::make_unique<CLMutableCommandBuffer>(queue);
     }
diff --git a/src/core/CL/CLCommandBuffer.h b/src/core/CL/CLCommandBuffer.h
index 8a94e38..90e4341 100644
--- a/src/core/CL/CLCommandBuffer.h
+++ b/src/core/CL/CLCommandBuffer.h
@@ -87,7 +87,8 @@
      * @param[in] global The global work size.
      * @param[in] local  The local work size.
      */
-    virtual void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0;
+    virtual void
+    add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) = 0;
 
     /** Add the mutable argument to the current kernel enqueue command.
      *
@@ -154,7 +155,7 @@
     CLCommandBuffer &state(State state);
 
 private:
-    State _state{ State::Created };
+    State _state{State::Created};
 };
 
 } // namespace arm_compute
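Read together, the factory body in CLCommandBuffer.cpp and the State member above imply a simple record/finalize/enqueue lifecycle. A hedged sketch; only the factory's body is visible in this hunk, so the name `create` is an assumption, and queue/kernel are placeholders:

    // Hedged lifecycle sketch under the assumptions above.
    void run_once(cl_command_queue queue, cl_kernel kernel)
    {
        auto cb = CLCommandBuffer::create(queue); // mutable-dispatch path if supported
        // add_kernel() asserts state() == State::Created
        cb->add_kernel(kernel, cl::NullRange, cl::NDRange(256), cl::NullRange);
        cb->finalize(); // moves to State::Finalized; recording is frozen
        cb->enqueue();  // asserts state() == State::Finalized before submitting
    }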
diff --git a/src/core/CL/CLCompatCommandBuffer.cpp b/src/core/CL/CLCompatCommandBuffer.cpp
index f1a902c..242fd77 100644
--- a/src/core/CL/CLCompatCommandBuffer.cpp
+++ b/src/core/CL/CLCompatCommandBuffer.cpp
@@ -31,8 +31,7 @@
 namespace arm_compute
 {
 
-CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue)
-    : _queue(queue)
+CLCompatCommandBuffer::CLCompatCommandBuffer(cl_command_queue queue) : _queue(queue)
 {
 }
 
@@ -40,11 +39,14 @@
 {
 }
 
-void CLCompatCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local)
+void CLCompatCommandBuffer::add_kernel(cl_kernel          kernel,
+                                       const cl::NDRange &offset,
+                                       const cl::NDRange &global,
+                                       const cl::NDRange &local)
 {
     ARM_COMPUTE_ERROR_ON(state() != State::Created);
 
-    _kernel_cmds.push_back(KernelCommand{ kernel, offset, global, local, {} });
+    _kernel_cmds.push_back(KernelCommand{kernel, offset, global, local, {}});
 }
 
 void CLCompatCommandBuffer::add_mutable_argument_generic(cl_uint arg_idx, const void *value, size_t size)
@@ -52,7 +54,7 @@
     ARM_COMPUTE_ERROR_ON(state() != State::Created);
     ARM_COMPUTE_ERROR_ON(_kernel_cmds.empty());
 
-    _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{ arg_idx, size, value });
+    _kernel_cmds.back().mutable_args.push_back(cl_mutable_dispatch_arg_khr{arg_idx, size, value});
 }
 
 void CLCompatCommandBuffer::finalize()
@@ -61,7 +63,7 @@
 
     _kernel_cmds.shrink_to_fit();
 
-    for(auto &cmd : _kernel_cmds)
+    for (auto &cmd : _kernel_cmds)
     {
         cmd.mutable_args.shrink_to_fit();
     }
@@ -80,25 +82,19 @@
 {
     ARM_COMPUTE_ERROR_ON(state() != State::Finalized);
 
-    for(const auto &cmd : _kernel_cmds)
+    for (const auto &cmd : _kernel_cmds)
     {
-        for(const auto &arg : cmd.mutable_args)
+        for (const auto &arg : cmd.mutable_args)
         {
             const auto error = clSetKernelArg(cmd.kernel, arg.arg_index, arg.arg_size, arg.arg_value);
 
             handle_cl_error("clSetKernelArg", error);
         }
 
-        const auto error = clEnqueueNDRangeKernel(
-            _queue,
-            cmd.kernel,
-            static_cast<cl_uint>(cmd.global.dimensions()),
-            cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr,
-            cmd.global.get(),
-            cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr,
-            0,
-            nullptr,
-            nullptr);
+        const auto error =
+            clEnqueueNDRangeKernel(_queue, cmd.kernel, static_cast<cl_uint>(cmd.global.dimensions()),
+                                   cmd.offset.dimensions() != 0 ? cmd.offset.get() : nullptr, cmd.global.get(),
+                                   cmd.local.dimensions() != 0 ? cmd.local.get() : nullptr, 0, nullptr, nullptr);
 
         handle_cl_error("clEnqueueNDRangeKernel", error);
     }
diff --git a/src/core/CL/CLCompatCommandBuffer.h b/src/core/CL/CLCompatCommandBuffer.h
index e91d52d..d5df106 100644
--- a/src/core/CL/CLCompatCommandBuffer.h
+++ b/src/core/CL/CLCompatCommandBuffer.h
@@ -57,7 +57,10 @@
     /** Disallow move assignment. */
     CLCompatCommandBuffer &operator=(CLCompatCommandBuffer &&) = delete;
 
-    void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) override;
+    void add_kernel(cl_kernel          kernel,
+                    const cl::NDRange &offset,
+                    const cl::NDRange &global,
+                    const cl::NDRange &local) override;
 
     void finalize() override;
 
diff --git a/src/core/CL/CLCompileContext.cpp b/src/core/CL/CLCompileContext.cpp
index 2d024f9..9bbc326 100644
--- a/src/core/CL/CLCompileContext.cpp
+++ b/src/core/CL/CLCompileContext.cpp
@@ -22,19 +22,19 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/CL/CLCompileContext.h"
-#include "arm_compute/core/CL/OpenCL.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Utils.h"
+
 #include "support/StringSupport.h"
 
 #include <regex>
 
 namespace arm_compute
 {
-CLBuildOptions::CLBuildOptions()
-    : _build_opts()
+CLBuildOptions::CLBuildOptions() : _build_opts()
 {
 }
 
@@ -45,7 +45,7 @@
 
 void CLBuildOptions::add_option_if(bool cond, std::string option)
 {
-    if(cond)
+    if (cond)
     {
         add_option(std::move(option));
     }
@@ -63,7 +63,7 @@
 
 void CLBuildOptions::add_options_if(bool cond, const StringSet &options)
 {
-    if(cond)
+    if (cond)
     {
         add_options(options);
     }
@@ -79,26 +79,35 @@
     return _build_opts == other._build_opts;
 }
 
-Program::Program()
-    : _context(), _device(), _is_binary(false), _name(), _source(), _binary()
+Program::Program() : _context(), _device(), _is_binary(false), _name(), _source(), _binary()
 {
 }
 
 Program::Program(cl::Context context, std::string name, std::string source)
-    : _context(std::move(context)), _device(), _is_binary(false), _name(std::move(name)), _source(std::move(source)), _binary()
+    : _context(std::move(context)),
+      _device(),
+      _is_binary(false),
+      _name(std::move(name)),
+      _source(std::move(source)),
+      _binary()
 {
 }
 
 Program::Program(cl::Context context, cl::Device device, std::string name, std::vector<unsigned char> binary)
-    : _context(std::move(context)), _device(std::move(device)), _is_binary(true), _name(std::move(name)), _source(), _binary(std::move(binary))
+    : _context(std::move(context)),
+      _device(std::move(device)),
+      _is_binary(true),
+      _name(std::move(name)),
+      _source(),
+      _binary(std::move(binary))
 {
 }
 
 Program::operator cl::Program() const
 {
-    if(_is_binary)
+    if (_is_binary)
     {
-        return cl::Program(_context, { _device }, { _binary });
+        return cl::Program(_context, {_device}, {_binary});
     }
     else
     {
@@ -112,12 +121,12 @@
     {
         return program.build(build_options.c_str()) == CL_SUCCESS;
     }
-    catch(const cl::Error &e)
+    catch (const cl::Error &e)
     {
         cl_int     err        = CL_SUCCESS;
         const auto build_info = program.getBuildInfo<CL_PROGRAM_BUILD_LOG>(&err);
 
-        for(auto &pair : build_info)
+        for (auto &pair : build_info)
         {
             std::cerr << pair.second << std::endl;
         }
@@ -133,14 +142,12 @@
     return cl_program;
 }
 
-Kernel::Kernel()
-    : _name(), _kernel()
+Kernel::Kernel() : _name(), _kernel()
 {
 }
 
 Kernel::Kernel(std::string name, const cl::Program &program)
-    : _name(std::move(name)),
-      _kernel(cl::Kernel(program, _name.c_str()))
+    : _name(std::move(name)), _kernel(cl::Kernel(program, _name.c_str()))
 {
 }
 CLCompileContext::CLCompileContext()
@@ -156,15 +163,19 @@
     _is_wbsm_supported = get_wbsm_support_info(device);
 }
 
-Kernel CLCompileContext::create_kernel(const std::string &kernel_name, const std::string &program_name, const std::string &program_source,
-                                       const std::string &kernel_path, const StringSet &build_options_set, bool is_binary) const
+Kernel CLCompileContext::create_kernel(const std::string &kernel_name,
+                                       const std::string &program_name,
+                                       const std::string &program_source,
+                                       const std::string &kernel_path,
+                                       const StringSet   &build_options_set,
+                                       bool               is_binary) const
 {
     const std::string build_options      = generate_build_options(build_options_set, kernel_path);
     const std::string built_program_name = program_name + "_" + build_options;
     auto              built_program_it   = _built_programs_map.find(built_program_name);
     cl::Program       cl_program;
 
-    if(_built_programs_map.end() != built_program_it)
+    if (_built_programs_map.end() != built_program_it)
     {
         // If program has been built, retrieve to create kernel from it
         cl_program = built_program_it->second;
@@ -184,11 +195,12 @@
     return Kernel(kernel_name, cl_program);
 }
 
-const Program &CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const
+const Program &
+CLCompileContext::load_program(const std::string &program_name, const std::string &program_source, bool is_binary) const
 {
     const auto program_it = _programs_map.find(program_name);
 
-    if(program_it != _programs_map.end())
+    if (program_it != _programs_map.end())
     {
         return program_it->second;
     }
@@ -199,9 +211,10 @@
     ARM_COMPUTE_UNUSED(is_binary);
     program = Program(_context, program_name, program_source);
 #else  /* EMBEDDED_KERNELS */
-    if(is_binary)
+    if (is_binary)
     {
-        program = Program(_context, _device.cl_device(), program_name, std::vector<unsigned char>(program_source.begin(), program_source.end()));
+        program = Program(_context, _device.cl_device(), program_name,
+                          std::vector<unsigned char>(program_source.begin(), program_source.end()));
     }
     else
     {
@@ -218,18 +231,19 @@
 void CLCompileContext::set_context(cl::Context context)
 {
     _context = std::move(context);
-    if(_context.get() != nullptr)
+    if (_context.get() != nullptr)
     {
         const auto cl_devices = _context.getInfo<CL_CONTEXT_DEVICES>();
 
-        if(!cl_devices.empty())
+        if (!cl_devices.empty())
         {
             _device = CLDevice(cl_devices[0]);
         }
     }
 }
 
-std::string CLCompileContext::generate_build_options(const StringSet &build_options_set, const std::string &kernel_path) const
+std::string CLCompileContext::generate_build_options(const StringSet   &build_options_set,
+                                                     const std::string &kernel_path) const
 {
     std::string concat_str;
     bool        ext_supported = false;
@@ -241,27 +255,27 @@
 #endif // defined(ARM_COMPUTE_DEBUG_ENABLED)
 
     GPUTarget gpu_arch = get_arch_from_target(_device.target());
-    concat_str += " -DGPU_ARCH=" + support::cpp11::to_string(
-                      static_cast<std::underlying_type<GPUTarget>::type>(gpu_arch));
+    concat_str +=
+        " -DGPU_ARCH=" + support::cpp11::to_string(static_cast<std::underlying_type<GPUTarget>::type>(gpu_arch));
 
-    if(_device.supported("cl_khr_fp16"))
+    if (_device.supported("cl_khr_fp16"))
     {
         concat_str += " -DARM_COMPUTE_OPENCL_FP16_ENABLED=1 ";
     }
 
-    if(_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product"))
+    if (_device.supported("cl_arm_integer_dot_product_int8") || _device.supported("cl_khr_integer_dot_product"))
     {
         concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ENABLED=1 ";
     }
 
-    if(_device.supported("cl_arm_integer_dot_product_accumulate_int8"))
+    if (_device.supported("cl_arm_integer_dot_product_accumulate_int8"))
     {
         concat_str += " -DARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED=1 ";
     }
 
     std::tie(ext_supported, ext_buildopts) = _device.is_non_uniform_workgroup_supported();
 
-    if(ext_supported)
+    if (ext_supported)
     {
         concat_str += ext_buildopts;
     }
@@ -270,7 +284,7 @@
         ARM_COMPUTE_ERROR("Non uniform workgroup size is not supported!!");
     }
 
-    if(gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11)
+    if (gpu_arch != GPUTarget::UNKNOWN && gpu_arch != GPUTarget::MIDGARD && get_ddk_version() >= 11)
     {
         concat_str += " -DUNROLL_WITH_PRAGMA ";
     }
@@ -295,7 +309,7 @@
 #endif /* EMBEDDED_KERNELS */
 
     // Concatenate set
-    for(const auto &el : s)
+    for (const auto &el : s)
     {
         concat_set += " " + el;
     }
@@ -340,7 +354,7 @@
     GPUTarget   _target = get_target_from_device(_device.cl_device());
     cl::NDRange default_range;
 
-    switch(_target)
+    switch (_target)
     {
         case GPUTarget::MIDGARD:
         case GPUTarget::T600:
@@ -370,7 +384,8 @@
     size_t result;
 
     size_t err = kernel.getWorkGroupInfo(_device.cl_device(), CL_KERNEL_WORK_GROUP_SIZE, &result);
-    ARM_COMPUTE_ERROR_ON_MSG(err != 0, "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
+    ARM_COMPUTE_ERROR_ON_MSG(err != 0,
+                             "clGetKernelWorkGroupInfo failed to return the maximum workgroup size for the kernel");
     ARM_COMPUTE_UNUSED(err);
 
     return result;
@@ -392,7 +407,7 @@
     const std::regex  ddk_regex("r([0-9]*)p[0-9]");
     std::smatch       ddk_match;
 
-    if(std::regex_search(device_version, ddk_match, ddk_regex))
+    if (std::regex_search(device_version, ddk_match, ddk_regex))
     {
         return std::stoi(ddk_match[1]);
     }
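Several hunks in this file (and in CLHelpers.cpp and CLKernelLibrary.cpp below) are pure include reshuffling. The governing IncludeCategories rules live in the revised .clang-format, which is not part of this delivery, so the grouping below is only what the diff itself exhibits, shown on a hypothetical file:

    #include "arm_compute/core/CL/CLCompileContext.h" // 1. the file's own header

    #include "arm_compute/core/CL/CLHelpers.h" // 2. other arm_compute/ headers,
    #include "arm_compute/core/Error.h"        //    alphabetically sorted

    #include "src/gpu/cl/ClKernelLibrary.h" // 3. internal src/ headers

    #include "support/StringSupport.h" // 4. support/ headers

    #include <regex> // 5. system headers last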
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp
index 78f3610..5ea99d3 100644
--- a/src/core/CL/CLHelpers.cpp
+++ b/src/core/CL/CLHelpers.cpp
@@ -22,14 +22,15 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/CL/CLHelpers.h"
+
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLTypes.h"
-#include "arm_compute/core/utils/DataTypeUtils.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Log.h"
 #include "arm_compute/core/Types.h"
-#include "src/gpu/cl/ClCompileContext.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
 
+#include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/ClKernelLibrary.h"
 
 #include <utility>
@@ -39,7 +40,7 @@
 {
 std::string get_cl_type_from_data_type(const DataType &dt)
 {
-    switch(dt)
+    switch (dt)
     {
         case DataType::U8:
         case DataType::QASYMM8:
@@ -75,7 +76,7 @@
 
 std::string get_cl_promoted_type_from_data_type(const DataType &dt)
 {
-    switch(dt)
+    switch (dt)
     {
         case DataType::U8:
         case DataType::QASYMM8:
@@ -105,7 +106,7 @@
 
 std::string get_cl_unsigned_type_from_element_size(size_t element_size)
 {
-    switch(element_size)
+    switch (element_size)
     {
         case 1:
             return "uchar";
@@ -123,7 +124,7 @@
 
 std::string get_cl_signed_type_from_element_size(size_t element_size)
 {
-    switch(element_size)
+    switch (element_size)
     {
         case 1:
             return "char";
@@ -141,7 +142,7 @@
 
 std::string get_cl_select_type_from_data_type(const DataType &dt)
 {
-    switch(dt)
+    switch (dt)
     {
         case DataType::U8:
         case DataType::QASYMM8:
@@ -174,7 +175,7 @@
 
 std::string get_cl_dot8_acc_type_from_data_type(const DataType &dt)
 {
-    switch(dt)
+    switch (dt)
     {
         case DataType::U8:
         case DataType::QASYMM8:
@@ -192,7 +193,7 @@
 
 std::string get_data_size_from_data_type(const DataType &dt)
 {
-    switch(dt)
+    switch (dt)
     {
         case DataType::U8:
         case DataType::S8:
@@ -244,8 +245,9 @@
     const GPUTarget gpu_target  = get_target_from_name(device_name);
 
     // SW_WORKAROUND: Workaround for DDK revision r14p0 to enable cl_arm_integer_dot_product_int8
-    std::set<GPUTarget> sw_workaround_issue = { GPUTarget::G76 };
-    return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") || sw_workaround_issue.count(gpu_target) != 0);
+    std::set<GPUTarget> sw_workaround_issue = {GPUTarget::G76};
+    return (device_supports_extension(device, "cl_arm_integer_dot_product_int8") ||
+            sw_workaround_issue.count(gpu_target) != 0);
 }
 
 bool dot8_acc_supported(const cl::Device &device)
@@ -256,23 +258,23 @@
 CLVersion get_cl_version(const cl::Device &device)
 {
     std::string version_str = device.getInfo<CL_DEVICE_VERSION>();
-    if(version_str.find("OpenCL 3") != std::string::npos)
+    if (version_str.find("OpenCL 3") != std::string::npos)
     {
         return CLVersion::CL30;
     }
-    else if(version_str.find("OpenCL 2") != std::string::npos)
+    else if (version_str.find("OpenCL 2") != std::string::npos)
     {
         return CLVersion::CL20;
     }
-    else if(version_str.find("OpenCL 1.2") != std::string::npos)
+    else if (version_str.find("OpenCL 1.2") != std::string::npos)
     {
         return CLVersion::CL12;
     }
-    else if(version_str.find("OpenCL 1.1") != std::string::npos)
+    else if (version_str.find("OpenCL 1.1") != std::string::npos)
     {
         return CLVersion::CL11;
     }
-    else if(version_str.find("OpenCL 1.0") != std::string::npos)
+    else if (version_str.find("OpenCL 1.0") != std::string::npos)
     {
         return CLVersion::CL10;
     }
@@ -287,14 +289,15 @@
     return (pos != std::string::npos);
 }
 
-bool cl_winograd_convolution_layer_supported(const Size2D &output_tile, const Size2D &kernel_size, DataLayout data_layout)
+bool cl_winograd_convolution_layer_supported(const Size2D &output_tile,
+                                             const Size2D &kernel_size,
+                                             DataLayout    data_layout)
 {
     ARM_COMPUTE_ERROR_ON(data_layout == DataLayout::UNKNOWN);
 
     using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
 
-    std::vector<WinogradConfiguration> winograd_configs_nchw =
-    {
+    std::vector<WinogradConfiguration> winograd_configs_nchw = {
         WinogradConfiguration(std::pair<int, int>(1, 2), std::pair<int, int>(1, 3)),
         WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
         WinogradConfiguration(std::pair<int, int>(2, 1), std::pair<int, int>(3, 1)),
@@ -303,11 +306,9 @@
         WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3)),
         WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
         WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(5, 1)),
-        WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))
-    };
+        WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 5))};
 
-    std::vector<WinogradConfiguration> winograd_configs_nhwc =
-    {
+    std::vector<WinogradConfiguration> winograd_configs_nhwc = {
         WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(3, 3)),
         WinogradConfiguration(std::pair<int, int>(1, 4), std::pair<int, int>(1, 3)),
         WinogradConfiguration(std::pair<int, int>(4, 1), std::pair<int, int>(3, 1)),
@@ -324,19 +325,21 @@
                             std::pair<int, int>(kernel_size.width, kernel_size.height));
 
     // Return true if supported
-    if(data_layout == DataLayout::NCHW)
+    if (data_layout == DataLayout::NCHW)
     {
-        return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) != winograd_configs_nchw.end());
+        return (std::find(winograd_configs_nchw.begin(), winograd_configs_nchw.end(), p) !=
+                winograd_configs_nchw.end());
     }
     else
     {
-        return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) != winograd_configs_nhwc.end());
+        return (std::find(winograd_configs_nhwc.begin(), winograd_configs_nhwc.end(), p) !=
+                winograd_configs_nhwc.end());
     }
 }
 
 size_t preferred_vector_width(const cl::Device &device, const DataType dt)
 {
-    switch(dt)
+    switch (dt)
     {
         case DataType::U8:
         case DataType::S8:
@@ -382,7 +385,7 @@
 
     cl_int err = clGetDeviceInfo(device(), CL_DEVICE_IMAGE_PITCH_ALIGNMENT, sizeof(cl_uint), &pixel_aligment, nullptr);
 
-    if(err == CL_SUCCESS)
+    if (err == CL_SUCCESS)
     {
         return pixel_aligment;
     }
@@ -396,12 +399,14 @@
 {
     cl_bool supported = CL_FALSE;
 
-    cl_int err = clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr);
+    cl_int err =
+        clGetDeviceInfo(device(), CL_DEVICE_NON_UNIFORM_WORK_GROUP_SUPPORT, sizeof(cl_bool), &supported, nullptr);
 
     return (err == CL_SUCCESS && supported == CL_TRUE);
 }
 
-cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts)
+cl::Kernel
+create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set<std::string> &build_opts)
 {
     opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
 
@@ -409,7 +414,8 @@
     auto              kernel_src   = klib.program(program_name);
     const std::string kernel_path  = klib.kernel_path();
 
-    return static_cast<cl::Kernel>(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path, build_opts, kernel_src.is_binary));
+    return static_cast<cl::Kernel>(ctx.create_kernel(kernel_name, program_name, kernel_src.program, kernel_path,
+                                                     build_opts, kernel_src.is_binary));
 }
 
 cl::NDRange create_lws_hint_parallel_implementations(unsigned int input_dimension, unsigned int vector_size)
@@ -423,8 +429,9 @@
 bool get_wbsm_support_info(const cl::Device &device)
 {
     cl_bitfield capabilities = 0;
-    cl_int      err          = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield), &capabilities, nullptr);
-    if((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM))
+    cl_int      err = clGetDeviceInfo(device.get(), CL_DEVICE_SCHEDULING_CONTROLS_CAPABILITIES_ARM, sizeof(cl_bitfield),
+                                      &capabilities, nullptr);
+    if ((err == CL_SUCCESS) && (capabilities & CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM))
     {
         return true;
     }
@@ -433,35 +440,33 @@
 
 void set_wbsm(cl::Kernel &kernel, cl_int wbsm_hint)
 {
-    cl_int err = clSetKernelExecInfo(kernel.get(),
-                                     CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM,
-                                     sizeof(cl_int),
-                                     &wbsm_hint);
+    cl_int err = clSetKernelExecInfo(kernel.get(), CL_KERNEL_EXEC_INFO_WORKGROUP_BATCH_SIZE_MODIFIER_ARM,
+                                     sizeof(cl_int), &wbsm_hint);
     ARM_COMPUTE_UNUSED(err);
     ARM_COMPUTE_ERROR_ON(err != CL_SUCCESS);
 }
 
 bool export_to_cl_image(const ITensorInfo *tensor)
 {
-    if(tensor->tensor_shape()[0] % 4 != 0)
+    if (tensor->tensor_shape()[0] % 4 != 0)
     {
         return false;
     }
 
     // If not floating point
-    if(!is_data_type_float(tensor->data_type()))
+    if (!is_data_type_float(tensor->data_type()))
     {
         return false;
     }
 
     // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform
-    if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
+    if (!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()))
     {
         return false;
     }
 
     // Check cl image pitch alignment
-    if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0)
+    if (get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0)
     {
         return false;
     }
@@ -471,7 +476,7 @@
     const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
     const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
 
-    if(image_w > max_image_w || image_h > max_image_h)
+    if (image_w > max_image_w || image_h > max_image_h)
     {
         return false;
     }
@@ -481,9 +486,9 @@
 
 void set_unroll_with_pragma(CLBuildOptions &built_opts, std::initializer_list<int> values)
 {
-    for(const int value : values)
+    for (const int value : values)
     {
-        if(value > max_manual_loop_unrolling)
+        if (value > max_manual_loop_unrolling)
         {
             built_opts.add_option("-DUNROLL_WITH_PRAGMA");
             return;
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index c5a0796..e69d006 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -22,8 +22,11 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+
 #include "arm_compute/core/Error.h"
+
 #include "src/gpu/cl/ClKernelLibrary.h"
+
 #include <algorithm>
 #include <array>
 #include <fstream>
@@ -31,8 +34,7 @@
 #include <vector>
 namespace arm_compute
 {
-CLKernelLibrary::CLKernelLibrary()
-    : _compile_context()
+CLKernelLibrary::CLKernelLibrary() : _compile_context()
 {
     opencl_is_available(); // Make sure the OpenCL symbols are initialised *before* the CLKernelLibrary is built
 }
@@ -41,13 +43,15 @@
     static CLKernelLibrary _kernel_library;
     return _kernel_library;
 }
-Kernel CLKernelLibrary::create_kernel(const std::string &kernel_name, const std::set<std::string> &build_options_set) const
+Kernel CLKernelLibrary::create_kernel(const std::string           &kernel_name,
+                                      const std::set<std::string> &build_options_set) const
 {
     const opencl::ClKernelLibrary &klib         = opencl::ClKernelLibrary::get();
     const std::string              program_name = klib.program_name(kernel_name);
     auto                           program      = klib.program(program_name);
     const std::string             &kernel_path  = CLKernelLibrary::get().get_kernel_path();
-    return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set, program.is_binary);
+    return _compile_context.create_kernel(kernel_name, program_name, program.program, kernel_path, build_options_set,
+                                          program.is_binary);
 }
 std::string CLKernelLibrary::get_program_name(const std::string &kernel_name) const
 {
@@ -131,4 +135,4 @@
 {
     return _compile_context;
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/CL/CLMutableCommandBuffer.cpp b/src/core/CL/CLMutableCommandBuffer.cpp
index b9c59ac..05b351f 100644
--- a/src/core/CL/CLMutableCommandBuffer.cpp
+++ b/src/core/CL/CLMutableCommandBuffer.cpp
@@ -31,8 +31,7 @@
 namespace arm_compute
 {
 
-CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue)
-    : CLCommandBuffer()
+CLMutableCommandBuffer::CLMutableCommandBuffer(cl_command_queue queue) : CLCommandBuffer()
 {
     cl_int status = CL_SUCCESS;
 
@@ -52,7 +51,10 @@
     handle_cl_error("clReleaseCommandBufferKHR", status);
 }
 
-void CLMutableCommandBuffer::add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local)
+void CLMutableCommandBuffer::add_kernel(cl_kernel          kernel,
+                                        const cl::NDRange &offset,
+                                        const cl::NDRange &global,
+                                        const cl::NDRange &local)
 {
     ARM_COMPUTE_ERROR_ON(state() != State::Created);
 
@@ -65,18 +67,8 @@
     };
 
     const auto error = clCommandNDRangeKernelKHR(
-        _cb,
-        nullptr,
-        properties,
-        kernel,
-        global.dimensions(),
-        offset.dimensions() != 0 ? offset.get() : nullptr,
-        global.get(),
-        local.dimensions() != 0 ? local.get() : nullptr,
-        0,
-        nullptr,
-        nullptr,
-        &mutable_handle);
+        _cb, nullptr, properties, kernel, global.dimensions(), offset.dimensions() != 0 ? offset.get() : nullptr,
+        global.get(), local.dimensions() != 0 ? local.get() : nullptr, 0, nullptr, nullptr, &mutable_handle);
 
     handle_cl_error("clCommandNDRangeKernelKHR", error);
 
@@ -114,7 +106,7 @@
 
     size_t arg_no = 0;
 
-    for(auto &mut_dispatch_cfg : _mut_dispatch_cfgs)
+    for (auto &mut_dispatch_cfg : _mut_dispatch_cfgs)
     {
         ARM_COMPUTE_ERROR_ON(arg_no >= _mut_arg_cfgs.size());
         mut_dispatch_cfg.arg_list = &_mut_arg_cfgs[arg_no];
@@ -132,9 +124,7 @@
 {
     ARM_COMPUTE_ERROR_ON(state() != State::Finalized);
 
-    const auto error = clUpdateMutableCommandsKHR(
-        _cb,
-        &_mut_cfg);
+    const auto error = clUpdateMutableCommandsKHR(_cb, &_mut_cfg);
 
     handle_cl_error("clUpdateMutableCommandsKHR", error);
 }
@@ -143,13 +133,7 @@
 {
     ARM_COMPUTE_ERROR_ON(state() != State::Finalized);
 
-    const auto error = clEnqueueCommandBufferKHR(
-        0,
-        nullptr,
-        _cb,
-        0,
-        nullptr,
-        nullptr);
+    const auto error = clEnqueueCommandBufferKHR(0, nullptr, _cb, 0, nullptr, nullptr);
 
     handle_cl_error("clEnqueueCommandBufferKHR", error);
 }
diff --git a/src/core/CL/CLMutableCommandBuffer.h b/src/core/CL/CLMutableCommandBuffer.h
index 04e94b0..8997d7d 100644
--- a/src/core/CL/CLMutableCommandBuffer.h
+++ b/src/core/CL/CLMutableCommandBuffer.h
@@ -57,7 +57,10 @@
     /** Disallow move assignment. */
     CLMutableCommandBuffer &operator=(CLMutableCommandBuffer &&) = delete;
 
-    void add_kernel(cl_kernel kernel, const cl::NDRange &offset, const cl::NDRange &global, const cl::NDRange &local) override;
+    void add_kernel(cl_kernel          kernel,
+                    const cl::NDRange &offset,
+                    const cl::NDRange &global,
+                    const cl::NDRange &local) override;
 
     void finalize() override;
 
diff --git a/src/core/CL/CLUtils.cpp b/src/core/CL/CLUtils.cpp
index 289300b..290ed32 100644
--- a/src/core/CL/CLUtils.cpp
+++ b/src/core/CL/CLUtils.cpp
@@ -26,9 +26,10 @@
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -38,15 +39,15 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
 
     const cl::Context &ctx    = CLKernelLibrary::get().context();
-    const cl::Buffer &buffer = tensor->cl_buffer();
+    const cl::Buffer  &buffer = tensor->cl_buffer();
     const ITensorInfo *info   = tensor->info();
-    ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(),
-                             "Tensor paddings must not be locked to allow extending paddings to satisfy cl_image pitch alignment requirement");
+    ARM_COMPUTE_ERROR_ON_MSG(info->lock_paddings(), "Tensor paddings must not be locked to allow extending paddings to "
+                                                    "satisfy cl_image pitch alignment requirement");
 
-    const size_t image_w{ info->dimension(0) / 4 };
-    const size_t image_h{ info->tensor_shape().total_size() / info->dimension(0) };
-    const size_t max_image_w{ CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>() };
-    const size_t max_image_h{ CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>() };
+    const size_t image_w{info->dimension(0) / 4};
+    const size_t image_h{info->tensor_shape().total_size() / info->dimension(0)};
+    const size_t max_image_w{CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>()};
+    const size_t max_image_h{CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>()};
 
     ARM_COMPUTE_UNUSED(max_image_w, max_image_h);
     ARM_COMPUTE_ERROR_ON_MSG(image_w > max_image_w, "Image width exceeds maximum width for exporting to cl_image");
@@ -58,18 +59,22 @@
     return create_image2d_from_buffer(ctx, buffer, shape2d, info->data_type(), image_row_pitch, image_type);
 }
 
-cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type)
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx,
+                                       const cl::Buffer  &buffer,
+                                       const TensorShape &shape2d,
+                                       DataType           data_type,
+                                       size_t             image_row_pitch,
+                                       CLImage2DType      image_type)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()),
                              "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
     ARM_COMPUTE_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0,
                              "Impossible to retrieve the cl_image pitch alignment");
-    ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr,
-                             "Cannot create cl_image from empty cl_buffer");
+    ARM_COMPUTE_ERROR_ON_MSG(buffer.get() == nullptr, "Cannot create cl_image from empty cl_buffer");
 
     cl_channel_type cl_data_type;
 
-    switch(data_type)
+    switch (data_type)
     {
         case DataType::F32:
             cl_data_type = CL_FLOAT;
@@ -84,7 +89,7 @@
     cl_mem cl_image;
     cl_int err = CL_SUCCESS;
 
-    const cl_image_format format = { CL_RGBA, cl_data_type };
+    const cl_image_format format = {CL_RGBA, cl_data_type};
 
     cl_image_desc desc;
     memset(&desc, 0, sizeof(desc));
@@ -94,7 +99,7 @@
     desc.image_width     = shape2d[0];
     desc.image_height    = shape2d[1];
 
-    switch(image_type)
+    switch (image_type)
     {
         case CLImage2DType::ReadOnly:
             cl_image = clCreateImage(ctx(), CL_MEM_READ_ONLY, &format, &desc, nullptr, &err);
@@ -114,7 +119,7 @@
 
 void handle_cl_error(const std::string &function_name, cl_int error_code)
 {
-    if(error_code != CL_SUCCESS)
+    if (error_code != CL_SUCCESS)
     {
         std::string error_message = function_name + " - Error code: " + std::to_string(error_code);
         ARM_COMPUTE_ERROR(error_message.c_str());
diff --git a/src/core/CL/CLUtils.h b/src/core/CL/CLUtils.h
index de9c1b3..f9dcfea 100644
--- a/src/core/CL/CLUtils.h
+++ b/src/core/CL/CLUtils.h
@@ -72,7 +72,12 @@
  *
  * @return cl::Image2D object
  */
-cl::Image2D create_image2d_from_buffer(const cl::Context &ctx, const cl::Buffer &buffer, const TensorShape &shape2d, DataType data_type, size_t image_row_pitch, CLImage2DType image_type);
+cl::Image2D create_image2d_from_buffer(const cl::Context &ctx,
+                                       const cl::Buffer  &buffer,
+                                       const TensorShape &shape2d,
+                                       DataType           data_type,
+                                       size_t             image_row_pitch,
+                                       CLImage2DType      image_type);
 
 /** Check for CL error code and throw exception accordingly.
  *
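A minimal usage sketch of the re-wrapped declaration above; the shape, pitch, and image type are illustrative, and the context/buffer are assumed to exist:

    // Illustrative values only.
    cl::Image2D make_image(const cl::Context &ctx, const cl::Buffer &buffer)
    {
        // shape2d[0] counts 4-channel texels: export_to_cl_image() in
        // CLUtils.cpp divides the tensor width by 4 before calling this helper.
        const TensorShape shape2d(128, 64);
        const size_t      row_pitch = 128 * 4 * sizeof(float); // bytes per row
        return create_image2d_from_buffer(ctx, buffer, shape2d, DataType::F32,
                                          row_pitch, CLImage2DType::ReadOnly);
    }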
diff --git a/src/core/CL/CLValidate.h b/src/core/CL/CLValidate.h
index 7b5294e..50d224f 100644
--- a/src/core/CL/CLValidate.h
+++ b/src/core/CL/CLValidate.h
@@ -29,11 +29,13 @@
 
 namespace arm_compute
 {
-#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor) \
-    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
+#define ARM_COMPUTE_ERROR_ON_F16_UNSUPPORTED(tensor)                                                          \
+    ARM_COMPUTE_ERROR_THROW_ON(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \
+                                                                        CLKernelLibrary::get().fp16_supported()))
 
-#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor) \
-    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, CLKernelLibrary::get().fp16_supported()))
+#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor)                                                    \
+    ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_fp16(__func__, __FILE__, __LINE__, tensor, \
+                                                                         CLKernelLibrary::get().fp16_supported()))
 
 /** Return an error if int64_base_atomics extension is not supported by the device.
  *
@@ -43,11 +45,13 @@
  *
  * @return Status
  */
-inline arm_compute::Status error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line)
+inline arm_compute::Status
+error_on_unsupported_int64_base_atomics(const char *function, const char *file, const int line)
 {
-    if(!CLKernelLibrary::get().int64_base_atomics_supported())
+    if (!CLKernelLibrary::get().int64_base_atomics_supported())
     {
-        return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line, "Atomic functions are not supported");
+        return ARM_COMPUTE_CREATE_ERROR_LOC(arm_compute::ErrorCode::UNSUPPORTED_EXTENSION_USE, function, file, line,
+                                            "Atomic functions are not supported");
     }
     return arm_compute::Status{};
 }
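
For context, a sketch of how a kernel's validation path might use the rewrapped macros above (validate_example and its argument are hypothetical, not part of this change):

    Status validate_example(const ITensorInfo *src)
    {
        // Reject F16 inputs on devices without fp16 support...
        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
        // ...and bail out early when 64-bit atomics are required but absent.
        ARM_COMPUTE_RETURN_ON_ERROR(error_on_unsupported_int64_base_atomics(__func__, __FILE__, __LINE__));
        return Status{};
    }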
diff --git a/src/core/CL/DefaultLWSHeuristics.cpp b/src/core/CL/DefaultLWSHeuristics.cpp
index a53fdbb..f96b24d 100644
--- a/src/core/CL/DefaultLWSHeuristics.cpp
+++ b/src/core/CL/DefaultLWSHeuristics.cpp
@@ -31,13 +31,13 @@
 {
     ARM_COMPUTE_UNUSED(gws_y);
 
-    if(gws_z != 1)
+    if (gws_z != 1)
     {
         return cl::NDRange(4, 4, 2);
     }
     else
     {
-        if(gws_x > 256)
+        if (gws_x > 256)
         {
             return cl::NDRange(2, 16, 1);
         }
@@ -59,9 +59,9 @@
 {
     ARM_COMPUTE_UNUSED(gws_z);
 
-    if(gws_x < gws_y)
+    if (gws_x < gws_y)
     {
-        if(gws_x < 4)
+        if (gws_x < 4)
         {
             return cl::NDRange(std::min(gws_x, static_cast<size_t>(2u)), 32, 1);
         }
@@ -81,7 +81,7 @@
     ARM_COMPUTE_UNUSED(gws_y);
     ARM_COMPUTE_UNUSED(gws_z);
 
-    if(gws_x < 32)
+    if (gws_x < 32)
     {
         return cl::NDRange(gws_x, 4, 4);
     }
@@ -100,7 +100,7 @@
     const size_t gws_y = gws[1];
     const size_t gws_z = gws[2];
 
-    switch(kernel_type)
+    switch (kernel_type)
     {
         case CLKernelType::GEMM:
         {
@@ -124,4 +124,4 @@
         }
     }
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
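
A worked example against the GEMM branch above, assuming get_default_lws_for_type keeps the (kernel_type, gws) signature used by the switch: with gws = (512, 128, 1) the gws_z == 1 path is taken and, since gws_x > 256, the heuristic picks (2, 16, 1).

    const cl::NDRange gws(512, 128, 1);
    const cl::NDRange lws = get_default_lws_for_type(CLKernelType::GEMM, gws);
    // gws_z == 1 and gws_x = 512 > 256  =>  lws == cl::NDRange(2, 16, 1)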
diff --git a/src/core/CL/ICLKernel.cpp b/src/core/CL/ICLKernel.cpp
index dc3a86a..ac53e7f 100644
--- a/src/core/CL/ICLKernel.cpp
+++ b/src/core/CL/ICLKernel.cpp
@@ -25,18 +25,23 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/helpers/Utils.h"
 
 #include <cstddef>
 
-void arm_compute::enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint, bool use_dummy_work_items)
+void arm_compute::enqueue(cl::CommandQueue  &queue,
+                          ICLKernel         &kernel,
+                          const Window      &window,
+                          const cl::NDRange &lws_hint,
+                          bool               use_dummy_work_items)
 {
-    if(kernel.kernel()() == nullptr)
+    if (kernel.kernel()() == nullptr)
     {
         return;
     }
 
-    for(unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
+    for (unsigned int i = 0; i < Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_ERROR_ON(window[i].step() == 0);
         // Make sure that dimensions > Z are 1
@@ -46,7 +51,7 @@
     cl::NDRange gws = ICLKernel::gws_from_window(window, use_dummy_work_items);
 
     // Check for empty NDRange
-    if(gws.dimensions() == 0)
+    if (gws.dimensions() == 0)
     {
         return;
     }
@@ -54,7 +59,7 @@
     kernel.cache_gws(gws);
 
     cl::NDRange valid_lws;
-    if(lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
+    if (lws_hint[0] * lws_hint[1] * lws_hint[2] > kernel.get_max_workgroup_size())
     {
         valid_lws = cl::NullRange;
     }
@@ -65,12 +70,12 @@
 
     cl::NDRange lws = cl::NullRange;
 
-    if((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2]))
+    if ((valid_lws[0] <= gws[0]) && (valid_lws[1] <= gws[1]) && (valid_lws[2] <= gws[2]))
     {
         lws = valid_lws;
     }
 
-    if(CLKernelLibrary::get().is_wbsm_supported())
+    if (CLKernelLibrary::get().is_wbsm_supported())
     {
         set_wbsm(kernel.kernel(), kernel.wbsm_hint());
     }
@@ -90,7 +95,7 @@
     // Calculate offset to the start of the window
     unsigned int offset_first_element = info->offset_first_element_in_bytes();
 
-    for(unsigned int n = 0; n < info->num_dimensions(); ++n)
+    for (unsigned int n = 0; n < info->num_dimensions(); ++n)
     {
         offset_first_element += (window.is_broadcasted(n) ? 0 : window[n].start()) * strides[n];
     }
@@ -98,7 +103,7 @@
     unsigned int idx_start = idx;
     _kernel.setArg(idx++, tensor->cl_buffer());
 
-    for(unsigned int d = 0; d < dimension_size; ++d)
+    for (unsigned int d = 0; d < dimension_size; ++d)
     {
         _kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : strides[d]);
         _kernel.setArg<cl_uint>(idx++, window.is_broadcasted(d) ? 0 : (strides[d] * window[d].step()));
@@ -107,7 +112,8 @@
     _kernel.setArg<cl_uint>(idx++, offset_first_element);
 
     ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_tensor<dimension_size>() != idx,
-                                 "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_tensor<dimension_size>());
+                                 "add_%dD_tensor_argument() is supposed to add exactly %d arguments to the kernel",
+                                 dimension_size, num_arguments_per_tensor<dimension_size>());
     ARM_COMPUTE_UNUSED(idx_start);
 }
 
@@ -178,7 +184,7 @@
 
 size_t ICLKernel::get_max_workgroup_size()
 {
-    if(_max_workgroup_size == 0)
+    if (_max_workgroup_size == 0)
     {
         _max_workgroup_size = CLKernelLibrary::get().max_local_workgroup_size(_kernel);
     }
@@ -187,7 +193,7 @@
 
 cl::NDRange ICLKernel::gws_from_window(const Window &window, bool use_dummy_work_items)
 {
-    if((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
+    if ((window.x().end() - window.x().start()) == 0 || (window.y().end() - window.y().start()) == 0)
     {
         return cl::NullRange;
     }
@@ -196,7 +202,7 @@
                     (window.y().end() - window.y().start()) / window.y().step(),
                     (window.z().end() - window.z().start()) / window.z().step());
 
-    if(use_dummy_work_items)
+    if (use_dummy_work_items)
     {
         gws.get()[0] = get_next_power_two(gws[0]);
         gws.get()[1] = get_next_power_two(gws[1]);
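
The dummy-work-items branch above only pads the x and y global sizes up to the next power of two. A small sketch of the arithmetic (window values illustrative, assuming Window's (start, end, step) setters):

    Window win;
    win.set(Window::DimX, Window::Dimension(0, 100, 4)); // (start, end, step)
    win.set(Window::DimY, Window::Dimension(0, 17, 1));
    win.set(Window::DimZ, Window::Dimension(0, 1, 1));

    const cl::NDRange gws = ICLKernel::gws_from_window(win, /* use_dummy_work_items */ true);
    // x: (100 - 0) / 4 = 25 -> 32, y: 17 -> 32, z stays 1  =>  gws == (32, 32, 1)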
diff --git a/src/core/CL/ICLKernel.h b/src/core/CL/ICLKernel.h
index c82809c..6aebef1 100644
--- a/src/core/CL/ICLKernel.h
+++ b/src/core/CL/ICLKernel.h
@@ -27,10 +27,10 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/CLTypes.h"
 #include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/IKernel.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/CL/CLTuningParams.h"
 
 #include "src/core/CL/DefaultLWSHeuristics.h"
@@ -43,14 +43,14 @@
 {
 bool is_same_lws(cl::NDRange lws0, cl::NDRange lws1)
 {
-    if(lws0.dimensions() != lws1.dimensions())
+    if (lws0.dimensions() != lws1.dimensions())
     {
         return false;
     }
 
-    for(size_t i = 0; i < lws0.dimensions(); ++i)
+    for (size_t i = 0; i < lws0.dimensions(); ++i)
     {
-        if(lws0.get()[i] != lws1.get()[i])
+        if (lws0.get()[i] != lws1.get()[i])
         {
             return false;
         }
@@ -71,7 +71,7 @@
      *
      * @return The number of arguments enqueued per array object.
      */
-    template <unsigned int        dimension_size>
+    template <unsigned int dimension_size>
     constexpr static unsigned int num_arguments_per_array()
     {
         return num_arguments_per_tensor<dimension_size>();
@@ -80,7 +80,7 @@
      *
      * @return The number of arguments enqueued per tensor object.
      */
-    template <unsigned int        dimension_size>
+    template <unsigned int dimension_size>
     constexpr static unsigned int num_arguments_per_tensor()
     {
         return 2 + 2 * dimension_size;
@@ -116,11 +116,13 @@
      * @param[in] window             The maximum window which will be returned by window()
      * @param[in] tuning_params_hint (Optional) Tuning parameters to use.
      */
-    void configure_internal(const Window &window, CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(), 0))
+    void configure_internal(const Window  &window,
+                            CLTuningParams tuning_params_hint = CLTuningParams(CLKernelLibrary::get().default_ndrange(),
+                                                                               0))
     {
         _tuning_params_hint = tuning_params_hint;
 
-        if(is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange()))
+        if (is_same_lws(_tuning_params_hint.get_lws(), CLKernelLibrary::get().default_ndrange()))
         {
             // Disable use_dummy_work_items at configure time. Because dummy work items only affect gws size, which
             // will be recalculated with use_dummy_work_items flag at run time again anyway.
@@ -133,7 +135,13 @@
 public:
     /** Constructor */
     ICLKernel()
-        : _kernel(nullptr), _target(GPUTarget::MIDGARD), _config_id(arm_compute::default_config_id), _max_workgroup_size(0), _type(CLKernelType::UNKNOWN), _tuning_params_hint(), _cached_gws(cl::NullRange)
+        : _kernel(nullptr),
+          _target(GPUTarget::MIDGARD),
+          _config_id(arm_compute::default_config_id),
+          _max_workgroup_size(0),
+          _type(CLKernelType::UNKNOWN),
+          _tuning_params_hint(),
+          _cached_gws(cl::NullRange)
     {
     }
     /** Returns a reference to the OpenCL kernel of this object.
@@ -161,7 +169,11 @@
      * @param[in]     window         Window the kernel will be executed on.
      */
     template <typename T>
-    void add_1D_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+    void add_1D_array_argument(unsigned int      &idx,
+                               const ICLArray<T> *array,
+                               const Strides     &strides,
+                               unsigned int       num_dimensions,
+                               const Window      &window)
     {
         add_array_argument<T, 1>(idx, array, strides, num_dimensions, window);
     }
@@ -184,7 +196,7 @@
      */
     void add_1D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
     {
-        if(cond)
+        if (cond)
         {
             add_1D_tensor_argument(idx, tensor, window);
         }
@@ -208,7 +220,7 @@
      */
     void add_2D_tensor_argument_if(bool cond, unsigned int &idx, const ICLTensor *tensor, const Window &window)
     {
-        if(cond)
+        if (cond)
         {
             add_2D_tensor_argument(idx, tensor, window);
         }
@@ -469,7 +481,11 @@
      * @param[in]     window         Window the kernel will be executed on.
      */
     template <typename T, unsigned int dimension_size>
-    void add_array_argument(unsigned int &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window);
+    void add_array_argument(unsigned int      &idx,
+                            const ICLArray<T> *array,
+                            const Strides     &strides,
+                            unsigned int       num_dimensions,
+                            const Window      &window);
     /** Add the passed tensor's parameters to the object's kernel's arguments starting from the index idx.
      *
      * @param[in,out] idx    Index at which to start adding the tensor's arguments. Will be incremented by the number of kernel arguments set.
@@ -505,7 +521,11 @@
  *
  * @note If any dimension of the lws is greater than the global workgroup size then no lws will be passed.
  */
-void enqueue(cl::CommandQueue &queue, ICLKernel &kernel, const Window &window, const cl::NDRange &lws_hint = CLKernelLibrary::get().default_ndrange(), bool use_dummy_work_items = false);
+void enqueue(cl::CommandQueue  &queue,
+             ICLKernel         &kernel,
+             const Window      &window,
+             const cl::NDRange &lws_hint             = CLKernelLibrary::get().default_ndrange(),
+             bool               use_dummy_work_items = false);
 
 /** Add the passed array's parameters to the object's kernel's arguments starting from the index idx.
  *
@@ -516,14 +536,15 @@
  * @param[in]     window         Window the kernel will be executed on.
  */
 template <typename T, unsigned int dimension_size>
-void ICLKernel::add_array_argument(unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
+void ICLKernel::add_array_argument(
+    unsigned &idx, const ICLArray<T> *array, const Strides &strides, unsigned int num_dimensions, const Window &window)
 {
     ARM_COMPUTE_ERROR_ON(array == nullptr);
 
     // Calculate offset to the start of the window
     unsigned int offset_first_element = 0;
 
-    for(unsigned int n = 0; n < num_dimensions; ++n)
+    for (unsigned int n = 0; n < num_dimensions; ++n)
     {
         offset_first_element += window[n].start() * strides[n];
     }
@@ -531,7 +552,7 @@
     unsigned int idx_start = idx;
     _kernel.setArg(idx++, array->cl_buffer());
 
-    for(unsigned int dimension = 0; dimension < dimension_size; dimension++)
+    for (unsigned int dimension = 0; dimension < dimension_size; dimension++)
     {
         _kernel.setArg<cl_uint>(idx++, strides[dimension]);
         _kernel.setArg<cl_uint>(idx++, strides[dimension] * window[dimension].step());
@@ -540,8 +561,9 @@
     _kernel.setArg<cl_uint>(idx++, offset_first_element);
 
     ARM_COMPUTE_ERROR_ON_MSG_VAR(idx_start + num_arguments_per_array<dimension_size>() != idx,
-                                 "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel", dimension_size, num_arguments_per_array<dimension_size>());
+                                 "add_%dD_array_argument() is supposed to add exactly %d arguments to the kernel",
+                                 dimension_size, num_arguments_per_array<dimension_size>());
     ARM_COMPUTE_UNUSED(idx_start);
 }
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_ICLKERNEL_H */
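
As a sanity check on the argument bookkeeping reflowed above, num_arguments_per_tensor<N>() = 2 + 2*N matches the setArg sequence in add_tensor_argument; for a 3D tensor:

    // idx + 0          : cl_mem buffer
    // idx + 1, idx + 2 : stride and stride * step for dimension 0
    // idx + 3, idx + 4 : stride and stride * step for dimension 1
    // idx + 5, idx + 6 : stride and stride * step for dimension 2
    // idx + 7          : offset_first_element_in_bytes
    //                  => 2 + 2 * 3 = 8 arguments in total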
diff --git a/src/core/CL/ICLSimple2DKernel.cpp b/src/core/CL/ICLSimple2DKernel.cpp
index 5d8295b..3f7edbb 100644
--- a/src/core/CL/ICLSimple2DKernel.cpp
+++ b/src/core/CL/ICLSimple2DKernel.cpp
@@ -40,6 +40,5 @@
         add_2D_tensor_argument(idx, _input, slice);
         add_2D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
+    } while (window.slide_window_slice_2D(slice));
 }
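
The do/while reshaped above is the library's standard slice loop: first_slice_window_2D() yields the first 2D plane of the window and slide_window_slice_2D() advances through the remaining dimensions, returning false once exhausted. The full pattern, for reference:

    Window slice = window.first_slice_window_2D();
    do
    {
        unsigned int idx = 0;
        add_2D_tensor_argument(idx, _input, slice);  // bind this plane's args
        add_2D_tensor_argument(idx, _output, slice);
        enqueue(queue, *this, slice, lws_hint());    // one enqueue per slice
    } while (window.slide_window_slice_2D(slice));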
diff --git a/src/core/CL/ICLSimple2DKernel.h b/src/core/CL/ICLSimple2DKernel.h
index 5246492..97bc1e5 100644
--- a/src/core/CL/ICLSimple2DKernel.h
+++ b/src/core/CL/ICLSimple2DKernel.h
@@ -37,5 +37,5 @@
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
 };
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_ICLSIMPLE2DKERNEL_H */
diff --git a/src/core/CL/ICLSimple3DKernel.cpp b/src/core/CL/ICLSimple3DKernel.cpp
index fef1a86..71d7d1f 100644
--- a/src/core/CL/ICLSimple3DKernel.cpp
+++ b/src/core/CL/ICLSimple3DKernel.cpp
@@ -42,6 +42,5 @@
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/ICLSimple3DKernel.h b/src/core/CL/ICLSimple3DKernel.h
index ff0b274..5071b6b 100644
--- a/src/core/CL/ICLSimple3DKernel.h
+++ b/src/core/CL/ICLSimple3DKernel.h
@@ -39,5 +39,5 @@
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
 };
-}
+} // namespace arm_compute
 #endif /*ARM_COMPUTE_ICLSIMPLE3DKERNEL_H */
diff --git a/src/core/CL/ICLSimpleKernel.cpp b/src/core/CL/ICLSimpleKernel.cpp
index d67fefd..c31db83 100644
--- a/src/core/CL/ICLSimpleKernel.cpp
+++ b/src/core/CL/ICLSimpleKernel.cpp
@@ -22,30 +22,35 @@
  * SOFTWARE.
  */
 #include "src/core/CL/ICLSimpleKernel.h"
+
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/helpers/WindowHelpers.h"
 
 using namespace arm_compute;
 
-ICLSimpleKernel::ICLSimpleKernel()
-    : _input(nullptr), _output(nullptr)
+ICLSimpleKernel::ICLSimpleKernel() : _input(nullptr), _output(nullptr)
 {
 }
 
-void ICLSimpleKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined, const BorderSize &border_size)
+void ICLSimpleKernel::configure(const ICLTensor  *input,
+                                ICLTensor        *output,
+                                unsigned int      num_elems_processed_per_iteration,
+                                bool              border_undefined,
+                                const BorderSize &border_size)
 {
     _input  = input;
     _output = output;
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
+    Window win =
+        calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size);
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
 
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
                               output_access);
 
     output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size);
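
A condensed sketch of the configure flow above, assuming a derived kernel that processes 8 elements per iteration with no border (values illustrative):

    // Maximal execution window over the input, stepped by 8 elements in x:
    Window win = calculate_max_window(*input->info(), Steps(8));

    AccessWindowHorizontal input_access(input->info(), 0, 8);
    AccessWindowHorizontal output_access(output->info(), 0, 8);

    // Grow tensor padding where needed so both accesses stay in bounds:
    update_window_and_padding(win, input_access, output_access);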
diff --git a/src/core/CL/ICLSimpleKernel.h b/src/core/CL/ICLSimpleKernel.h
index b35547a..6afd730 100644
--- a/src/core/CL/ICLSimpleKernel.h
+++ b/src/core/CL/ICLSimpleKernel.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -55,12 +56,16 @@
      * @param[in]  border_undefined                  (Optional) True if the border mode is undefined. False if it's replicate or constant.
      * @param[in]  border_size                       (Optional) Size of the border.
      */
-    void configure(const ICLTensor *input, ICLTensor *output, unsigned int num_elems_processed_per_iteration, bool border_undefined = false, const BorderSize &border_size = BorderSize());
+    void configure(const ICLTensor  *input,
+                   ICLTensor        *output,
+                   unsigned int      num_elems_processed_per_iteration,
+                   bool              border_undefined = false,
+                   const BorderSize &border_size      = BorderSize());
 
 protected:
     const ICLTensor *_input;
     ICLTensor       *_output;
 };
-}
+} // namespace arm_compute
 
 #endif /*ARM_COMPUTE_ICLSIMPLEKERNEL_H */
diff --git a/src/core/CL/ICLTensor.cpp b/src/core/CL/ICLTensor.cpp
index b541bff..0771db7 100644
--- a/src/core/CL/ICLTensor.cpp
+++ b/src/core/CL/ICLTensor.cpp
@@ -27,8 +27,7 @@
 
 using namespace arm_compute;
 
-ICLTensor::ICLTensor()
-    : _mapping(nullptr)
+ICLTensor::ICLTensor() : _mapping(nullptr)
 {
 }
 
diff --git a/src/core/CL/OpenCL.cpp b/src/core/CL/OpenCL.cpp
index b092dfb..35421d0 100644
--- a/src/core/CL/OpenCL.cpp
+++ b/src/core/CL/OpenCL.cpp
@@ -36,11 +36,7 @@
 
 namespace arm_compute
 {
-CLSymbols::CLSymbols() noexcept(false)
-    : _loaded(
-{
-    false, false
-})
+CLSymbols::CLSymbols() noexcept(false) : _loaded({false, false})
 {
 }
 
@@ -52,9 +48,9 @@
 
 bool CLSymbols::load_default()
 {
-    static const std::vector<std::string> libraries_filenames{ "libOpenCL.so", "libGLES_mali.so", "libmali.so" };
+    static const std::vector<std::string> libraries_filenames{"libOpenCL.so", "libGLES_mali.so", "libmali.so"};
 
-    if(_loaded.first)
+    if (_loaded.first)
     {
         return _loaded.second;
     }
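
The _loaded pair initialised in the constructor above caches the outcome of default loading: .first records that load_default() was attempted, .second whether it succeeded, hence the early return here. Observable behaviour, in sketch form:

    const bool ok_first  = arm_compute::CLSymbols::get().load_default(); // dlopen attempted here
    const bool ok_second = arm_compute::CLSymbols::get().load_default(); // cached, no second dlopen
    // ok_first == ok_second always holds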
@@ -62,34 +58,32 @@
     // Indicate that default loading has been tried
     _loaded.first = true;
 
-    if(load(libraries_filenames, /* use_loader */ false))
+    if (load(libraries_filenames, /* use_loader */ false))
     {
-        ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from shared library");
+        ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr,
+                                 "Failed to load OpenCL symbols from shared library");
         return true;
     }
 
 #ifdef __ANDROID__
     // When running in NDK environment, the above libraries are not accessible.
-    static const std::vector<std::string> android_libraries_filenames{ "libOpenCL-pixel.so", "libOpenCL-car.so" };
+    static const std::vector<std::string> android_libraries_filenames{"libOpenCL-pixel.so", "libOpenCL-car.so"};
 
-    if(load(android_libraries_filenames, /* use_loader */ true))
+    if (load(android_libraries_filenames, /* use_loader */ true))
     {
-        ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr, "Failed to load OpenCL symbols from android shared library");
+        ARM_COMPUTE_ERROR_ON_MSG(this->clBuildProgram_ptr == nullptr,
+                                 "Failed to load OpenCL symbols from android shared library");
         return true;
     }
 #endif // __ANDROID__
 
     // If not returned till here then libraries not found
     std::stringstream ss;
-    std::for_each(libraries_filenames.begin(), libraries_filenames.end(), [&ss](const std::string & s)
-    {
-        ss << s << " ";
-    });
+    std::for_each(libraries_filenames.begin(), libraries_filenames.end(),
+                  [&ss](const std::string &s) { ss << s << " "; });
 #ifdef __ANDROID__
-    std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(), [&ss](const std::string & s)
-    {
-        ss << s << " ";
-    });
+    std::for_each(android_libraries_filenames.begin(), android_libraries_filenames.end(),
+                  [&ss](const std::string &s) { ss << s << " "; });
 #endif // __ANDROID__
     std::cerr << "Couldn't find any of the following OpenCL library: " << ss.str() << std::endl;
     return false;
@@ -99,15 +93,15 @@
 {
     void        *handle = nullptr;
     unsigned int index  = 0;
-    for(index = 0; index < libraries_filenames.size(); ++index)
+    for (index = 0; index < libraries_filenames.size(); ++index)
     {
         handle = dlopen(libraries_filenames[index].c_str(), RTLD_LAZY | RTLD_LOCAL);
-        if(handle != nullptr)
+        if (handle != nullptr)
         {
             break;
         }
     }
-    if(index == libraries_filenames.size())
+    if (index == libraries_filenames.size())
     {
         // Set status of loading to failed
         _loaded.second = false;
@@ -115,22 +109,23 @@
     }
 
 #ifdef __ANDROID__
-    typedef void* (*loadOpenCLPointer_t)(const char* name);
+    typedef void *(*loadOpenCLPointer_t)(const char *name);
     loadOpenCLPointer_t loadOpenCLPointer;
-    if (use_loader) {
+    if (use_loader)
+    {
         typedef void (*enableOpenCL_t)();
-        enableOpenCL_t enableOpenCL =
-            reinterpret_cast<enableOpenCL_t>(dlsym(handle, "enableOpenCL"));
+        enableOpenCL_t enableOpenCL = reinterpret_cast<enableOpenCL_t>(dlsym(handle, "enableOpenCL"));
         enableOpenCL();
 
-        loadOpenCLPointer = reinterpret_cast<loadOpenCLPointer_t>(
-            dlsym(handle, "loadOpenCLPointer"));
-    } else {
+        loadOpenCLPointer = reinterpret_cast<loadOpenCLPointer_t>(dlsym(handle, "loadOpenCLPointer"));
+    }
+    else
+    {
         loadOpenCLPointer = nullptr;
     }
-#define LOAD_FUNCTION_PTR(func_name, _handle) \
-    func_name##_ptr = reinterpret_cast<decltype(func_name) *>( use_loader ? \
-        loadOpenCLPointer(#func_name) : dlsym(handle, #func_name));
+#define LOAD_FUNCTION_PTR(func_name, _handle)                                                            \
+    func_name##_ptr = reinterpret_cast<decltype(func_name) *>(use_loader ? loadOpenCLPointer(#func_name) \
+                                                                         : dlsym(handle, #func_name));
 #else /* __ANDROID__ */
     (void)use_loader; // Avoid unused warning
 #define LOAD_FUNCTION_PTR(func_name, handle) \
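
The reindented LOAD_FUNCTION_PTR above resolves each entry point either through the Android loader or plain dlsym. Expanded by hand for one symbol (Android branch; sketch only):

    // LOAD_FUNCTION_PTR(clBuildProgram, handle) becomes roughly:
    clBuildProgram_ptr = reinterpret_cast<decltype(clBuildProgram) *>(
        use_loader ? loadOpenCLPointer("clBuildProgram") : dlsym(handle, "clBuildProgram"));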
@@ -234,12 +229,11 @@
 }
 } // namespace arm_compute
 
-cl_int clEnqueueMarker(cl_command_queue command_queue,
-                       cl_event        *event)
+cl_int clEnqueueMarker(cl_command_queue command_queue, cl_event *event)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clEnqueueMarker_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(command_queue, event);
     }
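
Every exported wrapper from here on repeats the same trampoline shape: load the symbol table, forward the call if the pointer resolved, otherwise report failure (creator functions set *errcode_ret instead of returning an error code). Sketch with a placeholder name:

    cl_int clSomeEntryPoint(/* forwarded arguments */) // hypothetical, for illustration
    {
        arm_compute::CLSymbols::get().load_default();
        auto func = arm_compute::CLSymbols::get().clSomeEntryPoint_ptr;
        if (func != nullptr)
        {
            return func(/* forwarded arguments */);
        }
        return CL_OUT_OF_RESOURCES; // symbol not available
    }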
@@ -249,12 +243,11 @@
     }
 }
 
-cl_int clWaitForEvents(cl_uint         num_events,
-                       const cl_event *event_list)
+cl_int clWaitForEvents(cl_uint num_events, const cl_event *event_list)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clWaitForEvents_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(num_events, event_list);
     }
@@ -264,12 +257,18 @@
     }
 }
 
-cl_int clEnqueueSVMMap(cl_command_queue command_queue, cl_bool blocking_map, cl_map_flags flags, void *svm_ptr,
-                       size_t size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event)
+cl_int clEnqueueSVMMap(cl_command_queue command_queue,
+                       cl_bool          blocking_map,
+                       cl_map_flags     flags,
+                       void            *svm_ptr,
+                       size_t           size,
+                       cl_uint          num_events_in_wait_list,
+                       const cl_event  *event_wait_list,
+                       cl_event        *event)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clEnqueueSVMMap_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(command_queue, blocking_map, flags, svm_ptr, size, num_events_in_wait_list, event_wait_list, event);
     }
@@ -279,12 +278,15 @@
     }
 }
 
-cl_int clEnqueueSVMUnmap(cl_command_queue command_queue, void *svm_ptr, cl_uint num_events_in_wait_list,
-                         const cl_event *event_wait_list, cl_event *event)
+cl_int clEnqueueSVMUnmap(cl_command_queue command_queue,
+                         void            *svm_ptr,
+                         cl_uint          num_events_in_wait_list,
+                         const cl_event  *event_wait_list,
+                         cl_event        *event)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clEnqueueSVMUnmap_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(command_queue, svm_ptr, num_events_in_wait_list, event_wait_list, event);
     }
@@ -298,7 +300,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clSVMAlloc_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(context, flags, size, alignment);
     }
@@ -312,7 +314,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clSVMFree_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         func(context, svm_pointer);
     }
@@ -326,7 +328,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clGetContextInfo_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(context, param_name, param_value_size, param_value, param_value_size_ret);
     }
@@ -343,7 +345,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clCreateCommandQueue_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(context, device, properties, errcode_ret);
     }
@@ -360,7 +362,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clCreateCommandQueueWithProperties_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(context, device, properties, errcode_ret);
     }
@@ -370,17 +372,16 @@
     }
 }
 
-cl_context clCreateContext(
-    const cl_context_properties *properties,
-    cl_uint                      num_devices,
-    const cl_device_id          *devices,
-    void (*pfn_notify)(const char *, const void *, size_t, void *),
-    void   *user_data,
-    cl_int *errcode_ret)
+cl_context clCreateContext(const cl_context_properties *properties,
+                           cl_uint                      num_devices,
+                           const cl_device_id          *devices,
+                           void (*pfn_notify)(const char *, const void *, size_t, void *),
+                           void   *user_data,
+                           cl_int *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clCreateContext_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(properties, num_devices, devices, pfn_notify, user_data, errcode_ret);
     }
@@ -398,7 +399,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clCreateContextFromType_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(properties, device_type, pfn_notify, user_data, errcode_ret);
     }
@@ -408,17 +409,16 @@
     }
 }
 
-cl_int clBuildProgram(
-    cl_program          program,
-    cl_uint             num_devices,
-    const cl_device_id *device_list,
-    const char         *options,
-    void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
-    void *user_data)
+cl_int clBuildProgram(cl_program          program,
+                      cl_uint             num_devices,
+                      const cl_device_id *device_list,
+                      const char         *options,
+                      void(CL_CALLBACK *pfn_notify)(cl_program program, void *user_data),
+                      void *user_data)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clBuildProgram_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(program, num_devices, device_list, options, pfn_notify, user_data);
     }
@@ -428,22 +428,22 @@
     }
 }
 
-cl_int clEnqueueNDRangeKernel(
-    cl_command_queue command_queue,
-    cl_kernel        kernel,
-    cl_uint          work_dim,
-    const size_t    *global_work_offset,
-    const size_t    *global_work_size,
-    const size_t    *local_work_size,
-    cl_uint          num_events_in_wait_list,
-    const cl_event *event_wait_list,
-    cl_event        *event)
+cl_int clEnqueueNDRangeKernel(cl_command_queue command_queue,
+                              cl_kernel        kernel,
+                              cl_uint          work_dim,
+                              const size_t    *global_work_offset,
+                              const size_t    *global_work_size,
+                              const size_t    *local_work_size,
+                              cl_uint          num_events_in_wait_list,
+                              const cl_event  *event_wait_list,
+                              cl_event        *event)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clEnqueueNDRangeKernel_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
-        return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event);
+        return func(command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size,
+                    num_events_in_wait_list, event_wait_list, event);
     }
     else
     {
@@ -451,15 +451,11 @@
     }
 }
 
-cl_int clSetKernelArg(
-    cl_kernel   kernel,
-    cl_uint     arg_index,
-    size_t      arg_size,
-    const void *arg_value)
+cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clSetKernelArg_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(kernel, arg_index, arg_size, arg_value);
     }
@@ -473,7 +469,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clRetainMemObject_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(memobj);
     }
@@ -487,7 +483,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clReleaseMemObject_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(memobj);
     }
@@ -497,17 +493,16 @@
     }
 }
 
-cl_int clEnqueueUnmapMemObject(
-    cl_command_queue command_queue,
-    cl_mem           memobj,
-    void            *mapped_ptr,
-    cl_uint          num_events_in_wait_list,
-    const cl_event *event_wait_list,
-    cl_event        *event)
+cl_int clEnqueueUnmapMemObject(cl_command_queue command_queue,
+                               cl_mem           memobj,
+                               void            *mapped_ptr,
+                               cl_uint          num_events_in_wait_list,
+                               const cl_event  *event_wait_list,
+                               cl_event        *event)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clEnqueueUnmapMemObject_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(command_queue, memobj, mapped_ptr, num_events_in_wait_list, event_wait_list, event);
     }
@@ -521,7 +516,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clRetainCommandQueue_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(command_queue);
     }
@@ -535,7 +530,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clReleaseContext_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(context);
     }
@@ -548,7 +543,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clReleaseEvent_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(event);
     }
@@ -558,22 +553,22 @@
     }
 }
 
-cl_int clEnqueueWriteBuffer(
-    cl_command_queue command_queue,
-    cl_mem           buffer,
-    cl_bool          blocking_write,
-    size_t           offset,
-    size_t           size,
-    const void      *ptr,
-    cl_uint          num_events_in_wait_list,
-    const cl_event *event_wait_list,
-    cl_event        *event)
+cl_int clEnqueueWriteBuffer(cl_command_queue command_queue,
+                            cl_mem           buffer,
+                            cl_bool          blocking_write,
+                            size_t           offset,
+                            size_t           size,
+                            const void      *ptr,
+                            cl_uint          num_events_in_wait_list,
+                            const cl_event  *event_wait_list,
+                            cl_event        *event)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clEnqueueWriteBuffer_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
-        return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+        return func(command_queue, buffer, blocking_write, offset, size, ptr, num_events_in_wait_list, event_wait_list,
+                    event);
     }
     else
     {
@@ -581,22 +576,22 @@
     }
 }
 
-cl_int clEnqueueReadBuffer(
-    cl_command_queue command_queue,
-    cl_mem           buffer,
-    cl_bool          blocking_read,
-    size_t           offset,
-    size_t           size,
-    void            *ptr,
-    cl_uint          num_events_in_wait_list,
-    const cl_event *event_wait_list,
-    cl_event        *event)
+cl_int clEnqueueReadBuffer(cl_command_queue command_queue,
+                           cl_mem           buffer,
+                           cl_bool          blocking_read,
+                           size_t           offset,
+                           size_t           size,
+                           void            *ptr,
+                           cl_uint          num_events_in_wait_list,
+                           const cl_event  *event_wait_list,
+                           cl_event        *event)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clEnqueueReadBuffer_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
-        return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list, event);
+        return func(command_queue, buffer, blocking_read, offset, size, ptr, num_events_in_wait_list, event_wait_list,
+                    event);
     }
     else
     {
@@ -604,17 +599,16 @@
     }
 }
 
-cl_int clGetProgramBuildInfo(
-    cl_program            program,
-    cl_device_id          device,
-    cl_program_build_info param_name,
-    size_t                param_value_size,
-    void                 *param_value,
-    size_t               *param_value_size_ret)
+cl_int clGetProgramBuildInfo(cl_program            program,
+                             cl_device_id          device,
+                             cl_program_build_info param_name,
+                             size_t                param_value_size,
+                             void                 *param_value,
+                             size_t               *param_value_size_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clGetProgramBuildInfo_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(program, device, param_name, param_value_size, param_value, param_value_size_ret);
     }
@@ -628,7 +622,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clRetainProgram_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(program);
     }
@@ -638,27 +632,27 @@
     }
 }
 
-void *clEnqueueMapBuffer(
-    cl_command_queue command_queue,
-    cl_mem           buffer,
-    cl_bool          blocking_map,
-    cl_map_flags     map_flags,
-    size_t           offset,
-    size_t           size,
-    cl_uint          num_events_in_wait_list,
-    const cl_event *event_wait_list,
-    cl_event        *event,
-    cl_int          *errcode_ret)
+void *clEnqueueMapBuffer(cl_command_queue command_queue,
+                         cl_mem           buffer,
+                         cl_bool          blocking_map,
+                         cl_map_flags     map_flags,
+                         size_t           offset,
+                         size_t           size,
+                         cl_uint          num_events_in_wait_list,
+                         const cl_event  *event_wait_list,
+                         cl_event        *event,
+                         cl_int          *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clEnqueueMapBuffer_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
-        return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list, event_wait_list, event, errcode_ret);
+        return func(command_queue, buffer, blocking_map, map_flags, offset, size, num_events_in_wait_list,
+                    event_wait_list, event, errcode_ret);
     }
     else
     {
-        if(errcode_ret != nullptr)
+        if (errcode_ret != nullptr)
         {
             *errcode_ret = CL_OUT_OF_RESOURCES;
         }
@@ -670,7 +664,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clReleaseCommandQueue_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(command_queue);
     }
@@ -680,24 +674,23 @@
     }
 }
 
-cl_program clCreateProgramWithBinary(
-    cl_context            context,
-    cl_uint               num_devices,
-    const cl_device_id   *device_list,
-    const size_t         *lengths,
-    const unsigned char **binaries,
-    cl_int               *binary_status,
-    cl_int               *errcode_ret)
+cl_program clCreateProgramWithBinary(cl_context            context,
+                                     cl_uint               num_devices,
+                                     const cl_device_id   *device_list,
+                                     const size_t         *lengths,
+                                     const unsigned char **binaries,
+                                     cl_int               *binary_status,
+                                     cl_int               *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clCreateProgramWithBinary_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(context, num_devices, device_list, lengths, binaries, binary_status, errcode_ret);
     }
     else
     {
-        if(errcode_ret != nullptr)
+        if (errcode_ret != nullptr)
         {
             *errcode_ret = CL_OUT_OF_RESOURCES;
         }
@@ -709,7 +702,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clRetainContext_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(context);
     }
@@ -723,7 +716,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clReleaseProgram_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(program);
     }
@@ -737,7 +730,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clFlush_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(command_queue);
     }
@@ -751,7 +744,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clFinish_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(command_queue);
     }
@@ -761,16 +754,15 @@
     }
 }
 
-cl_int clGetProgramInfo(
-    cl_program      program,
-    cl_program_info param_name,
-    size_t          param_value_size,
-    void           *param_value,
-    size_t         *param_value_size_ret)
+cl_int clGetProgramInfo(cl_program      program,
+                        cl_program_info param_name,
+                        size_t          param_value_size,
+                        void           *param_value,
+                        size_t         *param_value_size_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clGetProgramInfo_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(program, param_name, param_value_size, param_value, param_value_size_ret);
     }
@@ -780,20 +772,17 @@
     }
 }
 
-cl_kernel clCreateKernel(
-    cl_program  program,
-    const char *kernel_name,
-    cl_int     *errcode_ret)
+cl_kernel clCreateKernel(cl_program program, const char *kernel_name, cl_int *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clCreateKernel_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(program, kernel_name, errcode_ret);
     }
     else
     {
-        if(errcode_ret != nullptr)
+        if (errcode_ret != nullptr)
         {
             *errcode_ret = CL_OUT_OF_RESOURCES;
         }
@@ -805,7 +794,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clRetainKernel_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(kernel);
     }
@@ -815,22 +804,17 @@
     }
 }
 
-cl_mem clCreateBuffer(
-    cl_context   context,
-    cl_mem_flags flags,
-    size_t       size,
-    void        *host_ptr,
-    cl_int      *errcode_ret)
+cl_mem clCreateBuffer(cl_context context, cl_mem_flags flags, size_t size, void *host_ptr, cl_int *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clCreateBuffer_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(context, flags, size, host_ptr, errcode_ret);
     }
     else
     {
-        if(errcode_ret != nullptr)
+        if (errcode_ret != nullptr)
         {
             *errcode_ret = CL_OUT_OF_RESOURCES;
         }
@@ -839,21 +823,17 @@
 }
 
 cl_program clCreateProgramWithSource(
-    cl_context    context,
-    cl_uint       count,
-    const char **strings,
-    const size_t *lengths,
-    cl_int       *errcode_ret)
+    cl_context context, cl_uint count, const char **strings, const size_t *lengths, cl_int *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clCreateProgramWithSource_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(context, count, strings, lengths, errcode_ret);
     }
     else
     {
-        if(errcode_ret != nullptr)
+        if (errcode_ret != nullptr)
         {
             *errcode_ret = CL_OUT_OF_RESOURCES;
         }
@@ -865,7 +845,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clReleaseKernel_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(kernel);
     }
@@ -878,12 +858,12 @@
 cl_int clGetDeviceIDs(cl_platform_id platform,
                       cl_device_type device_type,
                       cl_uint        num_entries,
-                      cl_device_id *devices,
+                      cl_device_id  *devices,
                       cl_uint       *num_devices)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clGetDeviceIDs_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(platform, device_type, num_entries, devices, num_devices);
     }
@@ -901,7 +881,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clGetDeviceInfo_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(device, param_name, param_value_size, param_value, param_value_size_ret);
     }
@@ -911,15 +891,12 @@
     }
 }
 
-cl_int clGetMemObjectInfo(cl_mem      memobj,
-                          cl_mem_info param_name,
-                          size_t      param_value_size,
-                          void       *param_value,
-                          size_t     *param_value_size_ret)
+cl_int clGetMemObjectInfo(
+    cl_mem memobj, cl_mem_info param_name, size_t param_value_size, void *param_value, size_t *param_value_size_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clGetMemObjectInfo_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(memobj, param_name, param_value_size, param_value, param_value_size_ret);
     }
@@ -933,7 +910,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clRetainEvent_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(event);
     }
@@ -951,7 +928,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clGetPlatformInfo_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(platform, param_name, param_value_size, param_value, param_value_size_ret);
     }
@@ -965,7 +942,7 @@
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clGetPlatformIDs_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(num_entries, platforms, num_platforms);
     }
@@ -975,17 +952,16 @@
     }
 }
 
-cl_int
-clGetKernelWorkGroupInfo(cl_kernel                 kernel,
-                         cl_device_id              device,
-                         cl_kernel_work_group_info param_name,
-                         size_t                    param_value_size,
-                         void                     *param_value,
-                         size_t                   *param_value_size_ret)
+cl_int clGetKernelWorkGroupInfo(cl_kernel                 kernel,
+                                cl_device_id              device,
+                                cl_kernel_work_group_info param_name,
+                                size_t                    param_value_size,
+                                void                     *param_value,
+                                size_t                   *param_value_size_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clGetKernelWorkGroupInfo_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(kernel, device, param_name, param_value_size, param_value, param_value_size_ret);
     }
@@ -995,16 +971,15 @@
     }
 }
 
-cl_int
-clGetCommandQueueInfo(cl_command_queue      command_queue,
-                      cl_command_queue_info param_name,
-                      size_t                param_value_size,
-                      void                 *param_value,
-                      size_t               *param_value_size_ret)
+cl_int clGetCommandQueueInfo(cl_command_queue      command_queue,
+                             cl_command_queue_info param_name,
+                             size_t                param_value_size,
+                             void                 *param_value,
+                             size_t               *param_value_size_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clGetCommandQueueInfo_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(command_queue, param_name, param_value_size, param_value, param_value_size_ret);
     }
@@ -1014,16 +989,15 @@
     }
 }
 
-cl_int
-clGetKernelInfo(cl_kernel      kernel,
-                cl_kernel_info param_name,
-                size_t         param_value_size,
-                void          *param_value,
-                size_t        *param_value_size_ret)
+cl_int clGetKernelInfo(cl_kernel      kernel,
+                       cl_kernel_info param_name,
+                       size_t         param_value_size,
+                       void          *param_value,
+                       size_t        *param_value_size_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clGetKernelInfo_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(kernel, param_name, param_value_size, param_value, param_value_size_ret);
     }
@@ -1033,16 +1007,15 @@
     }
 }
 
-cl_int
-clGetEventProfilingInfo(cl_event          event,
-                        cl_profiling_info param_name,
-                        size_t            param_value_size,
-                        void             *param_value,
-                        size_t           *param_value_size_ret)
+cl_int clGetEventProfilingInfo(cl_event          event,
+                               cl_profiling_info param_name,
+                               size_t            param_value_size,
+                               void             *param_value,
+                               size_t           *param_value_size_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clGetEventProfilingInfo_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(event, param_name, param_value_size, param_value, param_value_size_ret);
     }
@@ -1052,23 +1025,22 @@
     }
 }
 
-cl_mem
-clCreateImage(cl_context             context,
-              cl_mem_flags           flags,
-              const cl_image_format *image_format,
-              const cl_image_desc   *image_desc,
-              void                  *host_ptr,
-              cl_int                *errcode_ret)
+cl_mem clCreateImage(cl_context             context,
+                     cl_mem_flags           flags,
+                     const cl_image_format *image_format,
+                     const cl_image_desc   *image_desc,
+                     void                  *host_ptr,
+                     cl_int                *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clCreateImage_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(context, flags, image_format, image_desc, host_ptr, errcode_ret);
     }
     else
     {
-        if(errcode_ret != nullptr)
+        if (errcode_ret != nullptr)
         {
             *errcode_ret = CL_OUT_OF_RESOURCES;
         }
@@ -1076,14 +1048,12 @@
     }
 }
 
-cl_int clSetKernelExecInfo(cl_kernel           kernel,
-                           cl_kernel_exec_info param_name,
-                           size_t              param_value_size,
-                           const void         *param_value)
+cl_int
+clSetKernelExecInfo(cl_kernel kernel, cl_kernel_exec_info param_name, size_t param_value_size, const void *param_value)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clSetKernelExecInfo_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(kernel, param_name, param_value_size, param_value);
     }
@@ -1093,22 +1063,21 @@
     }
 }
 
-cl_command_buffer_khr clCreateCommandBufferKHR(
-    cl_uint num_queues,
-    const cl_command_queue* queues,
-    const cl_command_buffer_properties_khr* properties,
-    cl_int* errcode_ret)
+cl_command_buffer_khr clCreateCommandBufferKHR(cl_uint                                 num_queues,
+                                               const cl_command_queue                 *queues,
+                                               const cl_command_buffer_properties_khr *properties,
+                                               cl_int                                 *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     const auto func = arm_compute::CLSymbols::get().clCreateCommandBufferKHR_ptr;
 
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(num_queues, queues, properties, errcode_ret);
     }
     else
     {
-        if(errcode_ret != nullptr)
+        if (errcode_ret != nullptr)
         {
             *errcode_ret = CL_INVALID_OPERATION;
         }
@@ -1122,7 +1091,7 @@
     arm_compute::CLSymbols::get().load_default();
     const auto func = arm_compute::CLSymbols::get().clFinalizeCommandBufferKHR_ptr;
 
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(command_buffer);
     }
@@ -1137,7 +1106,7 @@
     arm_compute::CLSymbols::get().load_default();
     const auto func = arm_compute::CLSymbols::get().clRetainCommandBufferKHR_ptr;
 
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(command_buffer);
     }
@@ -1152,7 +1121,7 @@
     arm_compute::CLSymbols::get().load_default();
     const auto func = arm_compute::CLSymbols::get().clReleaseCommandBufferKHR_ptr;
 
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(command_buffer);
     }
@@ -1162,18 +1131,17 @@
     }
 }
 
-cl_int clEnqueueCommandBufferKHR(
-    cl_uint num_queues,
-    cl_command_queue* queues,
-    cl_command_buffer_khr command_buffer,
-    cl_uint num_events_in_wait_list,
-    const cl_event* event_wait_list,
-    cl_event* event)
+cl_int clEnqueueCommandBufferKHR(cl_uint               num_queues,
+                                 cl_command_queue     *queues,
+                                 cl_command_buffer_khr command_buffer,
+                                 cl_uint               num_events_in_wait_list,
+                                 const cl_event       *event_wait_list,
+                                 cl_event             *event)
 {
     arm_compute::CLSymbols::get().load_default();
     const auto func = arm_compute::CLSymbols::get().clEnqueueCommandBufferKHR_ptr;
 
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(num_queues, queues, command_buffer, num_events_in_wait_list, event_wait_list, event);
     }
@@ -1183,27 +1151,26 @@
     }
 }
 
-
-cl_int clCommandNDRangeKernelKHR(
-    cl_command_buffer_khr command_buffer,
-    cl_command_queue command_queue,
-    const cl_ndrange_kernel_command_properties_khr* properties,
-    cl_kernel kernel,
-    cl_uint work_dim,
-    const size_t* global_work_offset,
-    const size_t* global_work_size,
-    const size_t* local_work_size,
-    cl_uint num_sync_points_in_wait_list,
-    const cl_sync_point_khr* sync_point_wait_list,
-    cl_sync_point_khr* sync_point,
-    cl_mutable_command_khr* mutable_handle)
+cl_int clCommandNDRangeKernelKHR(cl_command_buffer_khr                           command_buffer,
+                                 cl_command_queue                                command_queue,
+                                 const cl_ndrange_kernel_command_properties_khr *properties,
+                                 cl_kernel                                       kernel,
+                                 cl_uint                                         work_dim,
+                                 const size_t                                   *global_work_offset,
+                                 const size_t                                   *global_work_size,
+                                 const size_t                                   *local_work_size,
+                                 cl_uint                                         num_sync_points_in_wait_list,
+                                 const cl_sync_point_khr                        *sync_point_wait_list,
+                                 cl_sync_point_khr                              *sync_point,
+                                 cl_mutable_command_khr                         *mutable_handle)
 {
     arm_compute::CLSymbols::get().load_default();
     const auto func = arm_compute::CLSymbols::get().clCommandNDRangeKernelKHR_ptr;
 
-    if(func != nullptr)
+    if (func != nullptr)
     {
-        return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle);
+        return func(command_buffer, command_queue, properties, kernel, work_dim, global_work_offset, global_work_size,
+                    local_work_size, num_sync_points_in_wait_list, sync_point_wait_list, sync_point, mutable_handle);
     }
     else
     {
@@ -1211,14 +1178,13 @@
     }
 }
 
-cl_int clUpdateMutableCommandsKHR(
-    cl_command_buffer_khr command_buffer,
-    const cl_mutable_base_config_khr* mutable_config)
+cl_int clUpdateMutableCommandsKHR(cl_command_buffer_khr             command_buffer,
+                                  const cl_mutable_base_config_khr *mutable_config)
 {
     arm_compute::CLSymbols::get().load_default();
     const auto func = arm_compute::CLSymbols::get().clUpdateMutableCommandsKHR_ptr;
 
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(command_buffer, mutable_config);
     }
@@ -1228,23 +1194,22 @@
     }
 }
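
For context, the cl_khr_command_buffer wrappers reformatted above are designed to be used together in a record-once, replay-many pattern. A minimal illustrative sketch, not part of this patch: queue, kernel, gws and lws are assumed to already exist, error checking is omitted, and the exact argument contracts are defined by the extension specification:

    cl_int err = CL_SUCCESS;
    cl_command_buffer_khr cmd_buf = clCreateCommandBufferKHR(1, &queue, nullptr, &err);

    // Record one ND-range dispatch; no sync points and no mutable handle requested.
    clCommandNDRangeKernelKHR(cmd_buf, nullptr, nullptr, kernel, 2, nullptr, gws, lws, 0, nullptr, nullptr, nullptr);

    // Freeze the buffer, replay it, then drop the reference.
    clFinalizeCommandBufferKHR(cmd_buf);
    clEnqueueCommandBufferKHR(0, nullptr, cmd_buf, 0, nullptr, nullptr);
    clReleaseCommandBufferKHR(cmd_buf);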
 
-cl_mem
-clImportMemoryARM(cl_context                      context,
-                  cl_mem_flags                    flags,
-                  const cl_import_properties_arm *properties,
-                  void                           *memory,
-                  size_t                          size,
-                  cl_int                         *errcode_ret)
+cl_mem clImportMemoryARM(cl_context                      context,
+                         cl_mem_flags                    flags,
+                         const cl_import_properties_arm *properties,
+                         void                           *memory,
+                         size_t                          size,
+                         cl_int                         *errcode_ret)
 {
     arm_compute::CLSymbols::get().load_default();
     auto func = arm_compute::CLSymbols::get().clImportMemoryARM_ptr;
-    if(func != nullptr)
+    if (func != nullptr)
     {
         return func(context, flags, properties, memory, size, errcode_ret);
     }
     else
     {
-        if(errcode_ret != nullptr)
+        if (errcode_ret != nullptr)
         {
             *errcode_ret = CL_OUT_OF_RESOURCES;
         }
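
Every wrapper in this file follows the same runtime-dispatch shape, which is why the whitespace-only changes above repeat so uniformly. A minimal sketch of that shape (illustrative only; clExampleFunc and its _ptr member are hypothetical names, not real symbols):

    extern "C" cl_mem clExampleFunc(cl_context context, cl_int *errcode_ret)
    {
        // Lazily load the OpenCL library and resolve its symbols on first use.
        arm_compute::CLSymbols::get().load_default();
        auto func = arm_compute::CLSymbols::get().clExampleFunc_ptr; // hypothetical member

        if (func != nullptr)
        {
            return func(context, errcode_ret); // forward to the real driver entry point
        }

        // Symbol unavailable: fail through the optional out-parameter.
        if (errcode_ret != nullptr)
        {
            *errcode_ret = CL_OUT_OF_RESOURCES;
        }
        return nullptr;
    }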
diff --git a/src/core/CL/cl_kernels/activation_float_helpers.h b/src/core/CL/cl_kernels/activation_float_helpers.h
index 3f93c8d..02faae2 100644
--- a/src/core/CL/cl_kernels/activation_float_helpers.h
+++ b/src/core/CL/cl_kernels/activation_float_helpers.h
@@ -31,7 +31,8 @@
 #endif // GPU_ARCH == GPU_ARCH_BIFROST
 
 // Hard-Swish
-#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
+#define hard_swish_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+    (x * ((min(max((x + (DATA_TYPE)3.0), (DATA_TYPE)0.0), (DATA_TYPE)6.0)) * (DATA_TYPE)0.166666667))
 
 // Logistic Activation
 #define logistic_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((DATA_TYPE)1.0 / ((DATA_TYPE)1.0 + exp(-x)))
@@ -49,13 +50,16 @@
 #define lu_brelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
 
 // Leaky RELU Activation
-#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
+#define lrelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+    ((min(x, (DATA_TYPE)0.0) * (DATA_TYPE)A_VAL) + max(x, (DATA_TYPE)0.0))
 
 // Soft RELU Activation
 #define srelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (log((DATA_TYPE)1.0 + exp(x)))
 
 // ELU Activation
-#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
+#define elu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL)           \
+    (select(((DATA_TYPE)A_VAL * (exp(x) - (DATA_TYPE)1.0)), x, \
+            (SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))isgreaterequal(x, (DATA_TYPE)0.0)))
 
 // Absolute Activation
 #define abs_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (fabs(x))
@@ -70,7 +74,8 @@
 #define linear_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (MLA((DATA_TYPE)B_VAL, (DATA_TYPE)A_VAL, x))
 
 // GELU Activation
-#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
+#define gelu_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) \
+    (x * (DATA_TYPE)0.5 * ((DATA_TYPE)1.0 + erf(x / (DATA_TYPE)1.41421356237)))
 
 // Identity Activation
 #define identity_op(DATA_TYPE, VEC_SIZE, x, A_VAL, B_VAL) (x)
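
As a usage note (illustrative, not part of this patch): each *_op macro expands in place to a vector expression, with DATA_TYPE the scalar element type; A_VAL and B_VAL are simply ignored by parameterless activations such as hard-swish and GELU:

    float4 x = vload4(0, src);                         // src: an assumed __global float * pointer
    float4 h = hard_swish_op(float, 4, x, 0.0f, 0.0f); // x * clamp(x + 3, 0, 6) * (1 / 6)
    float4 g = gelu_op(float, 4, x, 0.0f, 0.0f);       // x * 0.5 * (1 + erf(x / sqrt(2)))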
diff --git a/src/core/CL/cl_kernels/activation_quant_helpers.h b/src/core/CL/cl_kernels/activation_quant_helpers.h
index c420578..c758ff1 100644
--- a/src/core/CL/cl_kernels/activation_quant_helpers.h
+++ b/src/core/CL/cl_kernels/activation_quant_helpers.h
@@ -60,17 +60,17 @@
 }
 
 #define ACTIVATION_OP2(op, x) op##_op(x)
-#define ACTIVATION_OP(op, x) ACTIVATION_OP2(op, x)
+#define ACTIVATION_OP(op, x)  ACTIVATION_OP2(op, x)
 
 #if defined(S1_VAL) && defined(S2_VAL)
 #if defined(O1_VAL) && defined(O2_VAL)
 #define PERFORM_ACTIVATION_QUANT(act, data)                                                       \
     ({                                                                                            \
         data = ACTIVATION_OP(act, data);                                                          \
-        \
+                                                                                                  \
         VEC_DATA_TYPE(float, VEC_SIZE)                                                            \
         fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));                                    \
-        \
+                                                                                                  \
         fdata = round((fdata - (float)O1_VAL) * ((float)S1_VAL / (float)S2_VAL) + (float)O2_VAL); \
         data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));                           \
     })
@@ -78,17 +78,14 @@
 #define PERFORM_ACTIVATION_QUANT(act, data)                             \
     ({                                                                  \
         data = ACTIVATION_OP(act, data);                                \
-        \
+                                                                        \
         VEC_DATA_TYPE(float, VEC_SIZE)                                  \
         fdata = CONVERT(data, VEC_DATA_TYPE(float, VEC_SIZE));          \
-        \
+                                                                        \
         fdata = round((fdata) * ((float)S1_VAL / (float)S2_VAL));       \
         data  = CONVERT_SAT(fdata, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); \
     })
 #endif /* defined(O1_VAL) && defined(O2_VAL) */
 #else  /* defined(S1_VAL) && defined(S2_VAL) */
-#define PERFORM_ACTIVATION_QUANT(act, data) \
-    ({                                      \
-        data = ACTIVATION_OP(act, data);    \
-    })
+#define PERFORM_ACTIVATION_QUANT(act, data) ({ data = ACTIVATION_OP(act, data); })
 #endif /* defined(S1_VAL) && defined(S2_VAL) */
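
For reference, the two-scale branch above implements the usual requantisation step q_out = round((q_in - O1) * S1 / S2 + O2), saturated to DATA_TYPE by CONVERT_SAT. A worked example with illustrative values only:

    // With input scale/offset S1_VAL = 0.5, O1_VAL = 10 and output scale/offset
    // S2_VAL = 0.25, O2_VAL = 128, an activated quantised value of 14 becomes:
    //   real value  : (14 - 10) * 0.5         = 2.0
    //   requantised : round(2.0 / 0.25) + 128 = 136
    // i.e. round((14 - 10) * (0.5 / 0.25) + 128) = 136, as computed by the macro.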
diff --git a/src/core/CL/cl_kernels/gemm_helpers.h b/src/core/CL/cl_kernels/gemm_helpers.h
index 0e938cb..4bef023 100644
--- a/src/core/CL/cl_kernels/gemm_helpers.h
+++ b/src/core/CL/cl_kernels/gemm_helpers.h
@@ -34,14 +34,14 @@
  *
  */
 #define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
-#define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
+#define SCALAR_ACCESS(offset, n0, x)     SCALAR_ACCESS_STR(offset, n0, x)
 
 // offset == 0
-#define scalar_access_0_1(x) ((x).s0)
-#define scalar_access_0_2(x) ((x).s01)
-#define scalar_access_0_3(x) ((x).s012)
-#define scalar_access_0_4(x) ((x).s0123)
-#define scalar_access_0_8(x) ((x).s01234567)
+#define scalar_access_0_1(x)  ((x).s0)
+#define scalar_access_0_2(x)  ((x).s01)
+#define scalar_access_0_3(x)  ((x).s012)
+#define scalar_access_0_4(x)  ((x).s0123)
+#define scalar_access_0_8(x)  ((x).s01234567)
 #define scalar_access_0_16(x) ((x).s0123456789ABCDEF)
 
 // offset == 1
@@ -100,8 +100,7 @@
  * @param[in] Z          The z-axis offset vector
  * @{
  */
-#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
-    ({})
+#define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) ({})
 
 #define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
     SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
@@ -186,8 +185,10 @@
  * @param[in] Z          The z-axis offset vector
  * @{
  */
-#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
-#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+#define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
+#define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
+    LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
 /** @} */ // end of group LOAD_TENSOR
 
 /** Load 2D tensor (consecutive rows and columns) with Z offset.
@@ -202,8 +203,7 @@
  * @param[in] Z         The z-axis offset vector
  * @{
  */
-#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
-    ({})
+#define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) ({})
 
 #define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
     LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
@@ -279,8 +279,10 @@
  * @param[in] Z         The z-axis offset vector
  * @{
  */
-#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 /** @}*/ // end of group LOAD_TENSOR_M0XN0
 
 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
@@ -395,8 +397,10 @@
  * @param[in] Z         The z-axis offset vector
  * @{
  */
-#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
-#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
 /** @} */ // end of group LOAD_BLOCK
 
 /** Partially load the 0 to (n-1)th rows of the given variables
@@ -517,8 +521,10 @@
  * @param[in] Z         The offset in z-axis direction
  * @{
  */
-#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
-#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
+#define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
+    LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
 /** Load a block that can be partial in both x and y dimensions
  *
 * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring a small performance penalty.
@@ -541,22 +547,23 @@
  * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
  * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
  */
-#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
-    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                                   \
-    {                                                                                                                                                            \
-        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                                           \
-    }                                                                                                                                                            \
-    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                               \
-    {                                                                                                                                                            \
-        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
-    }                                                                                                                                                            \
-    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                               \
-    {                                                                                                                                                            \
-        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                                             \
-    }                                                                                                                                                            \
-    else                                                                                                                                                         \
-    {                                                                                                                                                            \
-        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                               \
+#define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0,     \
+                                      PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)                            \
+    if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                    \
+    {                                                                                                              \
+        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                             \
+    }                                                                                                              \
+    else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                \
+    {                                                                                                              \
+        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);               \
+    }                                                                                                              \
+    else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                \
+    {                                                                                                              \
+        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);               \
+    }                                                                                                              \
+    else                                                                                                           \
+    {                                                                                                              \
+        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
     }
 /** Load a block that can only be partial in x but not y.
  *
@@ -578,14 +585,15 @@
  * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
  * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
  */
-#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
-    if(!(PARTIAL_COND_X))                                                                                                \
-    {                                                                                                                    \
-        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
-    }                                                                                                                    \
-    else                                                                                                                 \
-    {                                                                                                                    \
-        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
+#define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, \
+                                PARTIAL_COND_X)                                                          \
+    if (!(PARTIAL_COND_X))                                                                               \
+    {                                                                                                    \
+        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                   \
+    }                                                                                                    \
+    else                                                                                                 \
+    {                                                                                                    \
+        LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);     \
     }
 /** Load a block that can only be partial in y but not x.
  *
@@ -607,14 +615,15 @@
  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
 * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
  */
-#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
-    if(!(PARTIAL_COND_Y))                                                                                                \
-    {                                                                                                                    \
-        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                                   \
-    }                                                                                                                    \
-    else                                                                                                                 \
-    {                                                                                                                    \
-        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                     \
+#define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+                                PARTIAL_COND_Y)                                                          \
+    if (!(PARTIAL_COND_Y))                                                                               \
+    {                                                                                                    \
+        LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);                   \
+    }                                                                                                    \
+    else                                                                                                 \
+    {                                                                                                    \
+        LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z);     \
     }
 /** @} */ // end of group LOAD_BLOCK_PARTIAL
 /** Boundary-aware GeMM block load
@@ -676,28 +685,33 @@
  */
 #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
 // Case1: No partial blocks in either x or y
-#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+                                  PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)                        \
     LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
 
 #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
 // Case2: Partial blocks in y
-#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+                                  PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)                        \
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                               \
     LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
 
 #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
 // Case3: Partial blocks in x
-#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+                                  PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)                        \
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                               \
     LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
 
 #else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
 // Case4: Partial blocks in both x and y
-#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
-    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                                                                                 \
-    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
+#define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+                                  PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)                        \
+    REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0);                               \
+    LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, \
+                                  PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
 
-#endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
+#endif    // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
 /** @} */ // end of group LOAD_BLOCK_BOUNDARY_AWARE
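
A concrete illustration of the four cases (values are hypothetical): loading 4x4 blocks (M0 = N0 = 4) from a 10-row x 9-column tensor leaves partial edges of 10 % 4 == 2 rows and 9 % 4 == 1 column, so the kernel would be compiled with PARTIAL_STORE_M0 = 2 and PARTIAL_STORE_N0 = 1 (Case4 above). PARTIAL_COND_Y and PARTIAL_COND_X are then evaluated per work-item, so only blocks on the tensor edges take the partial-load paths.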
 
 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
@@ -795,8 +809,10 @@
  * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
  * @{
  */
-#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
-#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
+#define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
+#define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
+    LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
 /** @} */ // end of group LOAD_TEXTURE2D
 
 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) passing the Y index for each row to be loaded.
@@ -815,7 +831,7 @@
 #define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)      \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##0;                                                                            \
-    if(Y_MASK##0 != 0)                                                                      \
+    if (Y_MASK##0 != 0)                                                                     \
         BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##0 * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##0 = 0;
@@ -824,7 +840,7 @@
     LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##1;                                                                            \
-    if(Y_MASK##1 != 0)                                                                      \
+    if (Y_MASK##1 != 0)                                                                     \
         BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##1 * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##1 = 0;
@@ -833,7 +849,7 @@
     LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##2;                                                                            \
-    if(Y_MASK##2 != 0)                                                                      \
+    if (Y_MASK##2 != 0)                                                                     \
         BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##2 * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##2 = 0;
@@ -842,7 +858,7 @@
     LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##3;                                                                            \
-    if(Y_MASK##3 != 0)                                                                      \
+    if (Y_MASK##3 != 0)                                                                     \
         BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##3 * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##3 = 0;
@@ -851,7 +867,7 @@
     LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##4;                                                                            \
-    if(Y_MASK##4 != 0)                                                                      \
+    if (Y_MASK##4 != 0)                                                                     \
         BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##4 * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##4 = 0;
@@ -860,7 +876,7 @@
     LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##5;                                                                            \
-    if(Y_MASK##5 != 0)                                                                      \
+    if (Y_MASK##5 != 0)                                                                     \
         BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##5 * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##5 = 0;
@@ -869,7 +885,7 @@
     LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##6;                                                                            \
-    if(Y_MASK##6 != 0)                                                                      \
+    if (Y_MASK##6 != 0)                                                                     \
         BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##6 * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##6 = 0;
@@ -878,7 +894,7 @@
     LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##7;                                                                            \
-    if(Y_MASK##7 != 0)                                                                      \
+    if (Y_MASK##7 != 0)                                                                     \
         BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##7 * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##7 = 0;
@@ -887,7 +903,7 @@
     LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##8;                                                                            \
-    if(Y_MASK##8 != 0)                                                                      \
+    if (Y_MASK##8 != 0)                                                                     \
         BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##8 * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##8 = 0;
@@ -896,7 +912,7 @@
     LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)          \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##9;                                                                            \
-    if(Y_MASK##9 != 0)                                                                      \
+    if (Y_MASK##9 != 0)                                                                     \
         BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##9 * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##9 = 0;
@@ -905,7 +921,7 @@
     LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##A;                                                                            \
-    if(Y_MASK##A != 0)                                                                      \
+    if (Y_MASK##A != 0)                                                                     \
         BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##A * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##A = 0;
@@ -914,7 +930,7 @@
     LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##B;                                                                            \
-    if(Y_MASK##B != 0)                                                                      \
+    if (Y_MASK##B != 0)                                                                     \
         BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##B * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##B = 0;
@@ -923,7 +939,7 @@
     LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##C;                                                                            \
-    if(Y_MASK##C != 0)                                                                      \
+    if (Y_MASK##C != 0)                                                                     \
         BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##C * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##C = 0;
@@ -932,7 +948,7 @@
     LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##D;                                                                            \
-    if(Y_MASK##D != 0)                                                                      \
+    if (Y_MASK##D != 0)                                                                     \
         BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##D * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##D = 0;
@@ -941,7 +957,7 @@
     LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##E;                                                                            \
-    if(Y_MASK##E != 0)                                                                      \
+    if (Y_MASK##E != 0)                                                                     \
         BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##E * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##E = 0;
@@ -950,7 +966,7 @@
     LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)         \
     VEC_DATA_TYPE(DATA_TYPE, N0)                                                            \
     BASENAME##F;                                                                            \
-    if(Y_MASK##F != 0)                                                                      \
+    if (Y_MASK##F != 0)                                                                     \
         BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + Y##F * STRIDE_Y)); \
     else                                                                                    \
         BASENAME##F = 0;
@@ -976,8 +992,10 @@
  * @param[in] Y_MASK    The y-axis mask vector. If 0, forces BASENAMEn to 0
  * @{
  */
-#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
-#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
+#define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+    LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
+#define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
+    LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
 /** @} */ // end of group LOAD_BLOCK_INDIRECT
 
 /** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
@@ -1088,8 +1106,10 @@
  * @param[in] STRIDE_Y  The stride in y-axis direction
  * @{
  */
-#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
-#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+#define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+    LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
+#define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
+    LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
 /** @} */ // end of group LOAD_SCALAR_AS_VECTOR
 
 /** Basic macros to calculate Z offset values from Z0 to Zn-1
@@ -1187,8 +1207,10 @@
  * @param[in] STRIDE_Y        The stride value in y-axis direction
  * @{
  */
-#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
-#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
+#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
+    CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
 /** @} */ // end of group CALCULATE_Z_OFFSET
 
 /** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
@@ -1199,8 +1221,7 @@
  * @param[in] SCALE     The scale factor
  * @{
  */
-#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
-    BASENAME##0 *= (DATA_TYPE)SCALE;
+#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) BASENAME##0 *= (DATA_TYPE)SCALE;
 
 #define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
     SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE)     \
@@ -1275,7 +1296,7 @@
  * @{
  */
 #define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
-#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
+#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE)     SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
 /** @} */ // end of group SCALE_BLOCK
 
 /** Create a new vector containing the values at the given index for a set of given vectors
@@ -1287,8 +1308,7 @@
  * @param[in] TYPE     The data type of the destination vectors
  * @{
  */
-#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
-    TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
+#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
 #define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
     VEC_DATA_TYPE(TYPE, 2)                         \
     BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0).s##IDX_COL, (X##1).s##IDX_COL);
@@ -1297,13 +1317,20 @@
     BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
 #define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
     VEC_DATA_TYPE(TYPE, 4)                         \
-    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
-#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
-    VEC_DATA_TYPE(TYPE, 8)                         \
-    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
-#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
-    VEC_DATA_TYPE(TYPE, 16)                         \
-    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
+    BASENAME##IDX_COL =                            \
+        (VEC_DATA_TYPE(TYPE, 4))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
+#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE)                                                           \
+    VEC_DATA_TYPE(TYPE, 8)                                                                                   \
+    BASENAME##IDX_COL =                                                                                      \
+        (VEC_DATA_TYPE(TYPE, 8))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
+                                 (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
+#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE)                                                           \
+    VEC_DATA_TYPE(TYPE, 16)                                                                                   \
+    BASENAME##IDX_COL =                                                                                       \
+        (VEC_DATA_TYPE(TYPE, 16))((X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
+                                  (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, \
+                                  (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, \
+                                  (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
 /** @} */ // end of group COLUMN_VECTORn
 
 /** Create a new vector containing the values at the given index. Utility macros for transposing a column-vector
@@ -1315,8 +1342,7 @@
  * @param[in] TYPE     The data type of the destination vectors
  * @{
  */
-#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
-    TYPE BASENAME##IDX_COL = (TYPE)((X##0));
+#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) TYPE BASENAME##IDX_COL = (TYPE)((X##0));
 #define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
     VEC_DATA_TYPE(TYPE, 2)                                \
     BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))((X##0), (X##1));
@@ -1329,9 +1355,10 @@
 #define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
     VEC_DATA_TYPE(TYPE, 8)                                \
     BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
-#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
-    VEC_DATA_TYPE(TYPE, 16)                                \
-    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
+#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE)                                                        \
+    VEC_DATA_TYPE(TYPE, 16)                                                                                       \
+    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))((X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \
+                                                  (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
 /** @} */ // end of group COLUMN_VECTOR_SCALARn
 
 /** Create transposed vectors of the given vectors
@@ -1343,8 +1370,7 @@
  * @param[in] TYPE     The data type of the transposed vectors
  * @{
  */
-#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
-    COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
+#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
 #define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
     COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE);  \
     COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
@@ -1417,8 +1443,7 @@
  * @param[in] BIAS     The basename of the added variables
  * @{
  */
-#define ADD_ROW_1(BASENAME, BIAS) \
-    BASENAME##0 += BIAS##0;
+#define ADD_ROW_1(BASENAME, BIAS) BASENAME##0 += BIAS##0;
 
 #define ADD_ROW_2(BASENAME, BIAS) \
     ADD_ROW_1(BASENAME, BIAS)     \
@@ -1493,7 +1518,7 @@
  * @{
  */
 #define ADD_BLOCK_STR(N, BASENAME, BIAS) ADD_ROW_##N(BASENAME, BIAS)
-#define ADD_BLOCK(N, BASENAME, BIAS) ADD_BLOCK_STR(N, BASENAME, BIAS)
+#define ADD_BLOCK(N, BASENAME, BIAS)     ADD_BLOCK_STR(N, BASENAME, BIAS)
 /** @} */ // end of group ADD_BLOCK
 
 /** Broadcast (add a single value) to each element of the destination variables
@@ -1503,8 +1528,7 @@
  * @param[in] BIAS     The variable containing the value to add
  * @{
  */
-#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
-    BASENAME##0 += BIAS;
+#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) BASENAME##0 += BIAS;
 
 #define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
     ADD_ROW_BROADCAST_1(BASENAME, BIAS)     \
@@ -1578,7 +1602,7 @@
  * @{
  */
 #define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
-#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
+#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS)     ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
 /** @} */ // end of group ADD_BLOCK_BROADCAST
 
 /** Apply activation to the given variables
@@ -1668,8 +1692,10 @@
  * @param[in] B_VAL           Additional value required by the activation
  * @{
  */
-#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
-#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
+#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
+    ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
 /** @} */ // end of group ACTIVATION_BLOCK
 
 /** Apply convert_<data_type> to the given variables
@@ -1773,6 +1799,8 @@
  * @param[in] BASENAME_DST The basename of the destination variables
  * @{
  */
-#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
-#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
+#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
+    CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
 /** @} */ // end of group CONVERT_BLOCK
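
A general note on the FOO / FOO_STR pairs reformatted throughout this header: the extra level of indirection is required because ## pastes its operands before they are macro-expanded. A minimal standalone sketch (hypothetical names, not part of this patch):

    #define LOAD_ROW_2(x)   load_two_rows(x) // hypothetical expansion target
    #define LOAD_STR(M0, x) LOAD_ROW_##M0(x) // pastes after M0 has been expanded
    #define LOAD(M0, x)     LOAD_STR(M0, x)  // expands its arguments, then delegates

    #define M0 2
    // LOAD(M0, p)     -> LOAD_STR(2, p) -> LOAD_ROW_2(p) -> load_two_rows(p)
    // LOAD_STR(M0, p) -> LOAD_ROW_M0(p), which never expands: pasting happened too early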
diff --git a/src/core/CL/cl_kernels/helpers.h b/src/core/CL/cl_kernels/helpers.h
index b2ceaf9..87a1875 100644
--- a/src/core/CL/cl_kernels/helpers.h
+++ b/src/core/CL/cl_kernels/helpers.h
@@ -81,11 +81,11 @@
  * @return The reversed vector
  * @{
  */
-#define REV1(x) ((x))
-#define REV2(x) ((x).s10)
-#define REV3(x) ((x).s210)
-#define REV4(x) ((x).s3210)
-#define REV8(x) ((x).s76543210)
+#define REV1(x)  ((x))
+#define REV2(x)  ((x).s10)
+#define REV3(x)  ((x).s210)
+#define REV4(x)  ((x).s3210)
+#define REV8(x)  ((x).s76543210)
 #define REV16(x) ((x).sFEDCBA9876543210)
 /** @} */ // end of group REVn
 
@@ -99,7 +99,7 @@
  * @{
  */
 #define REVERSE_STR(x, s) REV##s((x))
-#define REVERSE(x, s) REVERSE_STR(x, s)
+#define REVERSE(x, s)     REVERSE_STR(x, s)
 /** @} */ // end of group REVERSE
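
Illustrative usage (not part of this patch):

    uchar4 v = (uchar4)(1, 2, 3, 4);
    uchar4 r = REVERSE(v, 4); // expands to REV4(v) == (v).s3210 == (4, 3, 2, 1)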
 
 /** Circular-right-shift (rotate-right) the vector of size s by the amount of n.
@@ -138,16 +138,16 @@
 #define ROT8_7(x) ((x).s12345670)
 #define ROT8_8(x) ((x))
 
-#define ROT16_0(x) ((x))
-#define ROT16_1(x) ((x).sF0123456789ABCDE)
-#define ROT16_2(x) ((x).sEF0123456789ABCD)
-#define ROT16_3(x) ((x).sDEF0123456789ABC)
-#define ROT16_4(x) ((x).sCDEF0123456789AB)
-#define ROT16_5(x) ((x).sBCDEF0123456789A)
-#define ROT16_6(x) ((x).sABCDEF0123456789)
-#define ROT16_7(x) ((x).s9ABCDEF012345678)
-#define ROT16_8(x) ((x).s89ABCDEF01234567)
-#define ROT16_9(x) ((x).s789ABCDEF0123456)
+#define ROT16_0(x)  ((x))
+#define ROT16_1(x)  ((x).sF0123456789ABCDE)
+#define ROT16_2(x)  ((x).sEF0123456789ABCD)
+#define ROT16_3(x)  ((x).sDEF0123456789ABC)
+#define ROT16_4(x)  ((x).sCDEF0123456789AB)
+#define ROT16_5(x)  ((x).sBCDEF0123456789A)
+#define ROT16_6(x)  ((x).sABCDEF0123456789)
+#define ROT16_7(x)  ((x).s9ABCDEF012345678)
+#define ROT16_8(x)  ((x).s89ABCDEF01234567)
+#define ROT16_9(x)  ((x).s789ABCDEF0123456)
 #define ROT16_10(x) ((x).s6789ABCDEF012345)
 #define ROT16_11(x) ((x).s56789ABCDEF01234)
 #define ROT16_12(x) ((x).s456789ABCDEF0123)
@@ -168,7 +168,7 @@
  * @{
  */
 #define ROTATE_STR(x, s, n) ROT##s##_##n(x)
-#define ROTATE(x, s, n) ROTATE_STR(x, s, n)
+#define ROTATE(x, s, n)     ROTATE_STR(x, s, n)
 /** @} */ // end of group ROTATE
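
Illustrative usage (not part of this patch), assuming the ROT4_* definitions earlier in this file follow the same pattern as the ROT8_* and ROT16_* ones above:

    uchar4 v = (uchar4)(1, 2, 3, 4);
    uchar4 r = ROTATE(v, 4, 1); // expands to ROT4_1(v) == (v).s3012 == (4, 1, 2, 3)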
 
 /** Creates a vector of size n filled with offset values corresponding to the location of each element.
@@ -179,11 +179,11 @@
  * @return The vector filled with offset values
  * @{
  */
-#define V_OFFS1(dt) (dt##1)(0)
-#define V_OFFS2(dt) (dt##2)(0, 1)
-#define V_OFFS3(dt) (dt##3)(0, 1, 2)
-#define V_OFFS4(dt) (dt##4)(0, 1, 2, 3)
-#define V_OFFS8(dt) (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
+#define V_OFFS1(dt)  (dt##1)(0)
+#define V_OFFS2(dt)  (dt##2)(0, 1)
+#define V_OFFS3(dt)  (dt##3)(0, 1, 2)
+#define V_OFFS4(dt)  (dt##4)(0, 1, 2, 3)
+#define V_OFFS8(dt)  (dt##8)(0, 1, 2, 3, 4, 5, 6, 7)
 #define V_OFFS16(dt) (dt##16)(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)
 /** @} */ // end of group V_OFFSn
 
@@ -197,11 +197,11 @@
  * @{
  */
 #define VEC_OFFS_STR(dt, s) V_OFFS##s(dt)
-#define VEC_OFFS(dt, s) VEC_OFFS_STR(dt, s)
+#define VEC_OFFS(dt, s)     VEC_OFFS_STR(dt, s)
 /** @} */ // end of group VEC_OFFS
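
Illustrative usage (not part of this patch):

    int4 offs = VEC_OFFS(int, 4); // expands to V_OFFS4(int) == (int4)(0, 1, 2, 3)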
 
 #define VLOAD_STR(size) vload##size
-#define VLOAD(size) VLOAD_STR(size)
+#define VLOAD(size)     VLOAD_STR(size)
 
 /** Extended partial vload that correctly handles scalar values as well.
  * Load the **lower** 0 to (n-1)th elements of the given vector while minimising the amount of load ops
@@ -219,23 +219,23 @@
  * @{
  */
 #define VLOAD_PARTIAL_STR(size, load_size) vload_partial_##size##_##load_size
-#define VLOAD_PARTIAL(size, load_size) VLOAD_PARTIAL_STR(size, load_size)
+#define VLOAD_PARTIAL(size, load_size)     VLOAD_PARTIAL_STR(size, load_size)
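
Illustrative dispatch (not part of this patch): VLOAD_PARTIAL(size, load_size) resolves to one of the vload_partial_<size>_<load_size> aliases tabulated below, using the (data, offs, ptr) argument order of the table entries:

    float4 v = 0;
    VLOAD_PARTIAL(4, 3)(v, 0, src); // -> vload_partial_4_3 == vload_partial_3: loads only
                                    // the low three lanes (src is an assumed
                                    // __global float * pointer)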
 
 #define NO_LOAD(data, offs, ptr) \
     {                            \
     }
 
 // Size == 1 (scalar)
-#define vload_partial_1_0 NO_LOAD
-#define vload_partial_1_1 vload1
-#define vload_partial_1_2 NO_LOAD
-#define vload_partial_1_3 NO_LOAD
-#define vload_partial_1_4 NO_LOAD
-#define vload_partial_1_5 NO_LOAD
-#define vload_partial_1_6 NO_LOAD
-#define vload_partial_1_7 NO_LOAD
-#define vload_partial_1_8 NO_LOAD
-#define vload_partial_1_9 NO_LOAD
+#define vload_partial_1_0  NO_LOAD
+#define vload_partial_1_1  vload1
+#define vload_partial_1_2  NO_LOAD
+#define vload_partial_1_3  NO_LOAD
+#define vload_partial_1_4  NO_LOAD
+#define vload_partial_1_5  NO_LOAD
+#define vload_partial_1_6  NO_LOAD
+#define vload_partial_1_7  NO_LOAD
+#define vload_partial_1_8  NO_LOAD
+#define vload_partial_1_9  NO_LOAD
 #define vload_partial_1_10 NO_LOAD
 #define vload_partial_1_11 NO_LOAD
 #define vload_partial_1_12 NO_LOAD
@@ -244,16 +244,16 @@
 #define vload_partial_1_15 NO_LOAD
 #define vload_partial_1_16 NO_LOAD
 // Size == 2
-#define vload_partial_2_0 NO_LOAD
-#define vload_partial_2_1 vload_partial_1
-#define vload_partial_2_2 vload_partial_2
-#define vload_partial_2_3 NO_LOAD
-#define vload_partial_2_4 NO_LOAD
-#define vload_partial_2_5 NO_LOAD
-#define vload_partial_2_6 NO_LOAD
-#define vload_partial_2_7 NO_LOAD
-#define vload_partial_2_8 NO_LOAD
-#define vload_partial_2_9 NO_LOAD
+#define vload_partial_2_0  NO_LOAD
+#define vload_partial_2_1  vload_partial_1
+#define vload_partial_2_2  vload_partial_2
+#define vload_partial_2_3  NO_LOAD
+#define vload_partial_2_4  NO_LOAD
+#define vload_partial_2_5  NO_LOAD
+#define vload_partial_2_6  NO_LOAD
+#define vload_partial_2_7  NO_LOAD
+#define vload_partial_2_8  NO_LOAD
+#define vload_partial_2_9  NO_LOAD
 #define vload_partial_2_10 NO_LOAD
 #define vload_partial_2_11 NO_LOAD
 #define vload_partial_2_12 NO_LOAD
@@ -262,16 +262,16 @@
 #define vload_partial_2_15 NO_LOAD
 #define vload_partial_2_16 NO_LOAD
 // Size == 3
-#define vload_partial_3_0 NO_LOAD
-#define vload_partial_3_1 vload_partial_1
-#define vload_partial_3_2 vload_partial_2
-#define vload_partial_3_3 vload_partial_3
-#define vload_partial_3_4 NO_LOAD
-#define vload_partial_3_5 NO_LOAD
-#define vload_partial_3_6 NO_LOAD
-#define vload_partial_3_7 NO_LOAD
-#define vload_partial_3_8 NO_LOAD
-#define vload_partial_3_9 NO_LOAD
+#define vload_partial_3_0  NO_LOAD
+#define vload_partial_3_1  vload_partial_1
+#define vload_partial_3_2  vload_partial_2
+#define vload_partial_3_3  vload_partial_3
+#define vload_partial_3_4  NO_LOAD
+#define vload_partial_3_5  NO_LOAD
+#define vload_partial_3_6  NO_LOAD
+#define vload_partial_3_7  NO_LOAD
+#define vload_partial_3_8  NO_LOAD
+#define vload_partial_3_9  NO_LOAD
 #define vload_partial_3_10 NO_LOAD
 #define vload_partial_3_11 NO_LOAD
 #define vload_partial_3_12 NO_LOAD
@@ -280,16 +280,16 @@
 #define vload_partial_3_15 NO_LOAD
 #define vload_partial_3_16 NO_LOAD
 // Size == 4
-#define vload_partial_4_0 NO_LOAD
-#define vload_partial_4_1 vload_partial_1
-#define vload_partial_4_2 vload_partial_2
-#define vload_partial_4_3 vload_partial_3
-#define vload_partial_4_4 vload_partial_4
-#define vload_partial_4_5 NO_LOAD
-#define vload_partial_4_6 NO_LOAD
-#define vload_partial_4_7 NO_LOAD
-#define vload_partial_4_8 NO_LOAD
-#define vload_partial_4_9 NO_LOAD
+#define vload_partial_4_0  NO_LOAD
+#define vload_partial_4_1  vload_partial_1
+#define vload_partial_4_2  vload_partial_2
+#define vload_partial_4_3  vload_partial_3
+#define vload_partial_4_4  vload_partial_4
+#define vload_partial_4_5  NO_LOAD
+#define vload_partial_4_6  NO_LOAD
+#define vload_partial_4_7  NO_LOAD
+#define vload_partial_4_8  NO_LOAD
+#define vload_partial_4_9  NO_LOAD
 #define vload_partial_4_10 NO_LOAD
 #define vload_partial_4_11 NO_LOAD
 #define vload_partial_4_12 NO_LOAD
@@ -298,16 +298,16 @@
 #define vload_partial_4_15 NO_LOAD
 #define vload_partial_4_16 NO_LOAD
 // Size == 8
-#define vload_partial_8_0 NO_LOAD
-#define vload_partial_8_1 vload_partial_1
-#define vload_partial_8_2 vload_partial_2
-#define vload_partial_8_3 vload_partial_3
-#define vload_partial_8_4 vload_partial_4
-#define vload_partial_8_5 vload_partial_5
-#define vload_partial_8_6 vload_partial_6
-#define vload_partial_8_7 vload_partial_7
-#define vload_partial_8_8 vload_partial_8
-#define vload_partial_8_9 NO_LOAD
+#define vload_partial_8_0  NO_LOAD
+#define vload_partial_8_1  vload_partial_1
+#define vload_partial_8_2  vload_partial_2
+#define vload_partial_8_3  vload_partial_3
+#define vload_partial_8_4  vload_partial_4
+#define vload_partial_8_5  vload_partial_5
+#define vload_partial_8_6  vload_partial_6
+#define vload_partial_8_7  vload_partial_7
+#define vload_partial_8_8  vload_partial_8
+#define vload_partial_8_9  NO_LOAD
 #define vload_partial_8_10 NO_LOAD
 #define vload_partial_8_11 NO_LOAD
 #define vload_partial_8_12 NO_LOAD
@@ -316,16 +316,16 @@
 #define vload_partial_8_15 NO_LOAD
 #define vload_partial_8_16 NO_LOAD
 // Size == 16
-#define vload_partial_16_0 NO_LOAD
-#define vload_partial_16_1 vload_partial_1
-#define vload_partial_16_2 vload_partial_2
-#define vload_partial_16_3 vload_partial_3
-#define vload_partial_16_4 vload_partial_4
-#define vload_partial_16_5 vload_partial_5
-#define vload_partial_16_6 vload_partial_6
-#define vload_partial_16_7 vload_partial_7
-#define vload_partial_16_8 vload_partial_8
-#define vload_partial_16_9 vload_partial_9
+#define vload_partial_16_0  NO_LOAD
+#define vload_partial_16_1  vload_partial_1
+#define vload_partial_16_2  vload_partial_2
+#define vload_partial_16_3  vload_partial_3
+#define vload_partial_16_4  vload_partial_4
+#define vload_partial_16_5  vload_partial_5
+#define vload_partial_16_6  vload_partial_6
+#define vload_partial_16_7  vload_partial_7
+#define vload_partial_16_8  vload_partial_8
+#define vload_partial_16_9  vload_partial_9
 #define vload_partial_16_10 vload_partial_10
 #define vload_partial_16_11 vload_partial_11
 #define vload_partial_16_12 vload_partial_12
@@ -351,17 +351,13 @@
  * @param[in] PTR    The base pointer
  * @{
  */
-#define vload_partial_1(DATA, OFFSET, PTR) \
-    DATA.s0 = vload1(OFFSET, PTR);
+#define vload_partial_1(DATA, OFFSET, PTR) DATA.s0 = vload1(OFFSET, PTR);
 
-#define vload_partial_2(DATA, OFFSET, PTR) \
-    DATA.s01 = vload2(OFFSET, PTR);
+#define vload_partial_2(DATA, OFFSET, PTR) DATA.s01 = vload2(OFFSET, PTR);
 
-#define vload_partial_3(DATA, OFFSET, PTR) \
-    DATA.s012 = vload3(OFFSET, PTR);
+#define vload_partial_3(DATA, OFFSET, PTR) DATA.s012 = vload3(OFFSET, PTR);
 
-#define vload_partial_4(DATA, OFFSET, PTR) \
-    DATA.s0123 = vload4(OFFSET, PTR);
+#define vload_partial_4(DATA, OFFSET, PTR) DATA.s0123 = vload4(OFFSET, PTR);
 
 #define vload_partial_5(DATA, OFFSET, PTR)    \
     vload_partial_4(DATA.s0123, OFFSET, PTR); \
@@ -375,8 +371,7 @@
     vload_partial_4(DATA.s0123, OFFSET, PTR); \
     vload_partial_3(DATA.s456, OFFSET, PTR + 4);
 
-#define vload_partial_8(DATA, OFFSET, PTR) \
-    DATA.s01234567 = vload8(OFFSET, PTR);
+#define vload_partial_8(DATA, OFFSET, PTR) DATA.s01234567 = vload8(OFFSET, PTR);
 
 #define vload_partial_9(DATA, OFFSET, PTR)        \
     vload_partial_8(DATA.s01234567, OFFSET, PTR); \
@@ -406,13 +401,12 @@
     vload_partial_8(DATA.s01234567, OFFSET, PTR); \
     vload_partial_7(DATA.s89ABCDEF, OFFSET, PTR + 8);
 
-#define vload_partial_16(DATA, OFFSET, PTR) \
-    DATA = vload16(OFFSET, PTR);
+#define vload_partial_16(DATA, OFFSET, PTR) DATA = vload16(OFFSET, PTR);
 /** @} */ // end of group vload_partial_n
 /** @} */ // end of group VLOAD_PARTIAL
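// Editor's note: usage sketch, not part of the patch; names are hypothetical.
// VLOAD_PARTIAL(size, load_size) selects vload_partial_<size>_<load_size>, so
// both arguments must be compile-time literals (in the kernels they typically
// arrive via -D build options); only the lower load_size lanes of DATA are written.
__kernel void vload_partial_example(__global float *in, __global float *out)
{
    float4 data = 0.0f;
    VLOAD_PARTIAL(4, 3)
    (data, 0, in); // expands to data.s012 = vload3(0, in);
    out[get_global_id(0)] = data.s0 + data.s1 + data.s2;
}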
 
-#define PIXEL_UNIT4 1
-#define PIXEL_UNIT8 2
+#define PIXEL_UNIT4  1
+#define PIXEL_UNIT8  2
 #define PIXEL_UNIT16 4
 
 /** Utility macro to convert a vector size into pixel units.
@@ -425,27 +419,45 @@
  * @{
  */
 #define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size) PIXEL_UNIT##vec_size
-#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size) CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
+#define CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(vec_size)     CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT_STR(vec_size)
 /** @} */ // end of group CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT
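// Editor's note: usage sketch, not part of the patch; the N0 define below is
// hypothetical. One float/half pixel of a 2D image holds 4 components, so an
// N0-wide vector spans N0 / 4 pixels; the _STR indirection is what lets the
// argument itself be a macro.
#define N0 8
#define PIXELS_PER_STEP CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(N0) // expands to PIXEL_UNIT8, i.e. 2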
 
 #define read_image2d_floatx1(img, x_coord, y_coord) (float4)(read_imagef(img, (int2)(x_coord, y_coord)));
-#define read_image2d_floatx2(img, x_coord, y_coord) (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
-#define read_image2d_floatx4(img, x_coord, y_coord) (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
+#define read_image2d_floatx2(img, x_coord, y_coord) \
+    (float8)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)));
+#define read_image2d_floatx4(img, x_coord, y_coord)                                                       \
+    (float16)(read_imagef(img, (int2)(x_coord, y_coord)), read_imagef(img, (int2)(x_coord + 1, y_coord)), \
+              read_imagef(img, (int2)(x_coord + 2, y_coord)), read_imagef(img, (int2)(x_coord + 3, y_coord)));
 
 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
 #define read_image2d_halfx1(img, x_coord, y_coord) (half4)(read_imageh(img, (int2)(x_coord, y_coord)));
-#define read_image2d_halfx2(img, x_coord, y_coord) (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
-#define read_image2d_halfx4(img, x_coord, y_coord) (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
+#define read_image2d_halfx2(img, x_coord, y_coord) \
+    (half8)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)));
+#define read_image2d_halfx4(img, x_coord, y_coord)                                                       \
+    (half16)(read_imageh(img, (int2)(x_coord, y_coord)), read_imageh(img, (int2)(x_coord + 1, y_coord)), \
+             read_imageh(img, (int2)(x_coord + 2, y_coord)), read_imageh(img, (int2)(x_coord + 3, y_coord)));
 #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
 
 #define write_image2d_floatx1(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values));
-#define write_image2d_floatx2(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
-#define write_image2d_floatx4(img, x_coord, y_coord, values) (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
+#define write_image2d_floatx2(img, x_coord, y_coord, values)    \
+    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123), \
+     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_floatx4(img, x_coord, y_coord, values)        \
+    (write_imagef(img, (int2)(x_coord, y_coord), values.s0123),     \
+     write_imagef(img, (int2)(x_coord + 1, y_coord), values.s4567), \
+     write_imagef(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
+     write_imagef(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
 
 #if defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
 #define write_image2d_halfx1(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values));
-#define write_image2d_halfx2(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
-#define write_image2d_halfx4(img, x_coord, y_coord, values) (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
+#define write_image2d_halfx2(img, x_coord, y_coord, values)     \
+    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123), \
+     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567));
+#define write_image2d_halfx4(img, x_coord, y_coord, values)         \
+    (write_imageh(img, (int2)(x_coord, y_coord), values.s0123),     \
+     write_imageh(img, (int2)(x_coord + 1, y_coord), values.s4567), \
+     write_imageh(img, (int2)(x_coord + 2, y_coord), values.s89AB), \
+     write_imageh(img, (int2)(x_coord + 3, y_coord), values.sCDEF));
 #endif // defined(ARM_COMPUTE_OPENCL_FP16_ENABLED) && defined(cl_khr_fp16)
 
 /** Utility macro to read a 2D OpenCL image object.
@@ -462,7 +474,7 @@
  * @{
  */
 #define READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord) read_image2d_##data_type##x##n0(img, x_coord, y_coord)
-#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord) READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
+#define READ_IMAGE2D(data_type, n0, img, x_coord, y_coord)     READ_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord)
 /** @} */
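// Editor's note: usage sketch, not part of the patch; the kernel name is
// hypothetical. Note that the read_image2d_* macros already end in a
// semicolon, so the result should be captured in a plain declaration.
__kernel void read_image2d_example(__read_only image2d_t src, __global float8 *dst)
{
    const int x = get_global_id(0) * 2; // x-coordinate in pixel units (2 pixels = 8 floats)
    const int y = get_global_id(1);
    // READ_IMAGE2D(float, 2, ...) dispatches to read_image2d_floatx2, which
    // issues two read_imagef calls and packs the results into a float8.
    float8 v = READ_IMAGE2D(float, 2, src, x, y);
    dst[y * get_global_size(0) + get_global_id(0)] = v;
}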
 
 /** Utility macro to write a 2D OpenCL image object.
@@ -478,26 +490,28 @@
  *
  * @{
  */
-#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
-#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
+#define WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values) \
+    write_image2d_##data_type##x##n0(img, x_coord, y_coord, values)
+#define WRITE_IMAGE2D(data_type, n0, img, x_coord, y_coord, values) \
+    WRITE_IMAGE2D_STR(data_type, n0, img, x_coord, y_coord, values)
 /** @} */
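// Editor's note: usage sketch, not part of the patch; the kernel name is
// hypothetical. The values argument is pasted in front of .s0123/.s4567, so
// pass a plain variable rather than an arbitrary expression.
__kernel void write_image2d_example(__read_only image2d_t src, __write_only image2d_t dst)
{
    const int x = get_global_id(0) * 2;
    const int y = get_global_id(1);
    float8 v = READ_IMAGE2D(float, 2, src, x, y);
    v = v * 2.0f;
    // stores v.s0123 at (x, y) and v.s4567 at (x + 1, y)
    WRITE_IMAGE2D(float, 2, dst, x, y, v);
}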
 
 #define VSTORE_STR(size) vstore##size
-#define VSTORE(size) VSTORE_STR(size)
+#define VSTORE(size)     VSTORE_STR(size)
 
-#define float1 float
-#define half1 half
-#define char1 char
-#define uchar1 uchar
-#define short1 short
+#define float1  float
+#define half1   half
+#define char1   char
+#define uchar1  uchar
+#define short1  short
 #define ushort1 ushort
-#define int1 int
-#define uint1 uint
-#define long1 long
-#define ulong1 ulong
+#define int1    int
+#define uint1   uint
+#define long1   long
+#define ulong1  ulong
 #define double1 double
 
-#define vload1(OFFSET, PTR) *(OFFSET + PTR)
+#define vload1(OFFSET, PTR)        *(OFFSET + PTR)
 #define vstore1(DATA, OFFSET, PTR) *(OFFSET + PTR) = DATA
 
 /** Extended partial vstore that correctly handles scalar values as well.
@@ -516,23 +530,23 @@
  * @{
  */
 #define VSTORE_PARTIAL_STR(size, store_size) vstore_partial_##size##_##store_size
-#define VSTORE_PARTIAL(size, store_size) VSTORE_PARTIAL_STR(size, store_size)
+#define VSTORE_PARTIAL(size, store_size)     VSTORE_PARTIAL_STR(size, store_size)
 
 #define NO_STORE(data, offs, ptr) \
     {                             \
     }
 
 // Size == 1 (scalar)
-#define vstore_partial_1_0 NO_STORE
-#define vstore_partial_1_1 vstore1
-#define vstore_partial_1_2 NO_STORE
-#define vstore_partial_1_3 NO_STORE
-#define vstore_partial_1_4 NO_STORE
-#define vstore_partial_1_5 NO_STORE
-#define vstore_partial_1_6 NO_STORE
-#define vstore_partial_1_7 NO_STORE
-#define vstore_partial_1_8 NO_STORE
-#define vstore_partial_1_9 NO_STORE
+#define vstore_partial_1_0  NO_STORE
+#define vstore_partial_1_1  vstore1
+#define vstore_partial_1_2  NO_STORE
+#define vstore_partial_1_3  NO_STORE
+#define vstore_partial_1_4  NO_STORE
+#define vstore_partial_1_5  NO_STORE
+#define vstore_partial_1_6  NO_STORE
+#define vstore_partial_1_7  NO_STORE
+#define vstore_partial_1_8  NO_STORE
+#define vstore_partial_1_9  NO_STORE
 #define vstore_partial_1_10 NO_STORE
 #define vstore_partial_1_11 NO_STORE
 #define vstore_partial_1_12 NO_STORE
@@ -541,16 +555,16 @@
 #define vstore_partial_1_15 NO_STORE
 #define vstore_partial_1_16 NO_STORE
 // Size == 2
-#define vstore_partial_2_0 NO_STORE
-#define vstore_partial_2_1 vstore_partial_1
-#define vstore_partial_2_2 vstore_partial_2
-#define vstore_partial_2_3 NO_STORE
-#define vstore_partial_2_4 NO_STORE
-#define vstore_partial_2_5 NO_STORE
-#define vstore_partial_2_6 NO_STORE
-#define vstore_partial_2_7 NO_STORE
-#define vstore_partial_2_8 NO_STORE
-#define vstore_partial_2_9 NO_STORE
+#define vstore_partial_2_0  NO_STORE
+#define vstore_partial_2_1  vstore_partial_1
+#define vstore_partial_2_2  vstore_partial_2
+#define vstore_partial_2_3  NO_STORE
+#define vstore_partial_2_4  NO_STORE
+#define vstore_partial_2_5  NO_STORE
+#define vstore_partial_2_6  NO_STORE
+#define vstore_partial_2_7  NO_STORE
+#define vstore_partial_2_8  NO_STORE
+#define vstore_partial_2_9  NO_STORE
 #define vstore_partial_2_10 NO_STORE
 #define vstore_partial_2_11 NO_STORE
 #define vstore_partial_2_12 NO_STORE
@@ -559,16 +573,16 @@
 #define vstore_partial_2_15 NO_STORE
 #define vstore_partial_2_16 NO_STORE
 // Size == 3
-#define vstore_partial_3_0 NO_STORE
-#define vstore_partial_3_1 vstore_partial_1
-#define vstore_partial_3_2 vstore_partial_2
-#define vstore_partial_3_3 vstore_partial_3
-#define vstore_partial_3_4 NO_STORE
-#define vstore_partial_3_5 NO_STORE
-#define vstore_partial_3_6 NO_STORE
-#define vstore_partial_3_7 NO_STORE
-#define vstore_partial_3_8 NO_STORE
-#define vstore_partial_3_9 NO_STORE
+#define vstore_partial_3_0  NO_STORE
+#define vstore_partial_3_1  vstore_partial_1
+#define vstore_partial_3_2  vstore_partial_2
+#define vstore_partial_3_3  vstore_partial_3
+#define vstore_partial_3_4  NO_STORE
+#define vstore_partial_3_5  NO_STORE
+#define vstore_partial_3_6  NO_STORE
+#define vstore_partial_3_7  NO_STORE
+#define vstore_partial_3_8  NO_STORE
+#define vstore_partial_3_9  NO_STORE
 #define vstore_partial_3_10 NO_STORE
 #define vstore_partial_3_11 NO_STORE
 #define vstore_partial_3_12 NO_STORE
@@ -577,16 +591,16 @@
 #define vstore_partial_3_15 NO_STORE
 #define vstore_partial_3_16 NO_STORE
 // Size == 4
-#define vstore_partial_4_0 NO_STORE
-#define vstore_partial_4_1 vstore_partial_1
-#define vstore_partial_4_2 vstore_partial_2
-#define vstore_partial_4_3 vstore_partial_3
-#define vstore_partial_4_4 vstore_partial_4
-#define vstore_partial_4_5 NO_STORE
-#define vstore_partial_4_6 NO_STORE
-#define vstore_partial_4_7 NO_STORE
-#define vstore_partial_4_8 NO_STORE
-#define vstore_partial_4_9 NO_STORE
+#define vstore_partial_4_0  NO_STORE
+#define vstore_partial_4_1  vstore_partial_1
+#define vstore_partial_4_2  vstore_partial_2
+#define vstore_partial_4_3  vstore_partial_3
+#define vstore_partial_4_4  vstore_partial_4
+#define vstore_partial_4_5  NO_STORE
+#define vstore_partial_4_6  NO_STORE
+#define vstore_partial_4_7  NO_STORE
+#define vstore_partial_4_8  NO_STORE
+#define vstore_partial_4_9  NO_STORE
 #define vstore_partial_4_10 NO_STORE
 #define vstore_partial_4_11 NO_STORE
 #define vstore_partial_4_12 NO_STORE
@@ -595,16 +609,16 @@
 #define vstore_partial_4_15 NO_STORE
 #define vstore_partial_4_16 NO_STORE
 // Size == 8
-#define vstore_partial_8_0 NO_STORE
-#define vstore_partial_8_1 vstore_partial_1
-#define vstore_partial_8_2 vstore_partial_2
-#define vstore_partial_8_3 vstore_partial_3
-#define vstore_partial_8_4 vstore_partial_4
-#define vstore_partial_8_5 vstore_partial_5
-#define vstore_partial_8_6 vstore_partial_6
-#define vstore_partial_8_7 vstore_partial_7
-#define vstore_partial_8_8 vstore_partial_8
-#define vstore_partial_8_9 NO_STORE
+#define vstore_partial_8_0  NO_STORE
+#define vstore_partial_8_1  vstore_partial_1
+#define vstore_partial_8_2  vstore_partial_2
+#define vstore_partial_8_3  vstore_partial_3
+#define vstore_partial_8_4  vstore_partial_4
+#define vstore_partial_8_5  vstore_partial_5
+#define vstore_partial_8_6  vstore_partial_6
+#define vstore_partial_8_7  vstore_partial_7
+#define vstore_partial_8_8  vstore_partial_8
+#define vstore_partial_8_9  NO_STORE
 #define vstore_partial_8_10 NO_STORE
 #define vstore_partial_8_11 NO_STORE
 #define vstore_partial_8_12 NO_STORE
@@ -613,16 +627,16 @@
 #define vstore_partial_8_15 NO_STORE
 #define vstore_partial_8_16 NO_STORE
 // Size == 16
-#define vstore_partial_16_0 NO_STORE
-#define vstore_partial_16_1 vstore_partial_1
-#define vstore_partial_16_2 vstore_partial_2
-#define vstore_partial_16_3 vstore_partial_3
-#define vstore_partial_16_4 vstore_partial_4
-#define vstore_partial_16_5 vstore_partial_5
-#define vstore_partial_16_6 vstore_partial_6
-#define vstore_partial_16_7 vstore_partial_7
-#define vstore_partial_16_8 vstore_partial_8
-#define vstore_partial_16_9 vstore_partial_9
+#define vstore_partial_16_0  NO_STORE
+#define vstore_partial_16_1  vstore_partial_1
+#define vstore_partial_16_2  vstore_partial_2
+#define vstore_partial_16_3  vstore_partial_3
+#define vstore_partial_16_4  vstore_partial_4
+#define vstore_partial_16_5  vstore_partial_5
+#define vstore_partial_16_6  vstore_partial_6
+#define vstore_partial_16_7  vstore_partial_7
+#define vstore_partial_16_8  vstore_partial_8
+#define vstore_partial_16_9  vstore_partial_9
 #define vstore_partial_16_10 vstore_partial_10
 #define vstore_partial_16_11 vstore_partial_11
 #define vstore_partial_16_12 vstore_partial_12
@@ -648,17 +662,13 @@
  * @param[in] PTR    The base pointer
  * @{
  */
-#define vstore_partial_1(DATA, OFFSET, PTR) \
-    vstore1(DATA.s0, OFFSET, PTR);
+#define vstore_partial_1(DATA, OFFSET, PTR) vstore1(DATA.s0, OFFSET, PTR);
 
-#define vstore_partial_2(DATA, OFFSET, PTR) \
-    vstore2(DATA.s01, OFFSET, PTR);
+#define vstore_partial_2(DATA, OFFSET, PTR) vstore2(DATA.s01, OFFSET, PTR);
 
-#define vstore_partial_3(DATA, OFFSET, PTR) \
-    vstore3(DATA.s012, OFFSET, PTR);
+#define vstore_partial_3(DATA, OFFSET, PTR) vstore3(DATA.s012, OFFSET, PTR);
 
-#define vstore_partial_4(DATA, OFFSET, PTR) \
-    vstore4(DATA.s0123, OFFSET, PTR);
+#define vstore_partial_4(DATA, OFFSET, PTR) vstore4(DATA.s0123, OFFSET, PTR);
 
 #define vstore_partial_5(DATA, OFFSET, PTR)    \
     vstore_partial_4(DATA.s0123, OFFSET, PTR); \
@@ -672,8 +682,7 @@
     vstore_partial_4(DATA.s0123, OFFSET, PTR); \
     vstore_partial_3(DATA.s456, OFFSET, PTR + 4);
 
-#define vstore_partial_8(DATA, OFFSET, PTR) \
-    vstore8(DATA.s01234567, OFFSET, PTR);
+#define vstore_partial_8(DATA, OFFSET, PTR) vstore8(DATA.s01234567, OFFSET, PTR);
 
 #define vstore_partial_9(DATA, OFFSET, PTR)        \
     vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
@@ -703,186 +712,156 @@
     vstore_partial_8(DATA.s01234567, OFFSET, PTR); \
     vstore_partial_7(DATA.s89abcdef, OFFSET, PTR + 8);
 
-#define vstore_partial_16(DATA, OFFSET, PTR) \
-    vstore16(DATA, OFFSET, PTR);
+#define vstore_partial_16(DATA, OFFSET, PTR) vstore16(DATA, OFFSET, PTR);
 /** @} */ // end of group vstore_partial_n
 /** @} */ // end of group VSTORE_PARTIAL
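// Editor's note: usage sketch, not part of the patch; the kernel name is hypothetical.
__kernel void vstore_partial_example(__global float *dst)
{
    const float4 acc = (float4)(1.0f, 2.0f, 3.0f, 4.0f);
    // VSTORE_PARTIAL(4, 3) selects vstore_partial_4_3 -> vstore_partial_3,
    // storing only acc.s012; both literals must be known at compile time.
    VSTORE_PARTIAL(4, 3)
    (acc, 0, dst); // expands to vstore3(acc.s012, 0, dst);
}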
 
 // The convert_* built-in functions do not support the _sat modifier for floating-point
 // destinations, so we alias the _sat names to the plain conversions to overcome this issue
-#define convert_float_sat convert_float
-#define convert_float1_sat convert_float
-#define convert_float2_sat convert_float2
-#define convert_float3_sat convert_float3
-#define convert_float4_sat convert_float4
-#define convert_float8_sat convert_float8
+#define convert_float_sat   convert_float
+#define convert_float1_sat  convert_float
+#define convert_float2_sat  convert_float2
+#define convert_float3_sat  convert_float3
+#define convert_float4_sat  convert_float4
+#define convert_float8_sat  convert_float8
 #define convert_float16_sat convert_float16
-#define convert_half_sat convert_float
-#define convert_half1_sat convert_half
-#define convert_half2_sat convert_half2
-#define convert_half3_sat convert_half3
-#define convert_half4_sat convert_half4
-#define convert_half8_sat convert_half8
-#define convert_half16_sat convert_half16
+#define convert_half_sat    convert_float
+#define convert_half1_sat   convert_half
+#define convert_half2_sat   convert_half2
+#define convert_half3_sat   convert_half3
+#define convert_half4_sat   convert_half4
+#define convert_half8_sat   convert_half8
+#define convert_half16_sat  convert_half16
 
-#define convert_float1 convert_float
-#define convert_half1 convert_half
-#define convert_char1 convert_char
-#define convert_uchar1 convert_uchar
-#define convert_short1 convert_short
+#define convert_float1  convert_float
+#define convert_half1   convert_half
+#define convert_char1   convert_char
+#define convert_uchar1  convert_uchar
+#define convert_short1  convert_short
 #define convert_ushort1 convert_ushort
-#define convert_int1 convert_int
-#define convert_uint1 convert_uint
-#define convert_long1 convert_long
-#define convert_ulong1 convert_ulong
+#define convert_int1    convert_int
+#define convert_uint1   convert_uint
+#define convert_long1   convert_long
+#define convert_ulong1  convert_ulong
 #define convert_double1 convert_double
 
-#define convert_char1_sat convert_char_sat
-#define convert_uchar1_sat convert_uchar_sat
-#define convert_uchar2_sat convert_uchar2_sat
-#define convert_uchar3_sat convert_uchar3_sat
-#define convert_uchar4_sat convert_uchar4_sat
-#define convert_uchar8_sat convert_uchar8_sat
+#define convert_char1_sat   convert_char_sat
+#define convert_uchar1_sat  convert_uchar_sat
+#define convert_uchar2_sat  convert_uchar2_sat
+#define convert_uchar3_sat  convert_uchar3_sat
+#define convert_uchar4_sat  convert_uchar4_sat
+#define convert_uchar8_sat  convert_uchar8_sat
 #define convert_uchar16_sat convert_uchar16_sat
-#define convert_short1_sat convert_short_sat
+#define convert_short1_sat  convert_short_sat
 #define convert_ushort1_sat convert_ushort_sat
-#define convert_int1_sat convert_int_sat
-#define convert_uint1_sat convert_uint_sat
-#define convert_long1_sat convert_long_sat
-#define convert_ulong1_sat convert_ulong_sat
+#define convert_int1_sat    convert_int_sat
+#define convert_uint1_sat   convert_uint_sat
+#define convert_long1_sat   convert_long_sat
+#define convert_ulong1_sat  convert_ulong_sat
 #define convert_double1_sat convert_double_sat
 
 #define VEC_DATA_TYPE_STR(type, size) type##size
-#define VEC_DATA_TYPE(type, size) VEC_DATA_TYPE_STR(type, size)
+#define VEC_DATA_TYPE(type, size)     VEC_DATA_TYPE_STR(type, size)
 
 #define CONVERT_STR(x, type) (convert_##type((x)))
-#define CONVERT(x, type) CONVERT_STR(x, type)
+#define CONVERT(x, type)     CONVERT_STR(x, type)
 
 #define CONVERT_SAT_STR(x, type) (convert_##type##_sat((x)))
-#define CONVERT_SAT(x, type) CONVERT_SAT_STR(x, type)
+#define CONVERT_SAT(x, type)     CONVERT_SAT_STR(x, type)
 
 #define CONVERT_SAT_ROUND_STR(x, type, round) (convert_##type##_sat_##round((x)))
-#define CONVERT_SAT_ROUND(x, type, round) CONVERT_SAT_ROUND_STR(x, type, round)
+#define CONVERT_SAT_ROUND(x, type, round)     CONVERT_SAT_ROUND_STR(x, type, round)
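// Editor's note: usage sketch, not part of the patch; the helper function and
// the TYPE/SIZE defines are hypothetical. The *_STR indirection is what allows
// both macro arguments to be macros themselves. (The pre-existing
// convert_half_sat -> convert_float mapping above is carried over unchanged by
// this format-only commit.)
#define TYPE uchar
#define SIZE 4
inline VEC_DATA_TYPE(TYPE, SIZE) saturate_example(VEC_DATA_TYPE(int, SIZE) x)
{
    // expands to (convert_uchar4_sat((x))): lanes are clamped to [0, 255]
    return CONVERT_SAT(x, VEC_DATA_TYPE(TYPE, SIZE));
}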
 
-#define select_vec_dt_uchar(size) uchar##size
-#define select_vec_dt_char(size) char##size
+#define select_vec_dt_uchar(size)  uchar##size
+#define select_vec_dt_char(size)   char##size
 #define select_vec_dt_ushort(size) ushort##size
-#define select_vec_dt_short(size) short##size
-#define select_vec_dt_half(size) short##size
-#define select_vec_dt_uint(size) uint##size
-#define select_vec_dt_int(size) int##size
-#define select_vec_dt_float(size) int##size
-#define select_vec_dt_ulong(size) ulong##size
-#define select_vec_dt_long(size) long##size
+#define select_vec_dt_short(size)  short##size
+#define select_vec_dt_half(size)   short##size
+#define select_vec_dt_uint(size)   uint##size
+#define select_vec_dt_int(size)    int##size
+#define select_vec_dt_float(size)  int##size
+#define select_vec_dt_ulong(size)  ulong##size
+#define select_vec_dt_long(size)   long##size
 
 #define SELECT_VEC_DATA_TYPE_STR(type, size) select_vec_dt_##type(size)
-#define SELECT_VEC_DATA_TYPE(type, size) SELECT_VEC_DATA_TYPE_STR(type, size)
-#define SELECT_DATA_TYPE(type) SELECT_VEC_DATA_TYPE_STR(type, 1)
+#define SELECT_VEC_DATA_TYPE(type, size)     SELECT_VEC_DATA_TYPE_STR(type, size)
+#define SELECT_DATA_TYPE(type)               SELECT_VEC_DATA_TYPE_STR(type, 1)
 
-#define signed_int_vec_dt_uchar(size) char##size
-#define signed_int_vec_dt_char(size) char##size
+#define signed_int_vec_dt_uchar(size)  char##size
+#define signed_int_vec_dt_char(size)   char##size
 #define signed_int_vec_dt_ushort(size) short##size
-#define signed_int_vec_dt_short(size) short##size
-#define signed_int_vec_dt_half(size) short##size
-#define signed_int_vec_dt_uint(size) int##size
-#define signed_int_vec_dt_int(size) int##size
-#define signed_int_vec_dt_float(size) int##size
-#define signed_int_vec_dt_ulong(size) long##size
-#define signed_int_vec_dt_long(size) long##size
+#define signed_int_vec_dt_short(size)  short##size
+#define signed_int_vec_dt_half(size)   short##size
+#define signed_int_vec_dt_uint(size)   int##size
+#define signed_int_vec_dt_int(size)    int##size
+#define signed_int_vec_dt_float(size)  int##size
+#define signed_int_vec_dt_ulong(size)  long##size
+#define signed_int_vec_dt_long(size)   long##size
 
 #define SIGNED_INT_VEC_DATA_TYPE_STR(type, size) signed_int_vec_dt_##type(size)
-#define SIGNED_INT_VEC_DATA_TYPE(type, size) SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
-#define SIGNED_INT_DATA_TYPE(type) SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
+#define SIGNED_INT_VEC_DATA_TYPE(type, size)     SIGNED_INT_VEC_DATA_TYPE_STR(type, size)
+#define SIGNED_INT_DATA_TYPE(type)               SIGNED_INT_VEC_DATA_TYPE_STR(type, 1)
 
-#define sum_reduce_1(x) (x)
-#define sum_reduce_2(x) ((x).s0) + ((x).s1)
-#define sum_reduce_3(x) sum_reduce_2((x).s01) + ((x).s2)
-#define sum_reduce_4(x) sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
-#define sum_reduce_8(x) sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
+#define sum_reduce_1(x)  (x)
+#define sum_reduce_2(x)  ((x).s0) + ((x).s1)
+#define sum_reduce_3(x)  sum_reduce_2((x).s01) + ((x).s2)
+#define sum_reduce_4(x)  sum_reduce_2((x).s01) + sum_reduce_2((x).s23)
+#define sum_reduce_8(x)  sum_reduce_4((x).s0123) + sum_reduce_4((x).s4567)
 #define sum_reduce_16(x) sum_reduce_8((x).s01234567) + sum_reduce_8((x).s89ABCDEF)
 
 #define SUM_REDUCE_STR(x, size) sum_reduce_##size(x)
-#define SUM_REDUCE(x, size) SUM_REDUCE_STR(x, size)
+#define SUM_REDUCE(x, size)     SUM_REDUCE_STR(x, size)
 
-#define prod_reduce_1(x) (x)
-#define prod_reduce_2(x) ((x).s0) * ((x).s1)
-#define prod_reduce_3(x) prod_reduce_2((x).s01) * ((x).s2)
-#define prod_reduce_4(x) prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
-#define prod_reduce_8(x) prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
+#define prod_reduce_1(x)  (x)
+#define prod_reduce_2(x)  ((x).s0) * ((x).s1)
+#define prod_reduce_3(x)  prod_reduce_2((x).s01) * ((x).s2)
+#define prod_reduce_4(x)  prod_reduce_2((x).s01) * prod_reduce_2((x).s23)
+#define prod_reduce_8(x)  prod_reduce_4((x).s0123) * prod_reduce_4((x).s4567)
 #define prod_reduce_16(x) prod_reduce_8((x).s01234567) * prod_reduce_8((x).s89ABCDEF)
 
 #define PROD_REDUCE_STR(x, size) prod_reduce_##size(x)
-#define PROD_REDUCE(x, size) PROD_REDUCE_STR(x, size)
+#define PROD_REDUCE(x, size)     PROD_REDUCE_STR(x, size)
 
-#define max_reduce_1(x) (x)
-#define max_reduce_2(x) max(((x).s0), ((x).s1))
-#define max_reduce_3(x) max(max_reduce_2((x).s01), ((x).s2))
-#define max_reduce_4(x) max(max_reduce_2((x).s01), max_reduce_2((x).s23))
-#define max_reduce_8(x) max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
+#define max_reduce_1(x)  (x)
+#define max_reduce_2(x)  max(((x).s0), ((x).s1))
+#define max_reduce_3(x)  max(max_reduce_2((x).s01), ((x).s2))
+#define max_reduce_4(x)  max(max_reduce_2((x).s01), max_reduce_2((x).s23))
+#define max_reduce_8(x)  max(max_reduce_4((x).s0123), max_reduce_4((x).s4567))
 #define max_reduce_16(x) max(max_reduce_8((x).s01234567), max_reduce_8((x).s89ABCDEF))
 
 #define MAX_REDUCE_STR(x, size) max_reduce_##size(x)
-#define MAX_REDUCE(x, size) MAX_REDUCE_STR(x, size)
+#define MAX_REDUCE(x, size)     MAX_REDUCE_STR(x, size)
 
-#define min_reduce_1(x) (x)
-#define min_reduce_2(x) min(((x).s0), ((x).s1))
-#define min_reduce_3(x) min(min_reduce_2((x).s01), ((x).s2))
-#define min_reduce_4(x) min(min_reduce_2((x).s01), min_reduce_2((x).s23))
-#define min_reduce_8(x) min(min_reduce_4((x).s0123), min_reduce_4((x).s4567))
+#define min_reduce_1(x)  (x)
+#define min_reduce_2(x)  min(((x).s0), ((x).s1))
+#define min_reduce_3(x)  min(min_reduce_2((x).s01), ((x).s2))
+#define min_reduce_4(x)  min(min_reduce_2((x).s01), min_reduce_2((x).s23))
+#define min_reduce_8(x)  min(min_reduce_4((x).s0123), min_reduce_4((x).s4567))
 #define min_reduce_16(x) min(min_reduce_8((x).s01234567), min_reduce_8((x).s89ABCDEF))
 
 #define MIN_REDUCE_STR(x, size) min_reduce_##size(x)
-#define MIN_REDUCE(x, size) MIN_REDUCE_STR(x, size)
+#define MIN_REDUCE(x, size)     MIN_REDUCE_STR(x, size)
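// Editor's note: usage sketch, not part of the patch; the helper is hypothetical.
inline float sum_and_max_example(float8 v, float *max_out)
{
    // SUM_REDUCE(v, 8) expands to sum_reduce_8(v), a tree of pairwise adds on
    // sub-vectors; MAX_REDUCE, MIN_REDUCE and PROD_REDUCE follow the same pattern.
    *max_out = MAX_REDUCE(v, 8);
    return SUM_REDUCE(v, 8);
}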
 
-#define VECTOR_DECLARATION(name)     \
-    __global uchar *name##_ptr,      \
-    uint        name##_stride_x, \
-    uint        name##_step_x,   \
-    uint        name##_offset_first_element_in_bytes
+#define VECTOR_DECLARATION(name) \
+    __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_offset_first_element_in_bytes
 
-#define IMAGE_DECLARATION(name)      \
-    __global uchar *name##_ptr,      \
-    uint        name##_stride_x, \
-    uint        name##_step_x,   \
-    uint        name##_stride_y, \
-    uint        name##_step_y,   \
-    uint        name##_offset_first_element_in_bytes
+#define IMAGE_DECLARATION(name)                                                                                     \
+    __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+        uint name##_offset_first_element_in_bytes
 
-#define TENSOR3D_DECLARATION(name)   \
-    __global uchar *name##_ptr,      \
-    uint        name##_stride_x, \
-    uint        name##_step_x,   \
-    uint        name##_stride_y, \
-    uint        name##_step_y,   \
-    uint        name##_stride_z, \
-    uint        name##_step_z,   \
-    uint        name##_offset_first_element_in_bytes
+#define TENSOR3D_DECLARATION(name)                                                                                  \
+    __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+        uint name##_stride_z, uint name##_step_z, uint name##_offset_first_element_in_bytes
 
-#define TENSOR4D_DECLARATION(name)   \
-    __global uchar *name##_ptr,      \
-    uint        name##_stride_x, \
-    uint        name##_step_x,   \
-    uint        name##_stride_y, \
-    uint        name##_step_y,   \
-    uint        name##_stride_z, \
-    uint        name##_step_z,   \
-    uint        name##_stride_w, \
-    uint        name##_step_w,   \
-    uint        name##_offset_first_element_in_bytes
+#define TENSOR4D_DECLARATION(name)                                                                                  \
+    __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+        uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w,                         \
+        uint name##_offset_first_element_in_bytes
 
-#define TENSOR5D_DECLARATION(name)   \
-    __global uchar *name##_ptr,      \
-    uint        name##_stride_x, \
-    uint        name##_step_x,   \
-    uint        name##_stride_y, \
-    uint        name##_step_y,   \
-    uint        name##_stride_z, \
-    uint        name##_step_z,   \
-    uint        name##_stride_w, \
-    uint        name##_step_w,   \
-    uint        name##_stride_v, \
-    uint        name##_step_v,   \
-    uint        name##_offset_first_element_in_bytes
+#define TENSOR5D_DECLARATION(name)                                                                                  \
+    __global uchar *name##_ptr, uint name##_stride_x, uint name##_step_x, uint name##_stride_y, uint name##_step_y, \
+        uint name##_stride_z, uint name##_step_z, uint name##_stride_w, uint name##_step_w, uint name##_stride_v,   \
+        uint name##_step_v, uint name##_offset_first_element_in_bytes
 
 #define CONVERT_TO_VECTOR_STRUCT(name) \
     update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x)
@@ -890,38 +869,47 @@
 #define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name) \
     update_vector_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0)
 
-#define CONVERT_TO_IMAGE_STRUCT(name) \
-    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y)
+#define CONVERT_TO_IMAGE_STRUCT(name)                                                                           \
+    update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+                              name##_stride_y, name##_step_y)
 
 #define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name) \
     update_image_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0)
 
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
-    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)                                                                 \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                                            name##_step_x, name##_stride_y, name##_step_y, name##_stride_z,    \
+                                            name##_step_z)
 
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name) \
-    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT_NO_STEP(name)                                                            \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+                                            name##_stride_y, 0, name##_stride_z, name##_step_z)
 
-#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name) \
-    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
+#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)                                                                 \
+    update_image_from_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, \
+                                            name##_step_x, name##_stride_y, name##_step_y, name##_stride_z,    \
+                                            name##_step_z)
 
-#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                                                           \
-    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
-                                 name##_stride_z, name##_step_z)
+#define CONVERT_TO_TENSOR3D_STRUCT(name)                                                                           \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+                                 name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
 
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name) \
-    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0)
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)                                                       \
+    update_tensor3D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+                                 name##_stride_y, 0, name##_stride_z, 0)
 
-#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                                                 \
-    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
-                                 name##_stride_z, name##_step_z, name##_stride_w, name##_step_w, mod_size)
+#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)                                                                 \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+                                 name##_stride_y, name##_step_y, name##_stride_z, name##_step_z, name##_stride_w,  \
+                                 name##_step_w, mod_size)
 
-#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size) \
-    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
+#define CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(name, mod_size)                                             \
+    update_tensor4D_workitem_ptr(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, 0, \
+                                 name##_stride_y, 0, name##_stride_z, 0, name##_stride_w, 0, mod_size)
 
-#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                                                       \
-    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, name##_stride_y, name##_step_y, \
-                           name##_stride_z, name##_step_z)
+#define CONVERT_TO_TENSOR3D_STRUCT_NO_UPDATE_PTR(name)                                                       \
+    tensor3D_ptr_no_update(name##_ptr, name##_offset_first_element_in_bytes, name##_stride_x, name##_step_x, \
+                           name##_stride_y, name##_step_y, name##_stride_z, name##_step_z)
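// Editor's note: usage sketch, not part of the patch; the kernel name is
// hypothetical. The *_DECLARATION macros expand to the full runtime argument
// list (pointer, strides, steps, first-element offset) for a named tensor, and
// the matching CONVERT_TO_*_STRUCT macro builds the struct and advances its
// pointer to this work-item's element.
__kernel void tensor3d_copy_example(TENSOR3D_DECLARATION(src), TENSOR3D_DECLARATION(dst))
{
    Tensor3D src_t = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst_t = CONVERT_TO_TENSOR3D_STRUCT(dst);
    *((__global float *)dst_t.ptr) = *((__global float *)src_t.ptr);
}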
 
 /** Structure to hold Vector information */
 typedef struct Vector
@@ -970,10 +958,10 @@
  *
  * @return A vector object
  */
-inline Vector update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
+inline Vector
+update_vector_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x)
 {
-    Vector vector =
-    {
+    Vector vector = {
         .ptr                           = ptr,
         .offset_first_element_in_bytes = offset_first_element_in_bytes,
         .stride_x                      = stride_x,
@@ -993,15 +981,13 @@
  *
  * @return An image object
  */
-inline Image update_image_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
+inline Image update_image_workitem_ptr(
+    __global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y)
 {
-    Image img =
-    {
-        .ptr                           = ptr,
-        .offset_first_element_in_bytes = offset_first_element_in_bytes,
-        .stride_x                      = stride_x,
-        .stride_y                      = stride_y
-    };
+    Image img = {.ptr                           = ptr,
+                 .offset_first_element_in_bytes = offset_first_element_in_bytes,
+                 .stride_x                      = stride_x,
+                 .stride_y                      = stride_y};
     img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y;
     return img;
 }
@@ -1019,16 +1005,21 @@
  *
  * @return An image object
  */
-inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Image update_image_from_tensor3D_workitem_ptr(__global uchar *ptr,
+                                                     uint            offset_first_element_in_bytes,
+                                                     uint            stride_x,
+                                                     uint            step_x,
+                                                     uint            stride_y,
+                                                     uint            step_y,
+                                                     uint            stride_z,
+                                                     uint            step_z)
 {
-    Image img =
-    {
-        .ptr                           = ptr,
-        .offset_first_element_in_bytes = offset_first_element_in_bytes,
-        .stride_x                      = stride_x,
-        .stride_y                      = stride_y
-    };
-    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    Image img = {.ptr                           = ptr,
+                 .offset_first_element_in_bytes = offset_first_element_in_bytes,
+                 .stride_x                      = stride_x,
+                 .stride_y                      = stride_y};
+    img.ptr += img.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+               get_global_id(2) * step_z;
     return img;
 }
 
@@ -1045,17 +1036,22 @@
  *
  * @return A 3D tensor object
  */
-inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Tensor3D update_tensor3D_workitem_ptr(__global uchar *ptr,
+                                             uint            offset_first_element_in_bytes,
+                                             uint            stride_x,
+                                             uint            step_x,
+                                             uint            stride_y,
+                                             uint            step_y,
+                                             uint            stride_z,
+                                             uint            step_z)
 {
-    Tensor3D tensor =
-    {
-        .ptr                           = ptr,
-        .offset_first_element_in_bytes = offset_first_element_in_bytes,
-        .stride_x                      = stride_x,
-        .stride_y                      = stride_y,
-        .stride_z                      = stride_z
-    };
-    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + get_global_id(2) * step_z;
+    Tensor3D tensor = {.ptr                           = ptr,
+                       .offset_first_element_in_bytes = offset_first_element_in_bytes,
+                       .stride_x                      = stride_x,
+                       .stride_y                      = stride_y,
+                       .stride_z                      = stride_z};
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+                  get_global_id(2) * step_z;
     return tensor;
 }
 
@@ -1072,34 +1068,44 @@
  *
  * @return A 3D tensor object
  */
-inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z)
+inline Tensor3D tensor3D_ptr_no_update(__global uchar *ptr,
+                                       uint            offset_first_element_in_bytes,
+                                       uint            stride_x,
+                                       uint            step_x,
+                                       uint            stride_y,
+                                       uint            step_y,
+                                       uint            stride_z,
+                                       uint            step_z)
 {
-    Tensor3D tensor =
-    {
-        .ptr                           = ptr,
-        .offset_first_element_in_bytes = offset_first_element_in_bytes,
-        .stride_x                      = stride_x,
-        .stride_y                      = stride_y,
-        .stride_z                      = stride_z
-    };
+    Tensor3D tensor = {.ptr                           = ptr,
+                       .offset_first_element_in_bytes = offset_first_element_in_bytes,
+                       .stride_x                      = stride_x,
+                       .stride_y                      = stride_y,
+                       .stride_z                      = stride_z};
     return tensor;
 }
 
-inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr, uint offset_first_element_in_bytes, uint stride_x, uint step_x, uint stride_y, uint step_y, uint stride_z, uint step_z, uint stride_w,
-                                             uint step_w,
-                                             uint mod_size)
+inline Tensor4D update_tensor4D_workitem_ptr(__global uchar *ptr,
+                                             uint            offset_first_element_in_bytes,
+                                             uint            stride_x,
+                                             uint            step_x,
+                                             uint            stride_y,
+                                             uint            step_y,
+                                             uint            stride_z,
+                                             uint            step_z,
+                                             uint            stride_w,
+                                             uint            step_w,
+                                             uint            mod_size)
 {
-    Tensor4D tensor =
-    {
-        .ptr                           = ptr,
-        .offset_first_element_in_bytes = offset_first_element_in_bytes,
-        .stride_x                      = stride_x,
-        .stride_y                      = stride_y,
-        .stride_z                      = stride_z,
-        .stride_w                      = stride_w
-    };
+    Tensor4D tensor = {.ptr                           = ptr,
+                       .offset_first_element_in_bytes = offset_first_element_in_bytes,
+                       .stride_x                      = stride_x,
+                       .stride_y                      = stride_y,
+                       .stride_z                      = stride_z,
+                       .stride_w                      = stride_w};
 
-    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y + (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
+    tensor.ptr += tensor.offset_first_element_in_bytes + get_global_id(0) * step_x + get_global_id(1) * step_y +
+                  (get_global_id(2) % mod_size) * step_z + (get_global_id(2) / mod_size) * step_w;
     return tensor;
 }
 
@@ -1171,7 +1177,8 @@
 
     const uint x = index;
 
-    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z + tensor->offset_first_element_in_bytes;
+    return tensor->ptr + x * tensor->stride_x + y * tensor->stride_y + z * tensor->stride_z +
+           tensor->offset_first_element_in_bytes;
 }
 
 #endif // _HELPER_H
diff --git a/src/core/CL/cl_kernels/helpers_asymm.h b/src/core/CL/cl_kernels/helpers_asymm.h
index 562c5d3..166260a 100644
--- a/src/core/CL/cl_kernels/helpers_asymm.h
+++ b/src/core/CL/cl_kernels/helpers_asymm.h
@@ -34,7 +34,7 @@
  * @return The converted vector
  */
 #define CONVERT_DOWN_RTE_STR(x, type) (convert_##type##_rte((x)))
-#define CONVERT_DOWN_RTE(x, type) CONVERT_DOWN_RTE_STR(x, type)
+#define CONVERT_DOWN_RTE(x, type)     CONVERT_DOWN_RTE_STR(x, type)
 
 /** Quantize a floating-point scalar value to 8-bit asymmetric
  *
@@ -84,14 +84,15 @@
  *
  * @return quantized values
  */
-#define QUANTIZE_IMPL(type, size)                                                                                       \
-    inline VEC_DATA_TYPE(type, size) quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale) \
-    {                                                                                                                   \
-        VEC_DATA_TYPE(float, size)                                                                                      \
-        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);                   \
-        VEC_DATA_TYPE(type, size)                                                                                       \
-        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size));              \
-        return res;                                                                                                     \
+#define QUANTIZE_IMPL(type, size)                                                                          \
+    inline VEC_DATA_TYPE(type, size)                                                                       \
+        quantize_##type##size(VEC_DATA_TYPE(float, size) input, float offset, float scale)                 \
+    {                                                                                                      \
+        VEC_DATA_TYPE(float, size)                                                                         \
+        out_f32 = input / (VEC_DATA_TYPE(float, size))(scale) + (VEC_DATA_TYPE(float, size))(offset);      \
+        VEC_DATA_TYPE(type, size)                                                                          \
+        res = CONVERT_SAT(CONVERT_DOWN_RTE(out_f32, VEC_DATA_TYPE(int, size)), VEC_DATA_TYPE(type, size)); \
+        return res;                                                                                        \
     }
 
 /** Dequantize a vector of values to floating-point
@@ -101,10 +102,11 @@
  *
  * @return dequantized values in floating point
  */
-#define DEQUANTIZE_IMPL(type, size)                                                                                       \
-    inline VEC_DATA_TYPE(float, size) dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
-    {                                                                                                                     \
-        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;                                             \
+#define DEQUANTIZE_IMPL(type, size)                                                         \
+    inline VEC_DATA_TYPE(float, size)                                                       \
+        dequantize_##type##size(VEC_DATA_TYPE(type, size) input, float offset, float scale) \
+    {                                                                                       \
+        return (CONVERT(input, VEC_DATA_TYPE(float, size)) - offset) * scale;               \
     }
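// Editor's note: usage sketch, not part of the patch; roundtrip_example is
// hypothetical. QUANTIZE_IMPL / DEQUANTIZE_IMPL are function generators:
// instantiating them at file scope defines quantize_<type><size> and
// dequantize_<type><size>.
QUANTIZE_IMPL(uchar, 4)   // defines inline uchar4 quantize_uchar4(float4, float, float)
DEQUANTIZE_IMPL(uchar, 4) // defines inline float4 dequantize_uchar4(uchar4, float, float)

inline float4 roundtrip_example(float4 x, float offset, float scale)
{
    const uchar4 q = quantize_uchar4(x, offset, scale); // x / scale + offset, saturated
    return dequantize_uchar4(q, offset, scale);         // (q - offset) * scale
}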
 
 /** Correctly-rounded-to-nearest division by a power-of-two.
@@ -113,18 +115,17 @@
  *
  * @return Correctly-rounded-to-nearest division by a power-of-two.
  */
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                                                        \
-    inline VEC_DATA_TYPE(int, size) asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent) \
-    {                                                                                                                                   \
-        const VEC_DATA_TYPE(int, size)                                                                                                  \
-        zero = (VEC_DATA_TYPE(int, size))0;                                                                                         \
-        const VEC_DATA_TYPE(int, size)                                                                                                  \
-        one = (VEC_DATA_TYPE(int, size))1;                                                                                          \
-        VEC_DATA_TYPE(int, size)                                                                                                        \
-        mask = (one << exponent) - one;                                                                                                 \
-        VEC_DATA_TYPE(int, size)                                                                                                        \
-        threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0));                                          \
-        return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold));                          \
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2_IMPL(size)                                                               \
+    inline VEC_DATA_TYPE(int, size)                                                                            \
+        asymm_rounding_divide_by_POW2_##size(VEC_DATA_TYPE(int, size) x, VEC_DATA_TYPE(int, size) exponent)    \
+    {                                                                                                          \
+        const VEC_DATA_TYPE(int, size) zero = (VEC_DATA_TYPE(int, size))0;                                     \
+        const VEC_DATA_TYPE(int, size) one  = (VEC_DATA_TYPE(int, size))1;                                     \
+        VEC_DATA_TYPE(int, size)                                                                               \
+        mask = (one << exponent) - one;                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                               \
+        threshold = (mask >> 1) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))(x < 0));                 \
+        return (x >> exponent) + select(zero, one, (SELECT_VEC_DATA_TYPE(int, size))((x & mask) > threshold)); \
     }
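
For intuition about the rounding, a hand-worked example (assuming the int4 variant is instantiated; expected lanes follow from the definition above):

    int4 x = (int4)(5, -5, 7, -7);
    int4 e = (int4)(1, 1, 2, 2);
    int4 r = ASYMM_ROUNDING_DIVIDE_BY_POW2(x, e, 4); // (3, -3, 2, -2): exact halves round away from zero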
 
 /** Product of two numbers, interpreting them as fixed-point values in the interval [-1, 1),
@@ -167,27 +168,29 @@
  *
  * @return Result in fixed-point format Q0.
  */
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                                                    \
-    inline VEC_DATA_TYPE(int, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a) \
-    {                                                                                                                               \
-        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                                              \
-        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                                               \
-        const int k_fractional_bits = 31;                                                                                           \
-        VEC_DATA_TYPE(int, size)                                                                                                    \
-        x = a + (1 << (k_fractional_bits - 3));                                                                                     \
-        VEC_DATA_TYPE(int, size)                                                                                                    \
-        x2 = ASYMM_MULT(x, x, size);                                                                                                \
-        VEC_DATA_TYPE(int, size)                                                                                                    \
-        x3 = ASYMM_MULT(x2, x, size);                                                                                               \
-        VEC_DATA_TYPE(int, size)                                                                                                    \
-        x4 = ASYMM_MULT(x2, x2, size);                                                                                              \
-        VEC_DATA_TYPE(int, size)                                                                                                    \
-        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                                                     \
-        VEC_DATA_TYPE(int, size)                                                                                                    \
-        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;                             \
-        VEC_DATA_TYPE(int, size)                                                                                                    \
-        x4_over_24_plus_x3_over_6_plus_x2_over_2 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);       \
-        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size);                       \
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL_IMPL(size)                              \
+    inline VEC_DATA_TYPE(int, size)                                                                           \
+        asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(VEC_DATA_TYPE(int, size) a)       \
+    {                                                                                                         \
+        const VEC_DATA_TYPE(int, size) constant_term     = 1895147668;                                        \
+        const VEC_DATA_TYPE(int, size) constant_1_over_3 = 715827883;                                         \
+        const int k_fractional_bits                      = 31;                                                \
+        VEC_DATA_TYPE(int, size)                                                                              \
+        x = a + (1 << (k_fractional_bits - 3));                                                               \
+        VEC_DATA_TYPE(int, size)                                                                              \
+        x2 = ASYMM_MULT(x, x, size);                                                                          \
+        VEC_DATA_TYPE(int, size)                                                                              \
+        x3 = ASYMM_MULT(x2, x, size);                                                                         \
+        VEC_DATA_TYPE(int, size)                                                                              \
+        x4 = ASYMM_MULT(x2, x2, size);                                                                        \
+        VEC_DATA_TYPE(int, size)                                                                              \
+        x4_over_4 = ASYMM_ROUNDING_DIVIDE_BY_POW2(x4, 2, size);                                               \
+        VEC_DATA_TYPE(int, size)                                                                              \
+        x4_over_24_plus_x3_over_6_plus_x2 = ASYMM_MULT((x4_over_4 + x3), constant_1_over_3, size) + x2;       \
+        VEC_DATA_TYPE(int, size)                                                                              \
+        x4_over_24_plus_x3_over_6_plus_x2_over_2 =                                                            \
+            ASYMM_ROUNDING_DIVIDE_BY_POW2(x4_over_24_plus_x3_over_6_plus_x2, 1, size);                        \
+        return constant_term + ASYMM_MULT(constant_term, x + x4_over_24_plus_x3_over_6_plus_x2_over_2, size); \
     }
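
For reference, the magic numbers are Q0.31 encodings: 1895147668 ~= exp(-1/8) * 2^31 and 715827883 ~= 2^31 / 3. With x = a + 1/8 recentring the interval on [-1/8, 1/8), the body evaluates the fourth-order Taylor product exp(-1/8) * (1 + x + x^2/2 + x^3/6 + x^4/24).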
 
 /** Each bit of the result is set to the corresponding bit of either then_val or
@@ -198,10 +201,11 @@
  *
  * @returns Result containing bits from @p then_val or from @p else_val depending on whether the corresponding bit in @p if_mask is set or not.
  */
-#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                                                                                \
-    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
-    {                                                                                                                                                                     \
-        return (if_mask & then_val) ^ (~if_mask & else_val);                                                                                                              \
+#define ASYMM_SELECT_USING_MASK_IMPL(size)                                                                      \
+    inline VEC_DATA_TYPE(int, size) asymm_select_using_mask##size(                                              \
+        VEC_DATA_TYPE(int, size) if_mask, VEC_DATA_TYPE(int, size) then_val, VEC_DATA_TYPE(int, size) else_val) \
+    {                                                                                                           \
+        return (if_mask & then_val) ^ (~if_mask & else_val);                                                    \
     }
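
Because the mask produced by ASYMM_MASK_IF_ZERO/ASYMM_MASK_IF_NON_ZERO is all-ones or all-zeros per lane, the bitwise blend acts as a per-lane ternary. A hand-made illustration (int4 variants assumed):

    int4 m = ASYMM_MASK_IF_NON_ZERO((int4)(0, 1, 0, 7), 4);       // (0, -1, 0, -1)
    int4 r = ASYMM_SELECT_USING_MASK(m, (int4)100, (int4)200, 4); // (200, 100, 200, 100)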
 
 /** For each element of input vector, the corresponding bits of the result item are set
@@ -234,18 +238,19 @@
         return select(all_zeros, all_ones, (SELECT_VEC_DATA_TYPE(int, size))(a != 0));       \
     }
 
-#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                                                                                                         \
-    inline VEC_DATA_TYPE(int, size) exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder) \
-    {                                                                                                                                                                                                         \
-        if(k_integer_bits > exponent)                                                                                                                                                                         \
-        {                                                                                                                                                                                                     \
-            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                                                                                                          \
-            return ASYMM_SELECT_USING_MASK(                                                                                                                                                                   \
-                    ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),                                                                                                                              \
-                    ASYMM_MULT(result, fp_multiplier, size), result, size);                                                                                                                                       \
-        }                                                                                                                                                                                                     \
-        \
-        return result;                                                                                                                                                                                        \
+#define EXP_BARREL_SHIFTER_IMPL(size)                                                                                  \
+    inline VEC_DATA_TYPE(int, size)                                                                                    \
+        exp_barrel_shifter##size(VEC_DATA_TYPE(int, size) result, int exponent, int fp_multiplier, int k_integer_bits, \
+                                 int k_fractional_bits, VEC_DATA_TYPE(int, size) remainder)                            \
+    {                                                                                                                  \
+        if (k_integer_bits > exponent)                                                                                 \
+        {                                                                                                              \
+            const int k_shift_amount = k_integer_bits > exponent ? k_fractional_bits + exponent : 0;                   \
+            return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(remainder & (1 << k_shift_amount), size),            \
+                                           ASYMM_MULT(result, fp_multiplier, size), result, size);                     \
+        }                                                                                                              \
+                                                                                                                       \
+        return result;                                                                                                 \
     }
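
In other words: the partial result is multiplied by fp_multiplier only in lanes where bit (k_fractional_bits + exponent) of remainder is set, and when k_integer_bits <= exponent the result is returned unchanged.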
 
 /** Calculates \f$ exp(x) \f$ for x < 0.
@@ -254,39 +259,40 @@
  *
  * @return Result in fixed-point format Q0.
  */
-#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                               \
-    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits)        \
-    {                                                                                                                         \
-        const int k_fractional_bits = 31 - k_integer_bits;                                                                    \
-        VEC_DATA_TYPE(int, size)                                                                                              \
-        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                         \
-        VEC_DATA_TYPE(int, size)                                                                                              \
-        mask = k_one_quarter - 1;                                                                                             \
-        VEC_DATA_TYPE(int, size)                                                                                              \
-        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                         \
-        VEC_DATA_TYPE(int, size)                                                                                              \
-        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                           \
-        VEC_DATA_TYPE(int, size)                                                                                              \
-        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, size); \
-        VEC_DATA_TYPE(int, size)                                                                                              \
-        remainder = a_mod_quarter_minus_one_quarter - a;                                                                      \
-        \
-        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);              \
-        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);              \
-        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);               \
-        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);               \
-        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);                \
-        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);                  \
-        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);                     \
-        \
-        if(k_integer_bits > 5)                                                                                                \
-        {                                                                                                                     \
-            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                           \
-            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                       \
-        }                                                                                                                     \
-        \
-        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                                      \
-        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                                    \
+#define ASYMM_EXP_ON_NEGATIVE_VALUES_IMPL(size)                                                                        \
+    inline VEC_DATA_TYPE(int, size) asymm_exp_on_negative_values##size(VEC_DATA_TYPE(int, size) a, int k_integer_bits) \
+    {                                                                                                                  \
+        const int k_fractional_bits = 31 - k_integer_bits;                                                             \
+        VEC_DATA_TYPE(int, size)                                                                                       \
+        k_one_quarter = 1 << (k_fractional_bits - 2);                                                                  \
+        VEC_DATA_TYPE(int, size)                                                                                       \
+        mask = k_one_quarter - 1;                                                                                      \
+        VEC_DATA_TYPE(int, size)                                                                                       \
+        a_mod_quarter_minus_one_quarter = (a & mask) - k_one_quarter;                                                  \
+        VEC_DATA_TYPE(int, size)                                                                                       \
+        a_mod_quarter_minus_one_quarter_scaled = a_mod_quarter_minus_one_quarter << k_integer_bits;                    \
+        VEC_DATA_TYPE(int, size)                                                                                       \
+        result = ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a_mod_quarter_minus_one_quarter_scaled, \
+                                                                               size);                                  \
+        VEC_DATA_TYPE(int, size)                                                                                       \
+        remainder = a_mod_quarter_minus_one_quarter - a;                                                               \
+                                                                                                                       \
+        result = EXP_BARREL_SHIFTER(result, -2, 1672461947, k_integer_bits, k_fractional_bits, remainder, size);       \
+        result = EXP_BARREL_SHIFTER(result, -1, 1302514674, k_integer_bits, k_fractional_bits, remainder, size);       \
+        result = EXP_BARREL_SHIFTER(result, +0, 790015084, k_integer_bits, k_fractional_bits, remainder, size);        \
+        result = EXP_BARREL_SHIFTER(result, +1, 290630308, k_integer_bits, k_fractional_bits, remainder, size);        \
+        result = EXP_BARREL_SHIFTER(result, +2, 39332535, k_integer_bits, k_fractional_bits, remainder, size);         \
+        result = EXP_BARREL_SHIFTER(result, +3, 720401, k_integer_bits, k_fractional_bits, remainder, size);           \
+        result = EXP_BARREL_SHIFTER(result, +4, 242, k_integer_bits, k_fractional_bits, remainder, size);              \
+                                                                                                                       \
+        if (k_integer_bits > 5)                                                                                        \
+        {                                                                                                              \
+            const VEC_DATA_TYPE(int, size) clamp = -(1 << (k_fractional_bits + 5));                                    \
+            result = ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_NON_ZERO(a < clamp, size), 0, result, size);                \
+        }                                                                                                              \
+                                                                                                                       \
+        const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                               \
+        return ASYMM_SELECT_USING_MASK(ASYMM_MASK_IF_ZERO(a, size), Q0_one, result, size);                             \
     }
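
The barrel-shifter constants are again Q0.31 encodings of exp(-2^n): 1672461947 ~= exp(-1/4) * 2^31, 1302514674 ~= exp(-1/2) * 2^31, 790015084 ~= exp(-1) * 2^31, 290630308 ~= exp(-2) * 2^31, 39332535 ~= exp(-4) * 2^31, 720401 ~= exp(-8) * 2^31 and 242 ~= exp(-16) * 2^31; each call folds one factor into the result iff the matching remainder bit is set.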
 
 /** Calculates the product of an integer value by a power of two, with either a positive exponent
@@ -297,26 +303,27 @@
  *
  * @return Arithmetic left or right shift.
  */
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                                                  \
-    inline VEC_DATA_TYPE(int, size) asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
-    {                                                                                                                      \
-        if(exponent < 0)                                                                                                   \
-        {                                                                                                                  \
-            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                                                      \
-        }                                                                                                                  \
-        \
-        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                                                      \
-        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                                                      \
-        int threshold = ((1 << (31 - exponent)) - 1);                                                                      \
-        VEC_DATA_TYPE(int, size)                                                                                           \
-        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                                                       \
-        VEC_DATA_TYPE(int, size)                                                                                           \
-        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                                                      \
-        VEC_DATA_TYPE(int, size)                                                                                           \
-        result = x << exponent;                                                                                            \
-        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                                                \
-        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                                                \
-        return result;                                                                                                     \
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2_IMPL(size)                                      \
+    inline VEC_DATA_TYPE(int, size)                                                            \
+        asymm_saturating_rounding_mult_by_pow2##size(VEC_DATA_TYPE(int, size) x, int exponent) \
+    {                                                                                          \
+        if (exponent < 0)                                                                      \
+        {                                                                                      \
+            return ASYMM_ROUNDING_DIVIDE_BY_POW2(x, -exponent, size);                          \
+        }                                                                                      \
+                                                                                               \
+        const VEC_DATA_TYPE(int, size) min = INT_MIN;                                          \
+        const VEC_DATA_TYPE(int, size) max = INT_MAX;                                          \
+        int threshold                      = ((1 << (31 - exponent)) - 1);                     \
+        VEC_DATA_TYPE(int, size)                                                               \
+        positive_mask = ASYMM_MASK_IF_NON_ZERO(x > threshold, size);                           \
+        VEC_DATA_TYPE(int, size)                                                               \
+        negative_mask = ASYMM_MASK_IF_NON_ZERO(x < -threshold, size);                          \
+        VEC_DATA_TYPE(int, size)                                                               \
+        result = x << exponent;                                                                \
+        result = ASYMM_SELECT_USING_MASK(positive_mask, max, result, size);                    \
+        result = ASYMM_SELECT_USING_MASK(negative_mask, min, result, size);                    \
+        return result;                                                                         \
     }
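
A hedged illustration of both paths (int4 variant assumed; with exponent = 2 the saturation threshold is (1 << 29) - 1):

    int4 x = (int4)(1 << 30, -(1 << 30), 3, -3);
    int4 r = ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, 2, 4); // (INT_MAX, INT_MIN, 12, -12)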
 
 /** Calculates (a+b)/2, rounded to the nearest integer.
@@ -326,20 +333,21 @@
  *
  * @return (a+b)/2, rounded to the nearest integer.
  */
-#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                                                \
-    inline VEC_DATA_TYPE(int, size) asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
-    {                                                                                                                     \
-        VEC_DATA_TYPE(long, size)                                                                                         \
-        a64 = convert_long##size(a);                                                                                      \
-        VEC_DATA_TYPE(long, size)                                                                                         \
-        b64 = convert_long##size(b);                                                                                      \
-        VEC_DATA_TYPE(long, size)                                                                                         \
-        sum = a64 + b64;                                                                                                  \
-        const VEC_DATA_TYPE(long, size) one       = 1;                                                                    \
-        const VEC_DATA_TYPE(long, size) minus_one = -1;                                                                   \
-        VEC_DATA_TYPE(long, size)                                                                                         \
-        sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0));                                      \
-        return convert_int##size((sum + sign) / 2);                                                                       \
+#define ASYMM_ROUNDING_HALF_SUM_IMPL(size)                                                    \
+    inline VEC_DATA_TYPE(int, size)                                                           \
+        asymm_rounding_half_sum##size(VEC_DATA_TYPE(int, size) a, VEC_DATA_TYPE(int, size) b) \
+    {                                                                                         \
+        VEC_DATA_TYPE(long, size)                                                             \
+        a64 = convert_long##size(a);                                                          \
+        VEC_DATA_TYPE(long, size)                                                             \
+        b64 = convert_long##size(b);                                                          \
+        VEC_DATA_TYPE(long, size)                                                             \
+        sum                                       = a64 + b64;                                \
+        const VEC_DATA_TYPE(long, size) one       = 1;                                        \
+        const VEC_DATA_TYPE(long, size) minus_one = -1;                                       \
+        VEC_DATA_TYPE(long, size)                                                             \
+        sign = select(minus_one, one, (SELECT_VEC_DATA_TYPE(long, size))(sum >= 0));          \
+        return convert_int##size((sum + sign) / 2);                                           \
     }
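
For example (int4 variant assumed), the widening to long makes the extreme case safe:

    int4 h = ASYMM_ROUNDING_HALF_SUM((int4)(1, -1, INT_MAX, 3), (int4)(2, -2, INT_MAX, 3), 4);
    // h == (2, -2, INT_MAX, 3): halves round away from zero, and INT_MAX + INT_MAX cannot overflow in 64 bits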
 
 /** Calculates \f$ 1 / (1 + x) \f$ for x in (0, 1).
@@ -354,12 +362,12 @@
         const VEC_DATA_TYPE(int, size) Q0_one = INT_MAX;                                                     \
         const VEC_DATA_TYPE(int, size) Q2_one = 1 << (31 - 2);                                               \
         VEC_DATA_TYPE(int, size)                                                                             \
-        half_denominator = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);                                         \
+        half_denominator                                 = ASYMM_ROUNDING_HALF_SUM(a, Q0_one, size);         \
         const VEC_DATA_TYPE(int, size) Q2_48_over_17     = 1515870810;                                       \
         const VEC_DATA_TYPE(int, size) Q2_neg_32_over_17 = -1010580540;                                      \
         VEC_DATA_TYPE(int, size)                                                                             \
         x = Q2_48_over_17 + ASYMM_MULT(half_denominator, Q2_neg_32_over_17, size);                           \
-        for(int i = 0; i < 3; i++)                                                                           \
+        for (int i = 0; i < 3; i++)                                                                          \
         {                                                                                                    \
             VEC_DATA_TYPE(int, size)                                                                         \
             half_denominator_times_x = ASYMM_MULT(half_denominator, x, size);                                \
@@ -378,48 +386,57 @@
  *
  * @return Rescaled value.
  */
-#define ASYMM_RESCALE_IMPL(size)                                                                                                    \
-    inline VEC_DATA_TYPE(int, size) asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
-    {                                                                                                                               \
-        int exponent = src_integer_bits - dst_integer_bits;                                                                         \
-        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                                                       \
+#define ASYMM_RESCALE_IMPL(size)                                                                        \
+    inline VEC_DATA_TYPE(int, size)                                                                     \
+        asymm_rescale##size(VEC_DATA_TYPE(int, size) value, int src_integer_bits, int dst_integer_bits) \
+    {                                                                                                   \
+        int exponent = src_integer_bits - dst_integer_bits;                                             \
+        return ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(value, exponent, size);                           \
     }
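
For instance, rescaling a Q5.26 value to Q0.31 uses exponent = 5 - 0 = 5, i.e. a saturating left shift of the raw representation by five bits.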
 
-#define QUANTIZE_STR(input, offset, scale, type, size) quantize_##type##size(input, offset, scale)
-#define QUANTIZE(input, offset, scale, type, size) QUANTIZE_STR(input, offset, scale, type, size)
+#define QUANTIZE_STR(input, offset, scale, type, size)   quantize_##type##size(input, offset, scale)
+#define QUANTIZE(input, offset, scale, type, size)       QUANTIZE_STR(input, offset, scale, type, size)
 #define DEQUANTIZE_STR(input, offset, scale, type, size) dequantize_##type##size(input, offset, scale)
-#define DEQUANTIZE(input, offset, scale, type, size) DEQUANTIZE_STR(input, offset, scale, type, size)
+#define DEQUANTIZE(input, offset, scale, type, size)     DEQUANTIZE_STR(input, offset, scale, type, size)
 
 #define ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size) asymm_rounding_divide_by_POW2_##size(x, exponent)
-#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size) ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
-#define ASYMM_MULT_STR(a, b, size) asymm_mult##size(a, b)
-#define ASYMM_MULT(a, b, size) ASYMM_MULT_STR(a, b, size)
+#define ASYMM_ROUNDING_DIVIDE_BY_POW2(x, exponent, size)     ASYMM_ROUNDING_DIVIDE_BY_POW2_STR(x, exponent, size)
+#define ASYMM_MULT_STR(a, b, size)                           asymm_mult##size(a, b)
+#define ASYMM_MULT(a, b, size)                               ASYMM_MULT_STR(a, b, size)
 #define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size) \
     ASYMM_MULT(x *((VEC_DATA_TYPE(int, size))(1) << (-left_shift)), quantized_multiplier, size)
 #define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size) \
     ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(x, quantized_multiplier, size), right_shift, size)
-#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
-#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) asymm_select_using_mask##size(if_mask, then_val, else_val)
-#define ASYMM_MASK_IF_ZERO(a, size) asymm_mask_if_zero##size(a)
+#define ASYMM_EXP_ON_INTERVAL_BETWEEN_NEGATIVE_ONE_QUARTER_AND_0_EXCL(a, size) \
+    asymm_exp_on_interval_between_negative_one_quarter_and_0_excl##size(a)
+#define ASYMM_SELECT_USING_MASK(if_mask, then_val, else_val, size) \
+    asymm_select_using_mask##size(if_mask, then_val, else_val)
+#define ASYMM_MASK_IF_ZERO(a, size)     asymm_mask_if_zero##size(a)
 #define ASYMM_MASK_IF_NON_ZERO(a, size) asymm_mask_if_non_zero##size(a)
-#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
+#define EXP_BARREL_SHIFTER(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder, size) \
+    exp_barrel_shifter##size(result, exponent, fp_multiplier, k_integer_bits, k_fractional_bits, remainder)
 #define ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size) asymm_exp_on_negative_values##size(a, k_integer_bits)
-#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size) ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size) asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
-#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size) ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
-#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
+#define ASYMM_EXP_ON_NEGATIVE_VALUES(a, k_integer_bits, size)     ASYMM_EXP_ON_NEGATIVE_VALUES_STR(a, k_integer_bits, size)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)       asymm_one_over_one_plus_x_for_x_in_0_1##size(a)
+#define ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1(a, size)           ASYMM_ONE_OVER_ONE_PLUS_X_FOR_X_IN_0_1_STR(a, size)
+#define ASYMM_SATURATING_ROUNDING_MULT_BY_POW2(x, exponent, size) \
+    asymm_saturating_rounding_mult_by_pow2##size(x, exponent)
 #define ASYMM_ROUNDING_HALF_SUM(a, b, size) asymm_rounding_half_sum##size(a, b)
-#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
-#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
+#define ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size) \
+    asymm_rescale##size(value, src_integer_bits, dst_integer_bits)
+#define ASYMM_RESCALE(value, src_integer_bits, dst_integer_bits, size) \
+    ASYMM_RESCALE_STR(value, src_integer_bits, dst_integer_bits, size)
 
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size)                                                                             \
-    inline VEC_DATA_TYPE(int, size) multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift) \
-    {                                                                                                                           \
-        const int left_shift  = shift > 0 ? shift : 0;                                                                          \
-        const int right_shift = shift > 0 ? 0 : -shift;                                                                         \
-        return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size);             \
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER_IMPL(size)                                                                 \
+    inline VEC_DATA_TYPE(int, size)                                                                                 \
+        multiply_by_quantized_multiplier##size(VEC_DATA_TYPE(int, size) input, int qmul, int shift)                 \
+    {                                                                                                               \
+        const int left_shift  = shift > 0 ? shift : 0;                                                              \
+        const int right_shift = shift > 0 ? 0 : -shift;                                                             \
+        return ASYMM_ROUNDING_DIVIDE_BY_POW2(ASYMM_MULT(input * (1 << left_shift), qmul, size), right_shift, size); \
     }
-#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) multiply_by_quantized_multiplier##size(input, qmul, shift)
+#define MULTIPLY_BY_QUANTIZED_MULTIPLIER(input, qmul, shift, size) \
+    multiply_by_quantized_multiplier##size(input, qmul, shift)
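
A hedged sketch of the net effect (int4 variant assumed): the applied scale is (qmul / 2^31) * 2^shift, so qmul = 1 << 30 (0.5 in Q0.31) with shift = 1 is the identity:

    int4 y = MULTIPLY_BY_QUANTIZED_MULTIPLIER((int4)(1000), 1 << 30, 1, 4); // (1000, 1000, 1000, 1000)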
 
 QUANTIZE_IMPL(uchar, 1)
 QUANTIZE_IMPL(char, 1)
diff --git a/src/core/CL/cl_kernels/load_store_utility.h b/src/core/CL/cl_kernels/load_store_utility.h
index 4ba2b2c..4daf0ad 100644
--- a/src/core/CL/cl_kernels/load_store_utility.h
+++ b/src/core/CL/cl_kernels/load_store_utility.h
@@ -223,8 +223,10 @@
  * @param[in] Z         The offset in z-axis direction
  * @{
  */
-#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 /** @} */ // end of group STORE_BLOCK
 
 /** Convert and store a block of the given size M0xN0
@@ -245,8 +247,10 @@
  * @param[in] Z         The offset in z-axis direction
  * @{
  */
-#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define CONVERT_STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    CONVERT_STORE_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 /** @} */ // end of group CONVERT_STORE_BLOCK
 
 /** Partially store the 0 to (n-1)th rows of the given variables
@@ -365,8 +369,10 @@
  * @param[in] Z         The offset in z-axis direction
  * @{
  */
-#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
-#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_ROW_PARTIAL_##STORE_M0(N0, STORE_N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
+#define STORE_BLOCK_PARTIAL(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) \
+    STORE_BLOCK_PARTIAL_STR(STORE_M0, STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 /** Store a block that can be partial in both x and y dimensions
  *
  * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vstore(s) will be invoked, thus incurring a small performance penalty.
@@ -388,22 +394,23 @@
  * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
  * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
  */
-#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
-    if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                                                                            \
-    {                                                                                                                                                     \
-        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                                           \
-    }                                                                                                                                                     \
-    else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                                                                        \
-    {                                                                                                                                                     \
-        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
-    }                                                                                                                                                     \
-    else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                                                                        \
-    {                                                                                                                                                     \
-        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                                             \
-    }                                                                                                                                                     \
-    else                                                                                                                                                  \
-    {                                                                                                                                                     \
-        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                               \
+#define STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0,     \
+                                       PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)                    \
+    if (!(PARTIAL_COND_X) && !(PARTIAL_COND_Y))                                                             \
+    {                                                                                                       \
+        STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                             \
+    }                                                                                                       \
+    else if ((PARTIAL_COND_Y) && !(PARTIAL_COND_X))                                                         \
+    {                                                                                                       \
+        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
+    }                                                                                                       \
+    else if (!(PARTIAL_COND_Y) && (PARTIAL_COND_X))                                                         \
+    {                                                                                                       \
+        STORE_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);               \
+    }                                                                                                       \
+    else                                                                                                    \
+    {                                                                                                       \
+        STORE_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z); \
     }
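
All four arms are compiled in; the runtime PARTIAL_COND_Y/PARTIAL_COND_X flags select exactly one STORE_BLOCK_PARTIAL call, with block sizes (M0, N0), (PARTIAL_STORE_M0, N0), (M0, PARTIAL_STORE_N0) or (PARTIAL_STORE_M0, PARTIAL_STORE_N0) respectively.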
 /** Store a block that can only be partial in x but not y.
  *
@@ -425,7 +432,7 @@
  * @param[in] PARTIAL_COND_X   Condition on the x axis to perform the partial store X. True to use PARTIAL_STORE_N0 rather than N0.
  */
 #define STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
-    if(!(PARTIAL_COND_X))                                                                                         \
+    if (!(PARTIAL_COND_X))                                                                                        \
     {                                                                                                             \
         STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
     }                                                                                                             \
@@ -453,7 +460,7 @@
  * @param[in] PARTIAL_COND_Y   Condition on the y axis to perform the partial store Y. True to use PARTIAL_STORE_M0 rather than M0.
  */
 #define STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
-    if(!(PARTIAL_COND_Y))                                                                                         \
+    if (!(PARTIAL_COND_Y))                                                                                        \
     {                                                                                                             \
         STORE_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z);                                   \
     }                                                                                                             \
@@ -517,23 +524,28 @@
 #if defined(PARTIAL_STORE_M0) && defined(PARTIAL_STORE_N0)
 #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
 // Case1: No partial blocks in either x or y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+                                   PARTIAL_COND_Y, PARTIAL_COND_X)                                                    \
     STORE_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
 
 #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
 // Case2: Partial blocks in y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+                                   PARTIAL_COND_Y, PARTIAL_COND_X)                                                    \
     STORE_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
 
 #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
 // Case3: Partial blocks in x
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+                                   PARTIAL_COND_Y, PARTIAL_COND_X)                                                    \
     STORE_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
 
 #else // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
 // Case4: Partial blocks in both x and y
-#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
-    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
+#define STORE_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+                                   PARTIAL_COND_Y, PARTIAL_COND_X)                                                    \
+    STORE_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, \
+                                   PARTIAL_COND_Y, PARTIAL_COND_X)
 
 #endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
 
@@ -560,8 +572,7 @@
 #define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
     ((uint)(max(0, (int)(y * M0) - (int)((M0 - PARTIAL_STORE_M0) % M0))))
 #else // defined(PARTIAL_STORE_M0)
-#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) \
-    ((uint)(y * M0))
+#define COMPUTE_M0_START_ROW(y, M0, PARTIAL_STORE_M0) ((uint)(y * M0))
 #endif    // defined(PARTIAL_STORE_M0)
 /** @} */ // end of group COMPUTE_M0_START_ROW
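
A worked example of the partial branch: with M0 = 4 and PARTIAL_STORE_M0 = 1, (M0 - PARTIAL_STORE_M0) % M0 = 3, so y = 0, 1, 2, 3 map to start rows 0, 1, 5, 9. The first block covers only the PARTIAL_STORE_M0 leftover rows, and every later block stores a full M0 rows.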
 
diff --git a/src/core/CL/cl_kernels/repeat.h b/src/core/CL/cl_kernels/repeat.h
index bed94a7..cb2f4b0 100644
--- a/src/core/CL/cl_kernels/repeat.h
+++ b/src/core/CL/cl_kernels/repeat.h
@@ -75,7 +75,9 @@
     P_X##_DEF(F, P_A, P_B, P_C);        \
     REPEAT_3_15(P_X, P_A, P_B, P_C)
 
-#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_3_##P_NUM(P_OP, P_A, P_B, P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C) \
+    REPEAT_3_##P_NUM(P_OP, P_A, P_B,               \
+                     P_C) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
 #define REPEAT_3_N(P_NUM, P_OP, P_A, P_B, P_C) REPEAT_DEF_3_N(P_NUM, P_OP, P_A, P_B, P_C)
 
 // Repeat macros with 4 param, excluding the implicit ID param
@@ -126,52 +128,59 @@
     P_X##_DEF(F, P_A, P_B, P_C, P_D);        \
     REPEAT_4_15(P_X, P_A, P_B, P_C, P_D)
 
-#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C, P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
+#define REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) \
+    REPEAT_4_##P_NUM(P_OP, P_A, P_B, P_C,               \
+                     P_D) //One level of indirection to ensure order of expansion does not affect preprocessing P_NUM
 #define REPEAT_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D) REPEAT_DEF_4_N(P_NUM, P_OP, P_A, P_B, P_C, P_D)
 
 // Macro for initializing N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = VAL
+#define VAR_INIT_TO_CONST_DEF(ID, TYPE, VAR, VAL)   TYPE VAR##ID = VAL
 #define REPEAT_VAR_INIT_TO_CONST(N, TYPE, VAR, VAL) REPEAT_3_N(N, VAR_INIT_TO_CONST, TYPE, VAR, VAL)
 
 // Macro for initializing N variables by converting the data type. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
+#define VAR_INIT_CONVERT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT)   TYPE_OUT VAR_OUT##ID = CONVERT(VAR_IN##ID, TYPE_OUT)
 #define REPEAT_VAR_INIT_CONVERT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT, TYPE_OUT, VAR_IN, VAR_OUT)
 
 // Macro for initializing N variables by converting the data type with saturation. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
 #define VAR_INIT_CONVERT_SAT_DEF(ID, TYPE_OUT, VAR_IN, VAR_OUT) TYPE_OUT VAR_OUT##ID = CONVERT_SAT(VAR_IN##ID, TYPE_OUT)
-#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
+#define REPEAT_VAR_INIT_CONVERT_SAT(N, TYPE_OUT, VAR_IN, VAR_OUT) \
+    REPEAT_3_N(N, VAR_INIT_CONVERT_SAT, TYPE_OUT, VAR_IN, VAR_OUT)
 
 // Macro for adding a constant to N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID += (TYPE)VAL
+#define ADD_CONST_TO_VAR_DEF(ID, TYPE, VAR, VAL)   VAR##ID += (TYPE)VAL
 #define REPEAT_ADD_CONST_TO_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, ADD_CONST_TO_VAR, TYPE, VAR, VAL)
 
 // Macro for multiplying N variables (VAR_B) by a constant (VAL) and adding to other N variables (VAR_A). Generates N statements that define VAR_A##N = RHS_ACCESSOR_DEF(...)
-#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL) VAR_A##ID += VAR_B##ID * VAL
+#define MLA_VAR_WITH_CONST_VEC_DEF(ID, VAR_A, VAR_B, VAL)   VAR_A##ID += VAR_B##ID * VAL
 #define REPEAT_MLA_VAR_WITH_CONST_VEC(N, VAR_A, VAR_B, VAL) REPEAT_3_N(N, MLA_VAR_WITH_CONST_VEC, VAR_A, VAR_B, VAL)
 
 // Macro for adding a vector to N-variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
 #define ADD_VECTOR_TO_VAR_DEF(ID, TYPE, VAR, VEC) VAR##ID += VEC
-#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC) REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
+#define REPEAT_ADD_VECTOR_TO_VAR(N, VAR, VEC)     REPEAT_3_N(N, ADD_VECTOR_TO_VAR, "", VAR, VEC)
 
 // Macro for adding two N-variables together. Generates N statements that define VAR_A##N = RHS_ACCESSOR_DEF(...)
 #define ADD_TWO_VARS_DEF(ID, TYPE, VAR_A, VAR_B) VAR_A##ID += VAR_B##ID
-#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B) REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
+#define REPEAT_ADD_TWO_VARS(N, VAR_A, VAR_B)     REPEAT_3_N(N, ADD_TWO_VARS, "", VAR_A, VAR_B)
 
 // Macro for performing Max between a constant and N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = max(VAR##ID, (TYPE)VAL)
+#define MAX_CONST_VAR_DEF(ID, TYPE, VAR, VAL)   VAR##ID = max(VAR##ID, (TYPE)VAL)
 #define REPEAT_MAX_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MAX_CONST_VAR, TYPE, VAR, VAL)
 
 // Macro for performing Min between a constant and N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL) VAR##ID = min(VAR##ID, (TYPE)VAL)
+#define MIN_CONST_VAR_DEF(ID, TYPE, VAR, VAL)   VAR##ID = min(VAR##ID, (TYPE)VAL)
 #define REPEAT_MIN_CONST_VAR(N, TYPE, VAR, VAL) REPEAT_3_N(N, MIN_CONST_VAR, TYPE, VAR, VAL)
 
 // Macro for performing ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE to N variables. Generates N statements that define VAR##N = RHS_ACCESSOR_DEF(...)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+    VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+    REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
 
 // Macro for applying ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE to N variables. Generates N statements of the form VAR##N = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##N, RES_MUL, RES_SHIFT, SIZE)
-#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT) \
+    VAR##ID = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, SIZE)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+    REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE, SIZE, VAR, RES_MUL, RES_SHIFT)
 
 // Macro for applying per-channel ASYMM_MULT_BY_QUANT_MULTIPLIER to N variables.
 #define ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL_DEF(ID, SIZE, VAR, RES_MUL, RES_SHIFT)                     \
@@ -182,6 +191,7 @@
         VAR##ID_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(VAR##ID, RES_MUL, RES_SHIFT, N0);    \
         VAR##ID           = select(VAR##ID_shift_lt0, VAR##ID_shift_gt0, RES_SHIFT >= 0);                     \
     })
-#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
+#define REPEAT_ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL(N, SIZE, VAR, RES_MUL, RES_SHIFT) \
+    REPEAT_4_N(N, ASYMM_MULT_BY_QUANT_MULTIPLIER_PER_CHANNEL, SIZE, VAR, RES_MUL, RES_SHIFT)
 
 #endif // ARM_COMPUTE_REPEAT_H
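
For context, every REPEAT_* helper reformatted above follows one pattern: a *_DEF(ID, ...) macro that token-pastes the iteration index onto a variable name, driven by REPEAT_3_N/REPEAT_4_N which instantiates it for IDs 0..N-1. A minimal, self-contained C++ sketch of that token-pasting technique (the DEMO_* names are illustrative only, not part of repeat.h):

    #include <cstdio>

    // Same ID-pasting idea as VAR_INIT_CONVERT_SAT_DEF, minus the OpenCL types.
    #define DEMO_DEF(ID, TYPE, VAR, VAL) TYPE VAR##ID = (TYPE)(VAL)
    // Hand-rolled stand-in for REPEAT_3_N with N = 2.
    #define DEMO_REPEAT_2(TYPE, VAR, VAL) \
        DEMO_DEF(0, TYPE, VAR, VAL);      \
        DEMO_DEF(1, TYPE, VAR, VAL)

    int main()
    {
        DEMO_REPEAT_2(int, v, 42); // expands to: int v0 = (int)(42); int v1 = (int)(42);
        std::printf("%d %d\n", v0, v1);
        return 0;
    }
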
diff --git a/src/core/CL/cl_kernels/warp_helpers.h b/src/core/CL/cl_kernels/warp_helpers.h
index 642483a..6595bd1 100644
--- a/src/core/CL/cl_kernels/warp_helpers.h
+++ b/src/core/CL/cl_kernels/warp_helpers.h
@@ -31,11 +31,13 @@
  * @param[in] border_size Border size of the image
  *
  */
-inline const float8 clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
+inline const float8
+clamp_to_border_with_size(float8 coords, const float width, const float height, const float border_size)
 {
     const float4 clamped_x = clamp(coords.even, 0.0f - border_size, width - 1 + border_size);
     const float4 clamped_y = clamp(coords.odd, 0.0f - border_size, height - 1 + border_size);
-    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3, clamped_y.s3);
+    return (float8)(clamped_x.s0, clamped_y.s0, clamped_x.s1, clamped_y.s1, clamped_x.s2, clamped_y.s2, clamped_x.s3,
+                    clamped_y.s3);
 }
 
 /** Clamps the given coordinates to the borders.
@@ -74,7 +76,8 @@
  */
 inline const float8 get_neighbour_coords(const float2 coord)
 {
-    return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1, /*br*/ coord.s0 + 1, coord.s1 + 1);
+    return (float8)(/*tl*/ coord.s0, coord.s1, /*tr*/ coord.s0 + 1, coord.s1, /*bl*/ coord.s0, coord.s1 + 1,
+                    /*br*/ coord.s0 + 1, coord.s1 + 1);
 }
 
 /** Computes the bilinear interpolation for each set of coordinates in the vector coords and returns the values
@@ -85,37 +88,38 @@
  * @param[in] height      Height of the image
  * @param[in] border_size Border size
  */
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(const Image *in, const float8 coords, const float width, const float height, const float border_size)
+inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate_with_border(
+    const Image *in, const float8 coords, const float width, const float height, const float border_size)
 {
     // If any of the 4 texels is out of the image's boundaries, the border value (REPLICATE or CONSTANT) is used for that texel.
 
     // Sets the 4x4 coordinates for each of the four input texels
     const float8  fc = floor(coords);
-    const float16 c1 = (float16)(
-                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
-                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
-    const float16 c2 = (float16)(
-                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
-                           clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
+    const float16 c1 =
+        (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s0, fc.s1)), width, height, border_size),
+                  clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s2, fc.s3)), width, height, border_size));
+    const float16 c2 =
+        (float16)(clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s4, fc.s5)), width, height, border_size),
+                  clamp_to_border_with_size(get_neighbour_coords((float2)(fc.s6, fc.s7)), width, height, border_size));
 
     // Loads the values from the input image
     const float16 t = (float16)(
-                          /* tl, tr, bl, br */
-                          * ((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
-                          *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
-                          *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
-                          *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
-                          *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
-                          *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
-                          *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
-                          *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
-    const float8 a  = coords - fc;
-    const float8 b  = ((float8)(1.f)) - a;
-    const float4 fr = (float4)(
-                          ((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
-                          ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
-                          ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
-                          ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
+        /* tl, tr, bl, br */
+        *((__global DATA_TYPE *)offset(in, c1.s0, c1.s1)), *((__global DATA_TYPE *)offset(in, c1.s2, c1.s3)),
+        *((__global DATA_TYPE *)offset(in, c1.s4, c1.s5)), *((__global DATA_TYPE *)offset(in, c1.s6, c1.s7)),
+        *((__global DATA_TYPE *)offset(in, c1.s8, c1.s9)), *((__global DATA_TYPE *)offset(in, c1.sa, c1.sb)),
+        *((__global DATA_TYPE *)offset(in, c1.sc, c1.sd)), *((__global DATA_TYPE *)offset(in, c1.se, c1.sf)),
+        *((__global DATA_TYPE *)offset(in, c2.s0, c2.s1)), *((__global DATA_TYPE *)offset(in, c2.s2, c2.s3)),
+        *((__global DATA_TYPE *)offset(in, c2.s4, c2.s5)), *((__global DATA_TYPE *)offset(in, c2.s6, c2.s7)),
+        *((__global DATA_TYPE *)offset(in, c2.s8, c2.s9)), *((__global DATA_TYPE *)offset(in, c2.sa, c2.sb)),
+        *((__global DATA_TYPE *)offset(in, c2.sc, c2.sd)), *((__global DATA_TYPE *)offset(in, c2.se, c2.sf)));
+    const float8 a = coords - fc;
+    const float8 b = ((float8)(1.f)) - a;
+    const float4 fr =
+        (float4)(((t.s0 * b.s0 * b.s1) + (t.s1 * a.s0 * b.s1) + (t.s2 * b.s0 * a.s1) + (t.s3 * a.s0 * a.s1)),
+                 ((t.s4 * b.s2 * b.s3) + (t.s5 * a.s2 * b.s3) + (t.s6 * b.s2 * a.s3) + (t.s7 * a.s2 * a.s3)),
+                 ((t.s8 * b.s4 * b.s5) + (t.s9 * a.s4 * b.s5) + (t.sa * b.s4 * a.s5) + (t.sb * a.s4 * a.s5)),
+                 ((t.sc * b.s6 * b.s7) + (t.sd * a.s6 * b.s7) + (t.se * b.s6 * a.s7) + (t.sf * a.s6 * a.s7)));
     return CONVERT(fr, VEC_DATA_TYPE(DATA_TYPE, 4));
 }
 
@@ -126,7 +130,8 @@
  * @param[in] width  Width of the image
  * @param[in] height Height of the image
  */
-inline const VEC_DATA_TYPE(DATA_TYPE, 4) bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
+inline const VEC_DATA_TYPE(DATA_TYPE, 4)
+    bilinear_interpolate(const Image *in, const float8 coords, const float width, const float height)
 {
     return bilinear_interpolate_with_border(in, coords, width, height, 1);
 }
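
The reflowed arithmetic in bilinear_interpolate_with_border is the standard 2x2 bilinear blend. A scalar C++ sketch of the same weighting, assuming in-bounds coordinates and a row-major float image (the border handling the kernel performs via clamp_to_border_with_size is omitted here):

    #include <cmath>

    // tl/tr/bl/br mirror t.s0..t.s3; ax/ay are the fractional parts ('a' in the
    // kernel) and bx/by their complements ('b' in the kernel).
    float bilinear_sample(const float *img, int stride, float x, float y)
    {
        const int   x0 = static_cast<int>(std::floor(x));
        const int   y0 = static_cast<int>(std::floor(y));
        const float ax = x - x0;
        const float ay = y - y0;
        const float bx = 1.f - ax;
        const float by = 1.f - ay;
        const float tl = img[y0 * stride + x0];
        const float tr = img[y0 * stride + x0 + 1];
        const float bl = img[(y0 + 1) * stride + x0];
        const float br = img[(y0 + 1) * stride + x0 + 1];
        return (tl * bx * by) + (tr * ax * by) + (bl * bx * ay) + (br * ax * ay);
    }
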
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
index 2728958..5b72354 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -44,16 +45,20 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::S64);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+                                    "Only ARG_IDX_MAX and ARG_IDX_MIN are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+                                    "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64, DataType::U64);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32, DataType::S64,
+                                                             DataType::U64);
     }
 
     return Status{};
@@ -66,22 +71,34 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLArgMinMaxLayerKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const ICLTensor   *input,
+                                       ICLTensor         *output,
+                                       unsigned int       axis,
+                                       ReductionOperation op)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
 }
 
-void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLArgMinMaxLayerKernel::configure(const CLCompileContext &compile_context,
+                                       const ICLTensor        *input,
+                                       ICLTensor              *output,
+                                       unsigned int            axis,
+                                       ReductionOperation      op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    TensorShape output_shape{ input->info()->tensor_shape() };
+    TensorShape output_shape{input->info()->tensor_shape()};
     output_shape.set(axis, 1);
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(DataType::S32).reset_padding().set_is_resizable(true));
+    auto_init_if_empty(*output->info(), input->info()
+                                            ->clone()
+                                            ->set_tensor_shape(output_shape)
+                                            .set_data_type(DataType::S32)
+                                            .reset_padding()
+                                            .set_is_resizable(true));
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
 
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input          = input;
     _output         = output;
@@ -90,11 +107,14 @@
 
     // Set build options
     const auto adjusted_vector_size = adjust_vec_size(16U, input->info()->dimension(0));
-    const auto vector_size          = (adjusted_vector_size == 3U && axis == 0U) ? 2U : adjusted_vector_size; // the opencl kernel only supports sizes 2, 4, 8 and 16.
+    const auto vector_size          = (adjusted_vector_size == 3U && axis == 0U)
+                                          ? 2U
+                                          : adjusted_vector_size; // the OpenCL kernel only supports sizes 2, 4, 8 and 16.
 
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % vector_size));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(input->info()->dimension(0) % vector_size));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vector_size));
     build_opts.add_option_if(is_data_type_float(input->info()->data_type()), "-DFLOAT_DATA_TYPE");
     build_opts.add_option_if_else(op == ReductionOperation::ARG_IDX_MAX, "-DARG_MAX", "-DARG_MIN");
@@ -104,7 +124,7 @@
 
     // Create kernel
     std::string kernel_axis_name;
-    switch(axis)
+    switch (axis)
     {
         case 0:
             build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
@@ -135,7 +155,10 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLArgMinMaxLayerKernel::validate(const ITensorInfo *input,
+                                        const ITensorInfo *output,
+                                        unsigned int       axis,
+                                        ReductionOperation op)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
     return Status{};
@@ -146,7 +169,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    switch(_reduction_axis)
+    switch (_reduction_axis)
     {
         case 0:
         {
@@ -154,7 +177,8 @@
             Window out_window(window);
             Window in_window(window);
             out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
-            in_window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+            in_window.set(Window::DimX,
+                          Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
             in_window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), 1u));
 
             // Get first input and output slices
@@ -166,15 +190,15 @@
                 add_2D_tensor_argument(idx, _input, in_slice);
                 add_2D_tensor_argument(idx, _output, out_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+            } while (in_window.slide_window_slice_2D(in_slice) && out_window.slide_window_slice_2D(out_slice));
         }
         break;
         case 1:
         {
             // Get first input and output slices
-            Window window_in{ window };
-            window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+            Window window_in{window};
+            window_in.set(Window::DimY,
+                          Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
             Window in_slice  = window_in.first_slice_window_2D();
             Window out_slice = window.first_slice_window_2D();
 
@@ -184,15 +208,15 @@
                 add_2D_tensor_argument(idx, _input, in_slice);
                 add_2D_tensor_argument(idx, _output, out_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+            } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
         }
         break;
         case 2:
         {
             // Get first input and output slices
-            Window window_in{ window };
-            window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+            Window window_in{window};
+            window_in.set(Window::DimZ,
+                          Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
             Window in_slice  = window_in.first_slice_window_3D();
             Window out_slice = window.first_slice_window_3D();
 
@@ -202,14 +226,13 @@
                 add_3D_tensor_argument(idx, _input, in_slice);
                 add_3D_tensor_argument(idx, _output, out_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+            } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
         }
         break;
         case 3:
         {
             // Get first input and output slices
-            Window window_in{ window };
+            Window window_in{window};
             window_in.set(3, Window::Dimension(0, 1, 1));
             Window in_slice  = window_in.first_slice_window_4D();
             Window out_slice = window.first_slice_window_4D();
@@ -220,8 +243,7 @@
                 add_4D_tensor_argument(idx, _input, in_slice);
                 add_4D_tensor_argument(idx, _output, out_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+            } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
         }
         break;
         default:
diff --git a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
index 5f36bdf..fb3b41b 100644
--- a/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
+++ b/src/core/CL/kernels/CLArgMinMaxLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLARGMINMAXLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -72,7 +73,11 @@
      * @param[in]  axis            Axis along which to reduce. Supported reduction axis: 0, 1, 2, 3
      * @param[in]  op              Reduction operation to perform. Only ArgMin and ArgMax are supported.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   unsigned int            axis,
+                   ReductionOperation      op);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLArgMinMaxLayerKernel.
      *
@@ -84,7 +89,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
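
A usage sketch of the validate/configure pair reflowed above, under the assumption that the tensors are already initialised and the CL runtime is set up elsewhere (this kernel lives under src/ and is not part of the public API):

    using namespace arm_compute;

    CLTensor input, output; // assumed configured with valid TensorInfo elsewhere
    const unsigned int axis = 0;

    const Status s = CLArgMinMaxLayerKernel::validate(input.info(), output.info(), axis,
                                                      ReductionOperation::ARG_IDX_MAX);
    if (s.error_code() == ErrorCode::OK)
    {
        CLArgMinMaxLayerKernel kernel;
        kernel.configure(&input, &output, axis, ReductionOperation::ARG_IDX_MAX);
    }
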
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
index 3fa8a8e..c88a852 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.cpp
@@ -23,58 +23,64 @@
  */
 #include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 using namespace arm_compute;
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const ITensorInfo *mean, const ITensorInfo *var,
-                          const ITensorInfo *beta, const ITensorInfo *gamma,
-                          float epsilon, ActivationLayerInfo act_info)
+Status validate_arguments(const ITensorInfo  *input,
+                          const ITensorInfo  *output,
+                          const ITensorInfo  *mean,
+                          const ITensorInfo  *var,
+                          const ITensorInfo  *beta,
+                          const ITensorInfo  *gamma,
+                          float               epsilon,
+                          ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_UNUSED(epsilon);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
-    if(beta != nullptr)
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(
+                                    input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+    if (beta != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
     }
-    if(gamma != nullptr)
+    if (gamma != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
     }
 
-    if(act_info.enabled())
+    if (act_info.enabled())
     {
         ActivationLayerInfo::ActivationFunction act = act_info.activation();
         ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32 && input->data_type() != DataType::F16);
-        ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
-                                    && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
-                                    && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+        ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU &&
+                                    act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+                                    act !=
+                                        ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
         ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
     }
 
-    if(output != nullptr && output->total_size() != 0)
+    if (output != nullptr && output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -86,14 +92,15 @@
 
 std::pair<Status, Window> validate_and_configure_window_nchw(ITensorInfo *input, ITensorInfo *output)
 {
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->element_size(), input->dimension(0));
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(16 / input->element_size(), input->dimension(0));
 
     // Configure kernel window
     Window                 win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
     AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
 
     bool window_changed = false;
-    if(output != nullptr)
+    if (output != nullptr)
     {
         AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
         window_changed = update_window_and_padding(win, input_access, output_access);
@@ -104,30 +111,50 @@
         window_changed = update_window_and_padding(win, input_access);
     }
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
 
 CLBatchNormalizationLayerKernel::CLBatchNormalizationLayerKernel()
-    : _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _beta(nullptr), _gamma(nullptr), _epsilon(0), _run_in_place(false)
+    : _input(nullptr),
+      _output(nullptr),
+      _mean(nullptr),
+      _var(nullptr),
+      _beta(nullptr),
+      _gamma(nullptr),
+      _epsilon(0),
+      _run_in_place(false)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLBatchNormalizationLayerKernel::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma,
-                                                float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(ICLTensor          *input,
+                                                ICLTensor          *output,
+                                                const ICLTensor    *mean,
+                                                const ICLTensor    *var,
+                                                const ICLTensor    *beta,
+                                                const ICLTensor    *gamma,
+                                                float               epsilon,
+                                                ActivationLayerInfo act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
 }
 
-void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
-                                                const ICLTensor *gamma,
-                                                float epsilon, ActivationLayerInfo act_info)
+void CLBatchNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+                                                ICLTensor              *input,
+                                                ICLTensor              *output,
+                                                const ICLTensor        *mean,
+                                                const ICLTensor        *var,
+                                                const ICLTensor        *beta,
+                                                const ICLTensor        *gamma,
+                                                float                   epsilon,
+                                                ActivationLayerInfo     act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
 
-    auto padding_info = get_padding_info({ input, output, mean, var, beta, gamma });
+    auto padding_info = get_padding_info({input, output, mean, var, beta, gamma});
     _input            = input;
     _output           = output;
     _mean             = mean;
@@ -142,13 +169,15 @@
                                                   mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
                                                   (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
 
-    unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
+    unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
 
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
     build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
     build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
@@ -157,29 +186,33 @@
     build_opts.add_option_if(gamma == nullptr, "-DUSE_DEFAULT_GAMMA");
 
     // Create kernel
-    _kernel = create_kernel(compile_context, "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+    _kernel =
+        create_kernel(compile_context,
+                      "batchnormalization_layer_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+                      build_opts.options());
 
     // Set kernel static arguments
     unsigned int include_output = (!_run_in_place) ? 1 : 0;
-    unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor() + 2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
-    if(_beta != nullptr)
+    unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor() +
+                       2 * num_arguments_per_1D_tensor(); // Skip the input and output parameters
+    if (_beta != nullptr)
     {
         idx += num_arguments_per_1D_tensor(); // Skip beta parameter
     }
-    if(_gamma != nullptr)
+    if (_gamma != nullptr)
     {
         idx += num_arguments_per_1D_tensor(); // Skip gamma parameter
     }
     _kernel.setArg<cl_float>(idx++, _epsilon);
 
-    if(output != nullptr)
+    if (output != nullptr)
     {
         // Output tensor auto initialization if not yet initialized
         auto_init_if_empty(*output->info(), *input->info()->clone());
     }
 
     // Configure kernel window
-    if(input->info()->data_layout() == DataLayout::NHWC)
+    if (input->info()->data_layout() == DataLayout::NHWC)
     {
         Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
         ICLKernel::configure_internal(win);
@@ -205,18 +238,23 @@
     _config_id += lower_string(string_from_data_layout(input->info()->data_layout()));
 }
 
-Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                                 const ITensorInfo *mean, const ITensorInfo *var,
-                                                 const ITensorInfo *beta, const ITensorInfo *gamma,
-                                                 float epsilon, ActivationLayerInfo act_info)
+Status CLBatchNormalizationLayerKernel::validate(const ITensorInfo  *input,
+                                                 const ITensorInfo  *output,
+                                                 const ITensorInfo  *mean,
+                                                 const ITensorInfo  *var,
+                                                 const ITensorInfo  *beta,
+                                                 const ITensorInfo  *gamma,
+                                                 float               epsilon,
+                                                 ActivationLayerInfo act_info)
 {
     const bool run_in_place = (output == nullptr) || (output == input);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
 
-    if(input->data_layout() != DataLayout::NHWC)
+    if (input->data_layout() != DataLayout::NHWC)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
-                                    .first);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_and_configure_window_nchw(input->clone().get(), (run_in_place) ? nullptr : output->clone().get())
+                .first);
     }
 
     return Status{};
@@ -236,11 +274,11 @@
     unsigned int idx            = (1 + include_output) * num_arguments_per_3D_tensor();
     add_1D_tensor_argument(idx, _mean, vector_slice);
     add_1D_tensor_argument(idx, _var, vector_slice);
-    if(_beta != nullptr)
+    if (_beta != nullptr)
     {
         add_1D_tensor_argument(idx, _beta, vector_slice);
     }
-    if(_gamma != nullptr)
+    if (_gamma != nullptr)
     {
         add_1D_tensor_argument(idx, _gamma, vector_slice);
     }
@@ -249,11 +287,10 @@
     {
         idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
-        if(!_run_in_place)
+        if (!_run_in_place)
         {
             add_3D_tensor_argument(idx, _output, slice);
         }
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
diff --git a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
index acbe0f2..1a88d2a 100644
--- a/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLBatchNormalizationLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLBATCHNORMALIZATIONLAYERKERNEL_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -64,7 +65,13 @@
      * @param[in]      epsilon  (Optional) Small value to avoid division with zero. Default value is 0.001f.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
      */
-    void configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr, const ICLTensor *gamma = nullptr, float epsilon = 0.001f,
+    void configure(ICLTensor          *input,
+                   ICLTensor          *output,
+                   const ICLTensor    *mean,
+                   const ICLTensor    *var,
+                   const ICLTensor    *beta     = nullptr,
+                   const ICLTensor    *gamma    = nullptr,
+                   float               epsilon  = 0.001f,
                    ActivationLayerInfo act_info = ActivationLayerInfo());
     /** Set the input and output tensors.
      *
@@ -82,8 +89,15 @@
      * @param[in]      epsilon         (Optional) Small value to avoid division with zero. Default value is 0.001f.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta = nullptr,
-                   const ICLTensor *gamma = nullptr, float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+    void configure(const CLCompileContext &compile_context,
+                   ICLTensor              *input,
+                   ICLTensor              *output,
+                   const ICLTensor        *mean,
+                   const ICLTensor        *var,
+                   const ICLTensor        *beta     = nullptr,
+                   const ICLTensor        *gamma    = nullptr,
+                   float                   epsilon  = 0.001f,
+                   ActivationLayerInfo     act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CLBatchNormalizationLayerKernel
      *
      * @param[in] input    Source tensor info. In case of @p output tensor info = nullptr, this tensor will store the result.
@@ -99,10 +113,14 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const ITensorInfo *mean, const ITensorInfo *var,
-                           const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
-                           float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo  *input,
+                           const ITensorInfo  *output,
+                           const ITensorInfo  *mean,
+                           const ITensorInfo  *var,
+                           const ITensorInfo  *beta     = nullptr,
+                           const ITensorInfo  *gamma    = nullptr,
+                           float               epsilon  = 0.001f,
+                           ActivationLayerInfo act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
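
A sketch of the newly columnised configure() overload, keeping the defaulted epsilon value explicit (tensor setup assumed elsewhere; per validate_arguments in the .cpp above, only RELU, BOUNDED_RELU and LU_BOUNDED_RELU may be fused):

    using namespace arm_compute;

    CLTensor src, dst, mean, var, beta, gamma; // assumed initialised elsewhere
    CLBatchNormalizationLayerKernel bn;

    // beta/gamma may also be left as their nullptr defaults; here a BOUNDED_RELU
    // capped at 6 is fused in after the normalization.
    bn.configure(&src, &dst, &mean, &var, &beta, &gamma, 0.001f,
                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.f));
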
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
index 143a842..c640b5a 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.cpp
@@ -25,13 +25,14 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
-#include "arm_compute/core/TensorInfo.h"
 
 using namespace arm_compute::misc::shape_calculator;
 namespace arm_compute
@@ -46,7 +47,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -54,7 +55,11 @@
 
     return Status{};
 }
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status validate_arguments_static(const ITensorInfo *input,
+                                 const int          block_shape_x,
+                                 const int          block_shape_y,
+                                 const ITensorInfo *output,
+                                 const CropInfo    &crop_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
@@ -66,10 +71,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
-        const TensorInfo  expected_output       = output->clone()->set_tensor_shape(expected_output_shape);
+        const TensorShape expected_output_shape = compute_batch_to_space_shape(
+            input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
+        const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output);
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -79,8 +85,7 @@
 }
 } // namespace
 
-CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel()
-    : _input(nullptr), _block_shape(nullptr), _output(nullptr)
+CLBatchToSpaceLayerKernel::CLBatchToSpaceLayerKernel() : _input(nullptr), _block_shape(nullptr), _output(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
@@ -90,11 +95,14 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
 }
 
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          const ICLTensor        *block_shape,
+                                          ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    auto padding_info = get_padding_info({ input, block_shape, output });
+    auto padding_info = get_padding_info({input, block_shape, output});
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), output->info()));
 
@@ -106,8 +114,9 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
     build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
-    _kernel = create_kernel(compile_context, "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
-
+    _kernel = create_kernel(compile_context,
+                            "batch_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+                            build_opts.options());
 
     // Configure kernel window
     Window win = calculate_max_window(*output->info(), Steps());
@@ -116,47 +125,65 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
+void CLBatchToSpaceLayerKernel::configure(const ICLTensor *input,
+                                          const int32_t    block_shape_x,
+                                          const int32_t    block_shape_y,
+                                          ICLTensor       *output,
+                                          const CropInfo  &crop_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info);
 }
 
-void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output,
-                                          const CropInfo &crop_info)
+void CLBatchToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          const int32_t           block_shape_x,
+                                          const int32_t           block_shape_y,
+                                          ICLTensor              *output,
+                                          const CropInfo         &crop_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
+    const TensorShape output_shape = compute_batch_to_space_shape(
+        input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
 
     _input  = input;
     _output = output;
 
     // Create kernel
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
     build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(3)));
     build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
     build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
     build_opts.add_option("-DCROP_LEFT=" + support::cpp11::to_string(crop_info.left));
     build_opts.add_option("-DCROP_TOP=" + support::cpp11::to_string(crop_info.top));
-    _kernel = create_kernel(compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+    _kernel = create_kernel(
+        compile_context, "batch_to_space_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+        build_opts.options());
 
     // Configure kernel window
     Window win = calculate_max_window(*output->info(), Steps());
     ICLKernel::configure_internal(win);
 }
 
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
     return Status{};
 }
 
-Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status CLBatchToSpaceLayerKernel::validate(const ITensorInfo *input,
+                                           const int32_t      block_shape_x,
+                                           const int32_t      block_shape_y,
+                                           const ITensorInfo *output,
+                                           const CropInfo    &crop_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info));
@@ -185,7 +212,7 @@
         unsigned int idx = 0;
         add_4D_tensor_argument(idx, _input, slice_in);
         add_argument(idx, batch_id);
-        if(_block_shape != nullptr)
+        if (_block_shape != nullptr)
         {
             add_1D_tensor_argument(idx, _block_shape, vector_slice);
         }
@@ -193,7 +220,6 @@
         enqueue(queue, *this, slice_out, lws_hint());
 
         ++batch_id;
-    }
-    while(window.slide_window_slice_3D(slice_out));
+    } while (window.slide_window_slice_3D(slice_out));
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
index a05184c..b9d3e66 100644
--- a/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
+++ b/src/core/CL/kernels/CLBatchToSpaceLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLBATCHTOSPACELAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -65,7 +66,10 @@
      *
      * @deprecated This method for dynamic block shape is not fully mature and will be removed in the 23.08 release
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const ICLTensor        *block_shape,
+                   ICLTensor              *output);
     /** Initialise the kernel's inputs and output (Static block shape).
      *
      * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -74,7 +78,11 @@
      * @param[out] output        Tensor output. Data types supported: same as @p input
      * @param[in]  crop_info     Specifies how the output shape is cropped after batch to space is performed
      */
-    void configure(const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info);
+    void configure(const ICLTensor *input,
+                   const int32_t    block_shape_x,
+                   const int32_t    block_shape_y,
+                   ICLTensor       *output,
+                   const CropInfo  &crop_info);
     /** Initialise the kernel's inputs and output (Static block shape).
      *
      * @param[in]  compile_context The compile context to be used.
@@ -84,7 +92,12 @@
      * @param[out] output          Tensor output. Data types supported: same as @p input
      * @param[in]  crop_info       Specifies how the output shape is cropped after batch to space is performed
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int32_t block_shape_x, const int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const int32_t           block_shape_x,
+                   const int32_t           block_shape_y,
+                   ICLTensor              *output,
+                   const CropInfo         &crop_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLBatchToSpaceLayerKernel
      *
      * @param[in] input       Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -106,7 +119,11 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const int32_t block_shape_x, const int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info);
+    static Status validate(const ITensorInfo *input,
+                           const int32_t      block_shape_x,
+                           const int32_t      block_shape_y,
+                           const ITensorInfo *output,
+                           const CropInfo    &crop_info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
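
A sketch of the static-block-shape overload, which is the non-deprecated path (tensors assumed initialised elsewhere; a default-constructed CropInfo means no cropping):

    using namespace arm_compute;

    CLTensor input, output; // rank-4 tensors, assumed initialised elsewhere
    CLBatchToSpaceLayerKernel b2s;

    // Redistributes the batch dimension into 2x2 spatial blocks, then crops per crop_info.
    b2s.configure(&input, /*block_shape_x=*/2, /*block_shape_y=*/2, &output, CropInfo{});
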
diff --git a/src/core/CL/kernels/CLBitwiseKernel.cpp b/src/core/CL/kernels/CLBitwiseKernel.cpp
index 11e6d02..de3fb43 100644
--- a/src/core/CL/kernels/CLBitwiseKernel.cpp
+++ b/src/core/CL/kernels/CLBitwiseKernel.cpp
@@ -28,25 +28,29 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
 {
-CLBitwiseKernel::CLBitwiseKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+CLBitwiseKernel::CLBitwiseKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLBitwiseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op)
+void CLBitwiseKernel::configure(const CLCompileContext &compile_context,
+                                const ICLTensor        *input1,
+                                const ICLTensor        *input2,
+                                ICLTensor              *output,
+                                BitwiseOperation        op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
-    if(op != BitwiseOperation::NOT)
+    if (op != BitwiseOperation::NOT)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(input2);
         ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8);
@@ -56,7 +60,7 @@
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*(output->info()), *(input1->info()));
-    auto padding_info = get_padding_info({ input1, input2, output });
+    auto padding_info = get_padding_info({input1, input2, output});
 
     // Configure kernel window
     const unsigned int vec_size_x = adjust_vec_size(16 / output->info()->element_size(), output->info()->dimension(0));
@@ -68,7 +72,7 @@
 
     // Create kernel
     std::string kernel_name = "";
-    switch(op)
+    switch (op)
     {
         case BitwiseOperation::AND:
             kernel_name = "bitwise_and";
@@ -107,13 +111,12 @@
     {
         unsigned int idx = 0;
         add_2D_tensor_argument(idx, _input1, slice);
-        if(_input2 != nullptr)
+        if (_input2 != nullptr)
         {
             add_2D_tensor_argument(idx, _input2, slice);
         }
         add_2D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
+    } while (window.slide_window_slice_2D(slice));
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLBitwiseKernel.h b/src/core/CL/kernels/CLBitwiseKernel.h
index c5a9996..2c74955 100644
--- a/src/core/CL/kernels/CLBitwiseKernel.h
+++ b/src/core/CL/kernels/CLBitwiseKernel.h
@@ -59,7 +59,11 @@
      * @param[out] output          Destination tensor. Data types supported: U8.
      * @param[in]  op              Bitwise operation to perform. Supported: AND, OR, NOT, XOR.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, BitwiseOperation op);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input1,
+                   const ICLTensor        *input2,
+                   ICLTensor              *output,
+                   BitwiseOperation        op);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
index 72de854..f32c518 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -40,7 +41,10 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status validate_arguments(const ITensorInfo              *boxes,
+                          const ITensorInfo              *pred_boxes,
+                          const ITensorInfo              *deltas,
+                          const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(boxes);
@@ -53,7 +57,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
 
     const bool is_qasymm16 = boxes->data_type() == DataType::QASYMM16;
-    if(is_qasymm16)
+    if (is_qasymm16)
     {
         const UniformQuantizationInfo boxes_qinfo = boxes->quantization_info().uniform();
         ARM_COMPUTE_RETURN_ERROR_ON(boxes_qinfo.scale != 0.125f);
@@ -65,12 +69,12 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas);
     }
 
-    if(pred_boxes->total_size() > 0)
+    if (pred_boxes->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, boxes);
         ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
-        if(is_qasymm16)
+        if (is_qasymm16)
         {
             const UniformQuantizationInfo pred_boxes_qinfo = pred_boxes->quantization_info().uniform();
             ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes_qinfo.scale != 0.125f);
@@ -83,22 +87,31 @@
 }
 } // namespace
 
-CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel()
-    : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
+CLBoundingBoxTransformKernel::CLBoundingBoxTransformKernel() : _boxes(nullptr), _pred_boxes(nullptr), _deltas(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLBoundingBoxTransformKernel::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const ICLTensor                *boxes,
+                                             ICLTensor                      *pred_boxes,
+                                             const ICLTensor                *deltas,
+                                             const BoundingBoxTransformInfo &info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
 }
 
-void CLBoundingBoxTransformKernel::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransformKernel::configure(const CLCompileContext         &compile_context,
+                                             const ICLTensor                *boxes,
+                                             ICLTensor                      *pred_boxes,
+                                             const ICLTensor                *deltas,
+                                             const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
-    auto padding_info = get_padding_info({ boxes, pred_boxes, deltas });
-    auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info()));
+    auto padding_info = get_padding_info({boxes, pred_boxes, deltas});
+    auto_init_if_empty(*pred_boxes->info(), deltas->info()
+                                                ->clone()
+                                                ->set_data_type(boxes->info()->data_type())
+                                                .set_quantization_info(boxes->info()->quantization_info()));
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
 
@@ -128,7 +141,7 @@
     build_opts.add_option_if(info.apply_scale(), "-DSCALE_AFTER=" + float_to_string_with_full_precision(info.scale()));
     build_opts.add_option_if(info.correct_transform_coords(), "-DOFFSET=1");
 
-    if(is_quantized)
+    if (is_quantized)
     {
         build_opts.add_option("-DDATA_TYPE_DELTAS=" + get_cl_type_from_data_type(deltas->info()->data_type()));
         const UniformQuantizationInfo boxes_qinfo      = boxes->info()->quantization_info().uniform();
@@ -148,12 +161,15 @@
 
     // Since the number of columns is a multiple of 4 by definition, we don't need to pad the tensor
     const unsigned int num_elems_processed_per_iteration = 4;
-    Window             win                               = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
+    Window             win = calculate_max_window(*deltas->info(), Steps(num_elems_processed_per_iteration));
     ICLKernel::configure_internal(win);
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status CLBoundingBoxTransformKernel::validate(const ITensorInfo              *boxes,
+                                              const ITensorInfo              *pred_boxes,
+                                              const ITensorInfo              *deltas,
+                                              const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
     return Status{};
diff --git a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
index 08f350e..9a1bb49 100644
--- a/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
+++ b/src/core/CL/kernels/CLBoundingBoxTransformKernel.h
@@ -58,7 +58,10 @@
      * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
      *
      */
-    void configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+    void configure(const ICLTensor                *boxes,
+                   ICLTensor                      *pred_boxes,
+                   const ICLTensor                *deltas,
+                   const BoundingBoxTransformInfo &info);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -71,7 +74,11 @@
      * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
      *
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info);
+    void configure(const CLCompileContext         &compile_context,
+                   const ICLTensor                *boxes,
+                   ICLTensor                      *pred_boxes,
+                   const ICLTensor                *deltas,
+                   const BoundingBoxTransformInfo &info);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLBoundingBoxTransform
      *
@@ -85,7 +92,10 @@
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+    static Status validate(const ITensorInfo              *boxes,
+                           const ITensorInfo              *pred_boxes,
+                           const ITensorInfo              *deltas,
+                           const BoundingBoxTransformInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
index a2a0bc4..ec58bf9 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -46,15 +47,19 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
 
-    const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+    const unsigned int channels =
+        input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        num_groups == channels,
+        "Channel shuffling with same number of groups as number of channels would be inefficient");
     // There cannot be more groups than channels
     ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0,
+                                    "The number of channels must be a multiple of the number of groups");
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -70,11 +75,12 @@
     auto_init_if_empty(*output, *input->clone());
 
     const bool is_nhwc = input->data_layout() == DataLayout::NHWC;
-    if(is_nhwc)
+    if (is_nhwc)
     {
-        unsigned int num_elems_processed_per_iteration_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
-        Window       win                                 = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x));
-        Window       win_collapsed                       = win.collapse(win, Window::DimZ);
+        unsigned int num_elems_processed_per_iteration_x =
+            adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+        Window win           = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x));
+        Window win_collapsed = win.collapse(win, Window::DimZ);
         return std::make_pair(Status{}, win_collapsed);
     }
     else
@@ -83,22 +89,25 @@
         constexpr unsigned int num_elems_processed_per_iteration_y = 2;
 
         // Configure kernel window
-        Window                win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-        AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
-        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+        Window win = calculate_max_window(
+            *input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+        AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration_x,
+                                           num_elems_processed_per_iteration_y);
+        AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x,
+                                            num_elems_processed_per_iteration_y);
 
         const bool window_changed = update_window_and_padding(win, input_access, output_access);
 
         Window win_collapsed = win.collapse(win, Window::DimZ);
 
-        Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+        Status err =
+            (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
         return std::make_pair(err, win_collapsed);
     }
 }
 } // namespace
 
-CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel()
-    : _input(nullptr), _output(nullptr)
+CLChannelShuffleLayerKernel::CLChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
@@ -108,23 +117,27 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
 }
 
-void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+void CLChannelShuffleLayerKernel::configure(const CLCompileContext &compile_context,
+                                            const ICLTensor        *input,
+                                            ICLTensor              *output,
+                                            unsigned int            num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), num_groups));
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input  = input;
     _output = output;
 
-    const DataLayout   data_layout          = input->info()->data_layout();
-    const bool         is_nhwc              = data_layout == DataLayout::NHWC;
-    const unsigned int channels             = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
-    unsigned int       vec_size_x           = 0;
-    unsigned int       vec_size_x_leftovers = 0;
-    if(is_nhwc)
+    const DataLayout   data_layout = input->info()->data_layout();
+    const bool         is_nhwc     = data_layout == DataLayout::NHWC;
+    const unsigned int channels =
+        input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+    unsigned int vec_size_x           = 0;
+    unsigned int vec_size_x_leftovers = 0;
+    if (is_nhwc)
     {
-        vec_size_x           = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+        vec_size_x = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
         vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
     }
     else
@@ -170,13 +183,14 @@
     _config_id += support::cpp11::to_string(output->info()->dimension(1));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(2));
-    if(data_layout == DataLayout::NHWC)
+    if (data_layout == DataLayout::NHWC)
     {
         ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
     }
 }
 
-Status CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+Status
+CLChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
diff --git a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
index 31c007f..43c939e 100644
--- a/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
+++ b/src/core/CL/kernels/CLChannelShuffleLayerKernel.h
@@ -60,7 +60,10 @@
      * @param[out] output          Output tensor. Data type supported: Same as @p input
      * @param[in]  num_groups      Number of groups. Must be greater than 1 and the number of channels of the tensors must be a multiple of the number of groups.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   unsigned int            num_groups);
     /** Static function to check if given info will lead to a valid configuration of @ref CLChannelShuffleLayerKernel
      *
      * @param[in] input      Input tensor info. Data types supported: All.
diff --git a/src/core/CL/kernels/CLComparisonKernel.cpp b/src/core/CL/kernels/CLComparisonKernel.cpp
index f4d6316..f272707 100644
--- a/src/core/CL/kernels/CLComparisonKernel.cpp
+++ b/src/core/CL/kernels/CLComparisonKernel.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -38,14 +39,10 @@
 namespace
 {
 // Create supported comparisons map
-const std::map<ComparisonOperation, std::string> supported_comparison_ops =
-{
-    { ComparisonOperation::Equal, "EQUAL" },
-    { ComparisonOperation::NotEqual, "NOTEQUAL" },
-    { ComparisonOperation::Greater, "GREATER" },
-    { ComparisonOperation::GreaterEqual, "GREATEREQUAL" },
-    { ComparisonOperation::Less, "LESS" },
-    { ComparisonOperation::LessEqual, "LESSEQUAL" },
+const std::map<ComparisonOperation, std::string> supported_comparison_ops = {
+    {ComparisonOperation::Equal, "EQUAL"},     {ComparisonOperation::NotEqual, "NOTEQUAL"},
+    {ComparisonOperation::Greater, "GREATER"}, {ComparisonOperation::GreaterEqual, "GREATEREQUAL"},
+    {ComparisonOperation::Less, "LESS"},       {ComparisonOperation::LessEqual, "LESSEQUAL"},
 };
 
 int calculate_num_elems_processed_per_iteration(const ITensorInfo &input)
@@ -53,7 +50,10 @@
     return 16 / input.element_size();
 }
 
-Status validate_arguments(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output, ComparisonOperation operation)
+Status validate_arguments(const ITensorInfo  &input1,
+                          const ITensorInfo  &input2,
+                          const ITensorInfo  &output,
+                          ComparisonOperation operation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&input1);
     ARM_COMPUTE_RETURN_ERROR_ON(input1.data_type() == DataType::UNKNOWN);
@@ -64,7 +64,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
     // Validate in case of configured output
-    if(output.total_size() > 0)
+    if (output.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&output, 1, DataType::U8);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, output.tensor_shape(), 0),
@@ -76,7 +76,7 @@
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
 {
-    const TensorShape &out_shape                         = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
+    const TensorShape &out_shape = TensorShape::broadcast_shape(input1.tensor_shape(), input2.tensor_shape());
     const unsigned int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(input1);
 
     // Auto initialize output if not initialized
@@ -90,27 +90,34 @@
     AccessWindowHorizontal input2_access(&input2, 0, num_elems_processed_per_iteration);
     AccessWindowHorizontal output_access(&output, 0, num_elems_processed_per_iteration);
 
-    bool window_changed = update_window_and_padding(win_input1, input1_access)
-                          || update_window_and_padding(win_input2, input2_access)
-                          || update_window_and_padding(win, output_access);
+    bool window_changed = update_window_and_padding(win_input1, input1_access) ||
+                          update_window_and_padding(win_input2, input2_access) ||
+                          update_window_and_padding(win, output_access);
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
 
-CLComparisonKernel::CLComparisonKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+CLComparisonKernel::CLComparisonKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLComparisonKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const ICLTensor    *input1,
+                                   const ICLTensor    *input2,
+                                   ICLTensor          *output,
+                                   ComparisonOperation operation)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
 }
 
-void CLComparisonKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparisonKernel::configure(const CLCompileContext &compile_context,
+                                   const ICLTensor        *input1,
+                                   const ICLTensor        *input2,
+                                   ICLTensor              *output,
+                                   ComparisonOperation     operation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*input1->info(), *input2->info(), *output->info(), operation));
@@ -129,10 +136,11 @@
     // Set kernel build options
     std::set<std::string> build_opts;
     build_opts.emplace("-DDATA_TYPE=" + get_cl_type_from_data_type(input1->info()->data_type()));
-    build_opts.emplace("-DVEC_SIZE=" + support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info())));
+    build_opts.emplace("-DVEC_SIZE=" +
+                       support::cpp11::to_string(calculate_num_elems_processed_per_iteration(*input1->info())));
     build_opts.emplace("-DOP=" + operation_name);
     build_opts.emplace("-DOP_NAME=" + lower_string(operation_name));
-    if(is_data_type_quantized(input1->info()->data_type()))
+    if (is_data_type_quantized(input1->info()->data_type()))
     {
         const UniformQuantizationInfo iq1_info = input1->info()->quantization_info().uniform();
         const UniformQuantizationInfo iq2_info = input2->info()->quantization_info().uniform();
@@ -160,12 +168,16 @@
     _config_id += lower_string(string_from_data_layout(input1->info()->data_layout()));
 }
 
-Status CLComparisonKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparisonKernel::validate(const ITensorInfo  *input1,
+                                    const ITensorInfo  *input2,
+                                    const ITensorInfo  *output,
+                                    ComparisonOperation operation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
 
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*input1, *input2, *output, operation));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(*input1->clone(), *input2->clone(), *output->clone()).first);
 
     return Status{};
 }
@@ -181,17 +193,18 @@
 
     bool       can_collapse = true;
     const bool is_vector    = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
-    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
+    if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
     {
         can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+        for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
         {
             can_collapse = (in_shape1[d] == in_shape2[d]);
         }
     }
 
     bool   has_collapsed = false;
-    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+    Window collapsed =
+        can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
 
     const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
     const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -212,16 +225,16 @@
 
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 
 BorderSize CLComparisonKernel::border_size() const
 {
     const int num_elems_processed_per_iteration = calculate_num_elems_processed_per_iteration(*_input1->info());
 
-    const unsigned int replicateSize = _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
-    const unsigned int border        = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
-    return BorderSize{ 0, border, 0, 0 };
+    const unsigned int replicateSize =
+        _output->info()->dimension(0) - std::min(_input1->info()->dimension(0), _input2->info()->dimension(0));
+    const unsigned int border = std::min<unsigned int>(num_elems_processed_per_iteration - 1U, replicateSize);
+    return BorderSize{0, border, 0, 0};
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLComparisonKernel.h b/src/core/CL/kernels/CLComparisonKernel.h
index 0b94190..174a6c9 100644
--- a/src/core/CL/kernels/CLComparisonKernel.h
+++ b/src/core/CL/kernels/CLComparisonKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLCOMPARISONKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -64,7 +65,11 @@
      * @param[out] output          Destination tensor. Data types supported: U8.
      * @param[in]  operation       Comparison operation to use.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ComparisonOperation operation);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input1,
+                   const ICLTensor        *input2,
+                   ICLTensor              *output,
+                   ComparisonOperation     operation);
     /** Static function to check if given info will lead to a valid configuration of @ref CLComparisonKernel
      *
      * @param[in] input1    Source tensor. Data types supported: All.
@@ -74,10 +79,13 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation);
+    static Status validate(const ITensorInfo  *input1,
+                           const ITensorInfo  *input2,
+                           const ITensorInfo  *output,
+                           ComparisonOperation operation);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
+    void       run(const Window &window, cl::CommandQueue &queue) override;
     BorderSize border_size() const override;
 
 private:
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
index 76af5d5..f8ecc4c 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -40,7 +41,8 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
+Status CLDeconvolutionLayerUpsampleKernel::validate(const ITensorInfo   *input,
+                                                    const ITensorInfo   *output,
                                                     const PadStrideInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
@@ -60,7 +62,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(idx_h) == 0);
 
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_c) != output->dimension(idx_c));
-    for(size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
+    for (size_t i = 3; i < Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(i) != output->dimension(i));
     }
@@ -68,20 +70,21 @@
     return Status{};
 }
 
-void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output,
-                                                   const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
 }
 
-void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
-                                                   const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsampleKernel::configure(const CLCompileContext &compile_context,
+                                                   const ICLTensor        *input,
+                                                   ICLTensor              *output,
+                                                   const PadStrideInfo    &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(CLDeconvolutionLayerUpsampleKernel::validate(input->info(), output->info(), info));
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input       = input;
     _output      = output;
@@ -119,7 +122,7 @@
     const int out_end_y   = _output->info()->dimension(idx_h) - _info.pad_bottom() + _info.stride().second - 1;
     const int out_step_y  = _info.stride().second;
 
-    switch(_data_layout)
+    switch (_data_layout)
     {
         case DataLayout::NCHW:
         {
@@ -137,8 +140,7 @@
                 add_3D_tensor_argument(idx, _input, slice_in);
                 add_3D_tensor_argument(idx, _output, slice_out);
                 enqueue(queue, *this, slice_out, lws_hint());
-            }
-            while(collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
+            } while (collapsed.slide_window_slice_3D(slice_in) && collapsed.slide_window_slice_3D(slice_out));
             break;
         }
         case DataLayout::NHWC:
@@ -156,8 +158,7 @@
                 add_3D_tensor_argument(idx, _input, slice_in);
                 add_3D_tensor_argument(idx, _output, slice_out);
                 enqueue(queue, *this, slice_out, lws_hint());
-            }
-            while(window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
+            } while (window.slide_window_slice_3D(slice_in) && window.slide_window_slice_3D(slice_out));
             break;
         }
         default:
diff --git a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
index e0d1322..762989a 100644
--- a/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
+++ b/src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h
@@ -62,7 +62,10 @@
      * @param[out] output          Destination tensor. Data types supported: same as @p input. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]  info            Contains padding and stride information described in @ref PadStrideInfo.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PadStrideInfo &info);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   const PadStrideInfo    &info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDeconvolutionLayerUpsample
      *
      * @param[in] input  Source tensor info. Data types supported: All.
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
index 0fc0ff8..b33e0a8 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.cpp
@@ -27,9 +27,10 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -38,7 +39,11 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status validate_arguments(const ITensorInfo   *input,
+                          const ITensorInfo   *bias,
+                          const ITensorInfo   *output,
+                          const ITensorInfo   *input_info,
+                          const ITensorInfo   *weights_info,
                           const PadStrideInfo &deconv_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
@@ -53,19 +58,21 @@
     ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_w) != deconv_info.stride().first);
     ARM_COMPUTE_RETURN_ERROR_ON(weights_info->dimension(idx_h) != deconv_info.stride().second);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32);
-    if(!is_qasymm)
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S32);
+    if (!is_qasymm)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_info, weights_info);
     }
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) * weights_info->dimension(idx_b));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != weights_info->dimension(idx_w) * weights_info->dimension(idx_h) *
+                                                           weights_info->dimension(idx_b));
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != input_info->dimension(idx_w));
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != input_info->dimension(idx_h));
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(3) != input_info->dimension(idx_b));
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
-        if(is_qasymm)
+        if (is_qasymm)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         }
@@ -76,19 +83,26 @@
         ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights_info->dimension(idx_b));
     }
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
-        auto                out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+        auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+                                                        weights_info->dimension(idx_w), weights_info->dimension(idx_h),
+                                                        stride_info);
 
-        const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+        const TensorShape output_shape =
+            misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
     }
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo   *input,
+                                                        ITensorInfo         *output,
+                                                        const ITensorInfo   *input_info,
+                                                        const ITensorInfo   *weights_info,
+                                                        const PadStrideInfo &deconv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -97,11 +111,17 @@
     const size_t        idx_h       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
     const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
 
-    auto out_dims = deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h), weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
+    auto out_dims =
+        deconvolution_output_dimensions(input_info->dimension(idx_w), input_info->dimension(idx_h),
+                                        weights_info->dimension(idx_w), weights_info->dimension(idx_h), stride_info);
 
-    const TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
+    const TensorShape output_shape =
+        misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input_info, *weights_info);
 
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout).set_quantization_info(input->quantization_info()));
+    auto_init_if_empty(*output, input->clone()
+                                    ->set_tensor_shape(output_shape)
+                                    .set_data_layout(data_layout)
+                                    .set_quantization_info(input->quantization_info()));
 
     Window win = calculate_max_window(*input);
 
@@ -109,29 +129,37 @@
 }
 } // namespace
 
-CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel()
-    : _add_bias(false),
-      _bias(nullptr)
+CLDeconvolutionReshapeOutputKernel::CLDeconvolutionReshapeOutputKernel() : _add_bias(false), _bias(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+void CLDeconvolutionReshapeOutputKernel::configure(const ICLTensor     *input,
+                                                   const ICLTensor     *bias,
+                                                   ICLTensor           *output,
+                                                   const ITensorInfo   *input_info,
+                                                   const ITensorInfo   *weights_info,
                                                    const PadStrideInfo &deconv_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, input_info, weights_info, deconv_info);
 }
 
-void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info,
-                                                   const ITensorInfo   *weights_info,
-                                                   const PadStrideInfo &deconv_info)
+void CLDeconvolutionReshapeOutputKernel::configure(const CLCompileContext &compile_context,
+                                                   const ICLTensor        *input,
+                                                   const ICLTensor        *bias,
+                                                   ICLTensor              *output,
+                                                   const ITensorInfo      *input_info,
+                                                   const ITensorInfo      *weights_info,
+                                                   const PadStrideInfo    &deconv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, input_info, weights_info);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), input_info, weights_info, deconv_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr ? bias->info() : nullptr),
+                                                  output->info(), input_info, weights_info, deconv_info));
 
-    auto padding_info = get_padding_info({ input, bias, output });
+    auto padding_info = get_padding_info({input, bias, output});
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
+    auto win_config =
+        validate_and_configure_window(input->info(), output->info(), input_info, weights_info, deconv_info);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     const DataLayout data_layout = input_info->data_layout();
@@ -178,7 +206,11 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
+Status CLDeconvolutionReshapeOutputKernel::validate(const ITensorInfo   *input,
+                                                    const ITensorInfo   *bias,
+                                                    const ITensorInfo   *output,
+                                                    const ITensorInfo   *input_info,
+                                                    const ITensorInfo   *weights_info,
                                                     const PadStrideInfo &deconv_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, input_info, weights_info, deconv_info));
@@ -194,7 +226,7 @@
     unsigned int idx = 0;
     add_3D_tensor_argument(idx, _input, collapsed);
     add_3D_tensor_argument(idx, _output, collapsed);
-    if(_add_bias)
+    if (_add_bias)
     {
         add_1D_tensor_argument(idx, _bias, collapsed);
     }
diff --git a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
index ce354fa..8f436b0 100644
--- a/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
+++ b/src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h
@@ -67,7 +67,12 @@
      * @param[in]  weights_info Deconvolution weights tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
      * @param[in]  deconv_info  Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
      */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+    void configure(const ICLTensor     *input,
+                   const ICLTensor     *bias,
+                   ICLTensor           *output,
+                   const ITensorInfo   *input_info,
+                   const ITensorInfo   *weights_info,
+                   const PadStrideInfo &deconv_info);
     /** Initialise the kernel's source and destination.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -79,8 +84,13 @@
      * @param[in]  weights_info    Deconvolution weights tensor info. Supported data types: same as @p input.  Supported data layouts: same as @p input.
      * @param[in]  deconv_info     Contains padding and policies to be used in the deconvolution, this is described in @ref PadStrideInfo. This kernel supports only stride_x = weights.width && stride_y = weights.height. Moreover, padding is not supported.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const ITensorInfo *input_info, const ITensorInfo *weights_info,
-                   const PadStrideInfo &deconv_info);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const ICLTensor        *bias,
+                   ICLTensor              *output,
+                   const ITensorInfo      *input_info,
+                   const ITensorInfo      *weights_info,
+                   const PadStrideInfo    &deconv_info);
 
     /** Static function to check if given info will lead to a valid configuration of @ref  CLDeconvolutionReshapeOutputKernel.
      *
@@ -93,7 +103,12 @@
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const ITensorInfo *input_info, const ITensorInfo *weights_info, const PadStrideInfo &deconv_info);
+    static Status validate(const ITensorInfo   *input,
+                           const ITensorInfo   *bias,
+                           const ITensorInfo   *output,
+                           const ITensorInfo   *input_info,
+                           const ITensorInfo   *weights_info,
+                           const PadStrideInfo &deconv_info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
index 5c1dc4f..cdf19ab 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -49,12 +50,14 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0);
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
         const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width]));
-        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height]));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+                                    (block_shape * input->tensor_shape()[idx_width]));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+                                    (block_shape * input->tensor_shape()[idx_height]));
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -63,8 +66,7 @@
 }
 } // namespace
 
-CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel()
-    : _input(nullptr), _output(nullptr), _block_shape()
+CLDepthToSpaceLayerKernel::CLDepthToSpaceLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
 {
     _type = CLKernelType::ELEMENTWISE;
 }
@@ -74,14 +76,18 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
 }
 
-void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLDepthToSpaceLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          ICLTensor              *output,
+                                          int32_t                 block_shape)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+    TensorShape output_shape =
+        compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
 
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), block_shape));
 
@@ -98,7 +104,9 @@
     build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(input->info()->dimension(idx_channel)));
     build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
     build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
-    _kernel = create_kernel(compile_context, "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+    _kernel = create_kernel(compile_context,
+                            "depth_to_space_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+                            build_opts.options());
 
     // Configure kernel window
     Window win = calculate_max_window(*input->info(), Steps());
@@ -137,7 +145,6 @@
         enqueue(queue, *this, slice_in, lws_hint());
 
         ++batch_id;
-    }
-    while(window.slide_window_slice_3D(slice_in));
+    } while (window.slide_window_slice_3D(slice_in));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
index 1f7f77b..cef70c4 100644
--- a/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
+++ b/src/core/CL/kernels/CLDepthToSpaceLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLDEPTHTOSPACELAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -61,7 +62,8 @@
      * @param[out] output          Tensor output. Data types supported: same as @p input
      * @param[in]  block_shape     Block shape value.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+    void
+    configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
     /** Static function to check if given info will lead to a valid configuration of @ref CLDepthToSpaceLayerKernel.
      *
      * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
index e34b692..b95abe7 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.cpp
@@ -23,16 +23,17 @@
  */
 #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLUtils.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/CL/ICLKernel.h"
@@ -45,12 +46,18 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info,
-                          const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status validate_arguments(const ITensorInfo          *input,
+                          const ITensorInfo          *weights,
+                          const ITensorInfo          *biases,
+                          const ITensorInfo          *output,
+                          const DWCComputeKernelInfo &dwc_info,
+                          const ConvolutionInfo      &conv_info,
+                          const ITensorInfo          *output_multipliers,
+                          const ITensorInfo          *output_shifts)
 {
     ARM_COMPUTE_UNUSED(dwc_info);
     bool in_place = false;
-    if(output == nullptr || output == input)
+    if (output == nullptr || output == input)
     {
         in_place = true;
         output   = input;
@@ -58,11 +65,14 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && dwc_info.m0 != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && dwc_info.m0 != 1);
     ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_input_to_cl_image == true));
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) && (export_to_cl_image(weights) == false), "Weights cannot be exported to cl_image!");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((dwc_info.export_weights_to_cl_image == true) &&
+                                        (export_to_cl_image(weights) == false),
+                                    "Weights cannot be exported to cl_image!");
     ARM_COMPUTE_RETURN_ERROR_ON((dwc_info.export_weights_to_cl_image == true) && ((dwc_info.n0 % 4) != 0));
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().second < 1);
@@ -72,33 +82,40 @@
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_c) != (input->dimension(idx_c) * conv_info.depth_multiplier));
 
     // In place restrictions
-    if(in_place)
+    if (in_place)
     {
-        const int weights_width_idx  = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
-        const int weights_height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
-        ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U || weights->tensor_shape()[weights_height_idx] != 1U);
+        const int weights_width_idx =
+            get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+        const int weights_height_idx =
+            get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+        ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[weights_width_idx] != 1U ||
+                                    weights->tensor_shape()[weights_height_idx] != 1U);
         ARM_COMPUTE_RETURN_ERROR_ON(conv_info.depth_multiplier != 1U);
         ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride() != std::make_pair(1U, 1U));
         ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size2D(1U, 1U));
-        ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.has_padding()); // Note that in princple padding can be supported with in_place but we choose not to support it
+        ARM_COMPUTE_RETURN_ERROR_ON(
+            conv_info.pad_stride_info
+                .has_padding()); // Note that in princple padding can be supported with in_place but we choose not to support it
     }
 
-    const ConvolutionInfo info{ conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(), conv_info.dilation };
-    const TensorShape     output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
+    const ConvolutionInfo info{conv_info.pad_stride_info, conv_info.depth_multiplier, ActivationLayerInfo(),
+                               conv_info.dilation};
+    const TensorShape     output_shape =
+        arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info);
 
-    if(conv_info.depth_multiplier > 1 && dwc_info.n0 > 1)
+    if (conv_info.depth_multiplier > 1 && dwc_info.n0 > 1)
     {
         ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % dwc_info.n0) != 0);
     }
 
     const bool is_quantized = is_data_type_quantized(input->data_type());
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != output_shape[idx_c]);
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
 
-        if(is_quantized)
+        if (is_quantized)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
@@ -108,7 +125,7 @@
         }
     }
 
-    if(is_quantized)
+    if (is_quantized)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output_multipliers, output_shifts);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
@@ -116,7 +133,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
 
-        if(is_data_type_quantized_per_channel(weights->data_type()))
+        if (is_data_type_quantized_per_channel(weights->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
             ARM_COMPUTE_RETURN_ERROR_ON(output_shape[idx_c] != output_multipliers->dimension(0));
@@ -134,22 +151,24 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     }
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
-    if(is_data_type_quantized(input->data_type()))
+    if (is_data_type_quantized(input->data_type()))
     {
         const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
         const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
-        const UniformQuantizationInfo oq_info = (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
+        const UniformQuantizationInfo oq_info =
+            (output->total_size() != 0) ? output->quantization_info().uniform() : iq_info;
 
         float multiplier        = iq_info.scale * wq_info.scale / oq_info.scale;
         int   output_multiplier = 0;
         int   output_shift      = 0;
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
     }
 
     return Status{};
@@ -171,30 +190,48 @@
     _type = CLKernelType::DEPTHWISE;
 }
 
-void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                                        const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info,
-                                                        const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(ICLTensor                  *input,
+                                                        const ICLTensor            *weights,
+                                                        const ICLTensor            *biases,
+                                                        ICLTensor                  *output,
+                                                        const DWCComputeKernelInfo &dwc_info,
+                                                        const ConvolutionInfo      &conv_info,
+                                                        const ICLTensor            *output_multipliers,
+                                                        const ICLTensor            *output_shifts)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, dwc_info, conv_info,
+              output_multipliers, output_shifts);
 }
 
-void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                                        const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info,
-                                                        const ICLTensor *output_multipliers, const ICLTensor *output_shifts)
+void CLDepthwiseConvolutionLayerNativeKernel::configure(const CLCompileContext     &compile_context,
+                                                        ICLTensor                  *input,
+                                                        const ICLTensor            *weights,
+                                                        const ICLTensor            *biases,
+                                                        ICLTensor                  *output,
+                                                        const DWCComputeKernelInfo &dwc_info,
+                                                        const ConvolutionInfo      &conv_info,
+                                                        const ICLTensor            *output_multipliers,
+                                                        const ICLTensor            *output_shifts)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    if(output == nullptr)
+    if (output == nullptr)
     {
         // In-place
         output = input;
     }
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
-                                                  dwc_info, conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr, (output_shifts != nullptr) ? output_shifts->info() : nullptr));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+        input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), dwc_info,
+        conv_info, (output_multipliers != nullptr) ? output_multipliers->info() : nullptr,
+        (output_shifts != nullptr) ? output_shifts->info() : nullptr));
 
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(*(input->info()), *(weights->info()), conv_info);
-    auto_init_if_empty(*(output->info()), input->info()->clone()->set_tensor_shape(output_shape).set_quantization_info(output->info()->quantization_info()));
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape(
+        *(input->info()), *(weights->info()), conv_info);
+    auto_init_if_empty(*(output->info()), input->info()
+                                              ->clone()
+                                              ->set_tensor_shape(output_shape)
+                                              .set_quantization_info(output->info()->quantization_info()));
 
     _input                      = input;
     _output                     = output;
@@ -214,12 +251,12 @@
     CLBuildOptions build_opts;
 
     // Update the padding for the input/weights tensor if we can export to cl_image
-    if(_export_input_to_cl_image)
+    if (_export_input_to_cl_image)
     {
         arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(input->info());
     }
 
-    if(_export_weights_to_cl_image)
+    if (_export_weights_to_cl_image)
     {
         arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(weights->info());
     }
@@ -229,9 +266,10 @@
     const auto      act_function  = conv_info.act_info.activation();
     const auto      dst_data_type = _output->info()->data_type();
 
-    if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
-       && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-       && (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
+    if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+        (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+         act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+        (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
     {
         // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
         // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
@@ -268,23 +306,24 @@
     build_opts.add_option("-DN0=" + support::cpp11::to_string(n0));
     build_opts.add_option("-DM0=" + support::cpp11::to_string(m0));
     build_opts.add_option("-DM0_A=" + support::cpp11::to_string(_weights->info()->dimension(1) + m0 - 1));
-    build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1", "-DN0_A=" + support::cpp11::to_string(n0));
+    build_opts.add_option_if_else(conv_info.depth_multiplier > 1, "-DN0_A=1",
+                                  "-DN0_A=" + support::cpp11::to_string(n0));
     build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(_output->info()->dimension(0) % n0));
     build_opts.add_option_if(_input->info()->num_dimensions() > 3, "-DBATCHED_EXECUTION");
 
     // Force unroll with pragma when any of the following values exceed the maximum number of manual unroll
-    set_unroll_with_pragma(build_opts, { static_cast<int>(_weights->info()->dimension(1) + m0 - 1),
-                                         static_cast<int>(_weights->info()->dimension(1)),
-                                         static_cast<int>(_weights->info()->dimension(2))
-                                       });
+    set_unroll_with_pragma(build_opts, {static_cast<int>(_weights->info()->dimension(1) + m0 - 1),
+                                        static_cast<int>(_weights->info()->dimension(1)),
+                                        static_cast<int>(_weights->info()->dimension(2))});
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         build_opts.add_option(std::string("-DHAS_BIAS"));
-        build_opts.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type())));
+        build_opts.add_option(
+            std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->info()->data_type())));
     }
 
-    if(_is_quantized)
+    if (_is_quantized)
     {
         kernel_name                          = "dwc_native_quantized_nhwc";
         const UniformQuantizationInfo iqinfo = input->info()->quantization_info().uniform();
@@ -306,13 +345,17 @@
         build_opts.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
         build_opts.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
         build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
-        build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" + get_cl_type_from_data_type(_output_multipliers->info()->data_type()));
-        build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" + get_cl_type_from_data_type(_output_shifts->info()->data_type()));
-        build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL, "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR");
+        build_opts.add_option("-DDST_MULTIPLIERS_DATA_TYPE=" +
+                              get_cl_type_from_data_type(_output_multipliers->info()->data_type()));
+        build_opts.add_option("-DDST_SHIFTS_DATA_TYPE=" +
+                              get_cl_type_from_data_type(_output_shifts->info()->data_type()));
+        build_opts.add_option_if_else(weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL,
+                                      "-DQUANTIZATION_TYPE=PER_CHANNEL", "-DQUANTIZATION_TYPE=PER_TENSOR");
         // Note: We expect the input and output tensors to always adopt a per-tensor quantization approach
         int a_val{};
         int b_val{};
-        std::tie(b_val, a_val) = get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo);
+        std::tie(b_val, a_val) =
+            get_quantized_activation_min_max(conv_info.act_info, input->info()->data_type(), oqinfo);
 
         build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + support::cpp11::to_string(a_val));
         build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + support::cpp11::to_string(b_val));
@@ -321,8 +364,10 @@
     {
         kernel_name = "dwc_native_fp_nhwc";
         build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-        build_opts.add_option_if(conv_info.act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a()));
-        build_opts.add_option_if(conv_info.act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b()));
+        build_opts.add_option_if(conv_info.act_info.enabled(),
+                                 "-DA_VAL=" + float_to_string_with_full_precision(conv_info.act_info.a()));
+        build_opts.add_option_if(conv_info.act_info.enabled(),
+                                 "-DB_VAL=" + float_to_string_with_full_precision(conv_info.act_info.b()));
     }
 
     Window win = calculate_max_window(*(output->info()), Steps(n0, m0));
@@ -350,10 +395,17 @@
     _config_id += string_from_data_type(input->info()->data_type());
 }
 
-Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                                         const DWCComputeKernelInfo &dwc_info, const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status CLDepthwiseConvolutionLayerNativeKernel::validate(const ITensorInfo          *input,
+                                                         const ITensorInfo          *weights,
+                                                         const ITensorInfo          *biases,
+                                                         const ITensorInfo          *output,
+                                                         const DWCComputeKernelInfo &dwc_info,
+                                                         const ConvolutionInfo      &conv_info,
+                                                         const ITensorInfo          *output_multipliers,
+                                                         const ITensorInfo          *output_shifts)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(input, weights, biases, output, dwc_info, conv_info, output_multipliers, output_shifts));
     return Status{};
 }
 
@@ -370,47 +422,52 @@
     cl::Image2D input_cl_image;
     cl::Image2D weights_cl_image;
 
-    if(_export_input_to_cl_image || _export_weights_to_cl_image)
+    if (_export_input_to_cl_image || _export_weights_to_cl_image)
     {
         // Export cl_buffer to cl_image
-        if(_export_input_to_cl_image)
+        if (_export_input_to_cl_image)
         {
-            const size_t      image_w = _input->info()->dimension(0) / 4;
-            const size_t      image_h = _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
+            const size_t image_w = _input->info()->dimension(0) / 4;
+            const size_t image_h =
+                _input->info()->dimension(1) * _input->info()->dimension(2) * _input->info()->dimension(3);
             const TensorShape shape2d(image_w, image_h);
             const size_t      image_row_pitch = _input->info()->strides_in_bytes()[1];
-            input_cl_image                    = create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d, _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+            input_cl_image =
+                create_image2d_from_buffer(CLKernelLibrary::get().context(), _input->cl_buffer(), shape2d,
+                                           _input->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
         }
 
-        if(_export_weights_to_cl_image)
+        if (_export_weights_to_cl_image)
         {
-            const size_t      image_w = _weights->info()->dimension(0) / 4;
-            const size_t      image_h = _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
+            const size_t image_w = _weights->info()->dimension(0) / 4;
+            const size_t image_h =
+                _weights->info()->dimension(1) * _weights->info()->dimension(2) * _weights->info()->dimension(3);
             const TensorShape shape2d(image_w, image_h);
             const size_t      image_row_pitch = _weights->info()->strides_in_bytes()[1];
-            weights_cl_image                  = create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d, _weights->info()->data_type(), image_row_pitch,
-                                                                           CLImage2DType::ReadOnly);
+            weights_cl_image =
+                create_image2d_from_buffer(CLKernelLibrary::get().context(), _weights->cl_buffer(), shape2d,
+                                           _weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
         }
     }
 
     unsigned int idx = 0;
-    if(_export_input_to_cl_image)
+    if (_export_input_to_cl_image)
     {
         _kernel.setArg(idx++, input_cl_image);
     }
     add_4d_tensor_nhwc_argument(idx, _input);
     add_4d_tensor_nhwc_argument(idx, _output);
-    if(_export_weights_to_cl_image)
+    if (_export_weights_to_cl_image)
     {
         _kernel.setArg(idx++, weights_cl_image);
     }
     add_4d_tensor_nhwc_argument(idx, _weights);
-    if(_is_quantized)
+    if (_is_quantized)
     {
         add_1D_tensor_argument(idx, _output_multipliers, slice);
         add_1D_tensor_argument(idx, _output_shifts, slice);
     }
-    if(_biases != nullptr)
+    if (_biases != nullptr)
     {
         add_1D_tensor_argument(idx, _biases, slice);
     }
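Editor's note: every signature rewrite in this file follows the same rule — once a parameter list no longer fits, each parameter moves to its own line and the `*`/`&` sigils are right-aligned into a shared column. This is consistent with `BinPackParameters: false`, `PointerAlignment: Right` and `AlignConsecutiveDeclarations` in the undelivered configuration; a reduced sketch with stub types:

    #include <cstdint>

    struct ITensorInfo {};
    struct DWCComputeKernelInfo {};
    struct ConvolutionInfo {};
    struct Status {};

    // One parameter per line; sigils right-aligned against the longest type name;
    // plain value parameters align their names to the same column.
    Status validate_arguments(const ITensorInfo          *input,
                              const ITensorInfo          *weights,
                              const DWCComputeKernelInfo &dwc_info,
                              const ConvolutionInfo      &conv_info,
                              int32_t                     n0);
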
diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
index 8eee7b2..d34a662 100644
--- a/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
+++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h
@@ -24,11 +24,11 @@
 #ifndef ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
 #define ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H
 
-#include "src/core/CL/ICLKernel.h"
-
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/function_info/ConvolutionInfo.h"
 
+#include "src/core/CL/ICLKernel.h"
+
 namespace arm_compute
 {
 class ICLTensor;
@@ -74,15 +74,28 @@
      *          * no padding
      *          * no change of data layout after configure
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info,
-                   const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+    void configure(const CLCompileContext     &compile_context,
+                   ICLTensor                  *input,
+                   const ICLTensor            *weights,
+                   const ICLTensor            *biases,
+                   ICLTensor                  *output,
+                   const DWCComputeKernelInfo &dwc_info,
+                   const ConvolutionInfo      &conv_info,
+                   const ICLTensor            *output_multipliers = nullptr,
+                   const ICLTensor            *output_shifts      = nullptr);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
      *
      * Similar to @ref CLDepthwiseConvolutionLayerNativeKernel::configure()
      */
-    void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const DWCComputeKernelInfo &dwc_info,
-                   const ConvolutionInfo &conv_info, const ICLTensor *output_multipliers = nullptr, const ICLTensor *output_shifts = nullptr);
+    void configure(ICLTensor                  *input,
+                   const ICLTensor            *weights,
+                   const ICLTensor            *biases,
+                   ICLTensor                  *output,
+                   const DWCComputeKernelInfo &dwc_info,
+                   const ConvolutionInfo      &conv_info,
+                   const ICLTensor            *output_multipliers = nullptr,
+                   const ICLTensor            *output_shifts      = nullptr);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLDepthwiseConvolutionLayerNativeKernel
      *
@@ -90,23 +103,29 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const DWCComputeKernelInfo &dwc_info,
-                           const ConvolutionInfo &conv_info, const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+    static Status validate(const ITensorInfo          *input,
+                           const ITensorInfo          *weights,
+                           const ITensorInfo          *biases,
+                           const ITensorInfo          *output,
+                           const DWCComputeKernelInfo &dwc_info,
+                           const ConvolutionInfo      &conv_info,
+                           const ITensorInfo          *output_multipliers = nullptr,
+                           const ITensorInfo          *output_shifts      = nullptr);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    const ICLTensor *_input {};
+    const ICLTensor *_input{};
     const ICLTensor *_weights{};
     const ICLTensor *_biases{};
     ICLTensor       *_output{};
-    unsigned int     _depth_multiplier{ 0 };
+    unsigned int     _depth_multiplier{0};
     const ICLTensor *_output_multipliers{};
     const ICLTensor *_output_shifts{};
-    bool             _export_input_to_cl_image{ false };
-    bool             _export_weights_to_cl_image{ true };
-    bool             _is_quantized{ false };
+    bool             _export_input_to_cl_image{false};
+    bool             _export_weights_to_cl_image{true};
+    bool             _is_quantized{false};
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_CLDEPTHWISECONVOLUTIONLAYERNATIVEKERNEL_H */
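Editor's note: braced initializers lose their inner padding across the whole change (`{ 0 }` becomes `{0}`, `{ input, output }` becomes `{input, output}`), matching `Cpp11BracedListStyle: true` in the undelivered configuration. The member defaults above combine that with name alignment; a toy illustration with a hypothetical type:

    // Hypothetical members mirroring the private section above: tight braces,
    // member names aligned across the consecutive declarations.
    class ExampleKernel
    {
        const void  *_weights{};
        unsigned int _depth_multiplier{0};
        bool         _is_quantized{false};
    };
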
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
index 9b514ed..3d8f875 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -37,17 +38,20 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status validate_arguments(const ITensorInfo               *input,
+                          const ITensorInfo               *output,
+                          const ITensorInfo               *idx,
+                          const FFTDigitReverseKernelInfo &config)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
-    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -57,7 +61,10 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo                     *input,
+                                                        ITensorInfo                     *output,
+                                                        ITensorInfo                     *idx,
+                                                        const FFTDigitReverseKernelInfo &config)
 {
     ARM_COMPUTE_UNUSED(idx, config);
 
@@ -69,21 +76,27 @@
 }
 } // namespace
 
-CLFFTDigitReverseKernel::CLFFTDigitReverseKernel()
-    : _input(nullptr), _output(nullptr), _idx(nullptr)
+CLFFTDigitReverseKernel::CLFFTDigitReverseKernel() : _input(nullptr), _output(nullptr), _idx(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const ICLTensor                 *input,
+                                        ICLTensor                       *output,
+                                        const ICLTensor                 *idx,
+                                        const FFTDigitReverseKernelInfo &config)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, idx, config);
 }
 
-void CLFFTDigitReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config)
+void CLFFTDigitReverseKernel::configure(const CLCompileContext          &compile_context,
+                                        const ICLTensor                 *input,
+                                        ICLTensor                       *output,
+                                        const ICLTensor                 *idx,
+                                        const FFTDigitReverseKernelInfo &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
-    auto padding_info = get_padding_info({ input, output, idx });
+    auto padding_info = get_padding_info({input, output, idx});
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
 
     _input  = input;
@@ -114,10 +127,14 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status CLFFTDigitReverseKernel::validate(const ITensorInfo               *input,
+                                         const ITensorInfo               *output,
+                                         const ITensorInfo               *idx,
+                                         const FFTDigitReverseKernelInfo &config)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
 
     return Status{};
 }
@@ -137,7 +154,6 @@
         add_3D_tensor_argument(idx, _output, slice);
         add_1D_tensor_argument(idx, _idx, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
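Editor's note: two control-flow rules are visible in the run() hunk above — control keywords now take a space before their parenthesis (`if (`, `while (`) while function calls take none, and the `while` of a do-loop is pulled up onto the closing brace. A compilable sketch:

    #include <cstdio>

    void enqueue_all_slices(int remaining) // hypothetical helper, for illustration
    {
        do
        {
            if (remaining % 2 == 0) // control statement: space before '('
            {
                printf("slice %d\n", remaining); // call: no space before '('
            }
        } while (--remaining > 0); // 'while' joined onto the do-block's closing brace
    }
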
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.h b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
index e5583a4..fdd1bcc 100644
--- a/src/core/CL/kernels/CLFFTDigitReverseKernel.h
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.h
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
 #define ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H
 
-#include "src/core/CL/ICLKernel.h"
-
 #include "arm_compute/core/KernelDescriptors.h"
 
+#include "src/core/CL/ICLKernel.h"
+
 namespace arm_compute
 {
 // Forward declarations
@@ -56,7 +56,8 @@
      * @param[in]  idx    Digit reverse index tensor. Data type supported: U32
      * @param[in]  config Kernel configuration.
      */
-    void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+    void
+    configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -65,7 +66,11 @@
      * @param[in]  idx             Digit reverse index tensor. Data type supported: U32
      * @param[in]  config          Kernel configuration.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, const FFTDigitReverseKernelInfo &config);
+    void configure(const CLCompileContext          &compile_context,
+                   const ICLTensor                 *input,
+                   ICLTensor                       *output,
+                   const ICLTensor                 *idx,
+                   const FFTDigitReverseKernelInfo &config);
     /** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel
      *
      * @param[in] input  Source tensor info. Data types supported: F16/F32.
@@ -75,7 +80,10 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
+    static Status validate(const ITensorInfo               *input,
+                           const ITensorInfo               *output,
+                           const ITensorInfo               *idx,
+                           const FFTDigitReverseKernelInfo &config);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
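Editor's note: the header and source hunks all rewrite include blocks the same way — public arm_compute/ headers form the first group, internal src/ and support/ headers follow after a blank line, and system headers come last, each group sorted case-insensitively (which is why utils/StringUtils.h stays after utils/quantization/ in the hunks above). Assuming include regrouping is enabled in the undelivered configuration, a representative block is:

    #include "arm_compute/core/KernelDescriptors.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/utils/StringUtils.h"

    #include "src/core/CL/ICLKernel.h"
    #include "support/StringSupport.h"

    #include <set>
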
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
index 95f4b64..3729e6b 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -46,11 +47,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] % config.radix);
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -59,9 +60,10 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
 {
-    if(output != nullptr)
+    if (output != nullptr)
     {
         auto_init_if_empty(*output, *input);
     }
@@ -76,8 +78,7 @@
 }
 } // namespace
 
-CLFFTRadixStageKernel::CLFFTRadixStageKernel()
-    : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTRadixStageKernel::CLFFTRadixStageKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
@@ -87,11 +88,15 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
 }
 
-void CLFFTRadixStageKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config)
+void CLFFTRadixStageKernel::configure(const CLCompileContext        &compile_context,
+                                      ICLTensor                     *input,
+                                      ICLTensor                     *output,
+                                      const FFTRadixStageKernelInfo &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
-    auto padding_info = get_padding_info({ input, output });
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+    auto padding_info = get_padding_info({input, output});
 
     _input        = input;
     _output       = output;
@@ -110,11 +115,12 @@
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Set static arguments if not the first stage
-    if(!config.is_first_stage)
+    if (!config.is_first_stage)
     {
         const unsigned int Ni        = config.Nx * config.radix;
         const float        exp_const = (-2.0 * M_PI) / static_cast<float>(Ni);
-        unsigned int       idx       = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+        unsigned int       idx =
+            (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
         _kernel.setArg<cl_uint>(idx++, config.Nx);
         _kernel.setArg<cl_uint>(idx++, Ni);
         _kernel.setArg<cl_float>(idx, exp_const);
@@ -136,21 +142,22 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+Status CLFFTRadixStageKernel::validate(const ITensorInfo             *input,
+                                       const ITensorInfo             *output,
+                                       const FFTRadixStageKernelInfo &config)
 {
     const bool run_in_place = (output == nullptr) || (output == input);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
-                                                              (run_in_place) ? nullptr : output->clone().get(),
-                                                              config)
-                                .first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config)
+            .first);
 
     return Status{};
 }
 
 std::set<unsigned int> CLFFTRadixStageKernel::supported_radix()
 {
-    return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+    return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
 }
 
 void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
@@ -165,12 +172,11 @@
     {
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
-        if(!_run_in_place)
+        if (!_run_in_place)
         {
             add_3D_tensor_argument(idx, _output, slice);
         }
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
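Editor's note: constructor definitions whose member-initializer lists fit under the limit are now collapsed onto a single line, as in `CLFFTRadixStageKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)` above; only oversized lists keep a broken layout. A reduced sketch:

    // Short member-initializer lists join the signature on one line; bodies keep
    // the Allman brace style used throughout the library.
    class ExampleKernel
    {
    public:
        ExampleKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
        {
        }

    private:
        const void *_input;
        void       *_output;
        bool        _run_in_place;
    };
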
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.h b/src/core/CL/kernels/CLFFTRadixStageKernel.h
index 9bb310d..de80bfc 100644
--- a/src/core/CL/kernels/CLFFTRadixStageKernel.h
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.h
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
 #define ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H
 
-#include "src/core/CL/ICLKernel.h"
-
 #include "arm_compute/core/KernelDescriptors.h"
 
+#include "src/core/CL/ICLKernel.h"
+
 #include <set>
 
 namespace arm_compute
@@ -69,7 +69,10 @@
      * @param[out]    output          Destination tensor. Can be nullptr. Data type supported: same as @p input
      * @param[in]     config          FFT descriptor metadata.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelInfo &config);
+    void configure(const CLCompileContext        &compile_context,
+                   ICLTensor                     *input,
+                   ICLTensor                     *output,
+                   const FFTRadixStageKernelInfo &config);
     /** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel
      *
      * @param[in] input  Source tensor info. Data types supported: F16/F32.
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.cpp b/src/core/CL/kernels/CLFFTScaleKernel.cpp
index 8a714d7..be6e16b 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.cpp
+++ b/src/core/CL/kernels/CLFFTScaleKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -43,7 +44,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -54,8 +55,7 @@
 }
 } // namespace
 
-CLFFTScaleKernel::CLFFTScaleKernel()
-    : _input(nullptr), _output(nullptr), _run_in_place(false)
+CLFFTScaleKernel::CLFFTScaleKernel() : _input(nullptr), _output(nullptr), _run_in_place(false)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
@@ -65,11 +65,14 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
 }
 
-void CLFFTScaleKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config)
+void CLFFTScaleKernel::configure(const CLCompileContext   &compile_context,
+                                 ICLTensor                *input,
+                                 ICLTensor                *output,
+                                 const FFTScaleKernelInfo &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr));
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input        = input;
     _output       = output;
@@ -78,20 +81,22 @@
     // Create kernel
     CLBuildOptions build_opts;
     build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
-    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels() : input->info()->num_channels()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(output != nullptr ? output->info()->num_channels()
+                                                                                      : input->info()->num_channels()));
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option_if(config.conjugate, "-DCONJ");
     std::string kernel_name = "fft_scale_conj";
     _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Set static arguments
-    unsigned int idx = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+    unsigned int idx =
+        (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
     _kernel.setArg<cl_float>(idx, config.scale);
 
     // Configure kernel window
     Window win = calculate_max_window(*input->info(), Steps());
 
-    if(output != nullptr)
+    if (output != nullptr)
     {
         // Output auto initialization if not yet initialized
         auto_init_if_empty(*output->info(), *input->info()->clone());
@@ -130,12 +135,11 @@
     {
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
-        if(!_run_in_place)
+        if (!_run_in_place)
         {
             add_3D_tensor_argument(idx, _output, slice);
         }
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
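Editor's note: the VEC_SIZE hunk above also shows how long conditional expressions wrap now — breaks are placed before the ternary operators, so a spilled `:` branch starts its own line aligned under the true branch. A sketch with hypothetical stand-ins for the tensor accessors:

    #include <string>

    // Hypothetical stand-ins for the ICLTensor/ITensorInfo accessors used above.
    struct Info
    {
        int num_channels() const { return 2; }
    };
    struct Tensor
    {
        Info *info() { static Info i; return &i; }
    };

    std::string vec_size_option(Tensor *input, Tensor *output)
    {
        return "-DVEC_SIZE=" + std::to_string(output != nullptr ? output->info()->num_channels()
                                                                : input->info()->num_channels());
    }
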
diff --git a/src/core/CL/kernels/CLFFTScaleKernel.h b/src/core/CL/kernels/CLFFTScaleKernel.h
index cc518be..b995282 100644
--- a/src/core/CL/kernels/CLFFTScaleKernel.h
+++ b/src/core/CL/kernels/CLFFTScaleKernel.h
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_CLFFTSCALEKERNEL_H
 #define ARM_COMPUTE_CLFFTSCALEKERNEL_H
 
-#include "src/core/CL/ICLKernel.h"
-
 #include "arm_compute/core/KernelDescriptors.h"
 
+#include "src/core/CL/ICLKernel.h"
+
 namespace arm_compute
 {
 // Forward declarations
@@ -63,7 +63,10 @@
      * @param[out]    output          Destination tensor. Data type supported: same as @p input
      * @param[in]     config          Kernel configuration
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const FFTScaleKernelInfo &config);
+    void configure(const CLCompileContext   &compile_context,
+                   ICLTensor                *input,
+                   ICLTensor                *output,
+                   const FFTScaleKernelInfo &config);
     /** Static function to check if given info will lead to a valid configuration of @ref CLFFTScaleKernel
      *
      * @param[in] input  Source tensor info. Data types supported: F16/F32.
diff --git a/src/core/CL/kernels/CLFillBorderKernel.cpp b/src/core/CL/kernels/CLFillBorderKernel.cpp
index fcd99a4..86bb502 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.cpp
+++ b/src/core/CL/kernels/CLFillBorderKernel.cpp
@@ -31,14 +31,14 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
 {
-CLFillBorderKernel::CLFillBorderKernel()
-    : ICLKernel(), _tensor(nullptr)
+CLFillBorderKernel::CLFillBorderKernel() : ICLKernel(), _tensor(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
@@ -56,27 +56,38 @@
     ICLKernel::add_argument<T>(idx, static_cast<T>(value));
 }
 
-void CLFillBorderKernel::configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(ICLTensor        *tensor,
+                                   BorderSize        border_size,
+                                   BorderMode        border_mode,
+                                   const PixelValue &constant_border_value)
 {
     configure(CLKernelLibrary::get().get_compile_context(), tensor, border_size, border_mode, constant_border_value);
 }
 
-void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+                                   ICLTensor              *tensor,
+                                   BorderSize              border_size,
+                                   BorderMode              border_mode,
+                                   const PixelValue       &constant_border_value)
 {
     _tensor = tensor;
     configure(compile_context, tensor->info(), border_size, border_mode, constant_border_value);
 }
 
-void CLFillBorderKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void CLFillBorderKernel::configure(const CLCompileContext &compile_context,
+                                   ITensorInfo            *tensor,
+                                   BorderSize              border_size,
+                                   BorderMode              border_mode,
+                                   const PixelValue       &constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON(tensor == nullptr);
     ARM_COMPUTE_ERROR_ON(tensor->num_channels() != 1);
-    auto padding_info = get_padding_info({ tensor });
+    auto padding_info = get_padding_info({tensor});
 
     border_size.limit(tensor->padding());
 
     // If there is no border: early exit
-    if(border_size.empty() || border_mode == BorderMode::UNDEFINED)
+    if (border_size.empty() || border_mode == BorderMode::UNDEFINED)
     {
         return;
     }
@@ -98,25 +109,22 @@
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Create static kernel arguments
-    const unsigned int valid_width  = tensor->valid_region().shape[0];
-    const unsigned int valid_height = tensor->valid_region().shape[1];
-    const cl_int2      valid_region_coords =
-    {
-        {
-            static_cast<cl_int>(tensor->valid_region().anchor[0]),
-            static_cast<cl_int>(tensor->valid_region().anchor[1]),
-        }
-    };
-    const unsigned int total_valid_width = border_size.left + valid_width + border_size.right;
+    const unsigned int valid_width         = tensor->valid_region().shape[0];
+    const unsigned int valid_height        = tensor->valid_region().shape[1];
+    const cl_int2      valid_region_coords = {{
+             static_cast<cl_int>(tensor->valid_region().anchor[0]),
+             static_cast<cl_int>(tensor->valid_region().anchor[1]),
+    }};
+    const unsigned int total_valid_width   = border_size.left + valid_width + border_size.right;
 
     // Set static kernel arguments
     unsigned int idx = num_arguments_per_3D_tensor(); //Skip the tensor parameters
     ICLKernel::add_argument<cl_uint>(idx, valid_width);
     ICLKernel::add_argument<cl_uint>(idx, valid_height);
     ICLKernel::add_argument<cl_int2>(idx, valid_region_coords);
-    if(BorderMode::CONSTANT == border_mode)
+    if (BorderMode::CONSTANT == border_mode)
     {
-        switch(dt)
+        switch (dt)
         {
             case DataType::U8:
             case DataType::QASYMM8:
@@ -175,12 +183,13 @@
 void CLFillBorderKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
 {
     // Border mode undefined or border width == 0
-    if(_kernel() == nullptr)
+    if (_kernel() == nullptr)
     {
         return;
     }
 
-    const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto tensor =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
 
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
@@ -193,14 +202,13 @@
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, tensor, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 
 void CLFillBorderKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     // Border mode undefined or border width == 0
-    if(_kernel() == nullptr)
+    if (_kernel() == nullptr)
     {
         return;
     }
@@ -216,7 +224,6 @@
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _tensor, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
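Editor's note: the static-argument hunk above additionally shows assignment alignment — `valid_width`, `valid_height`, `valid_region_coords` and `total_valid_width` now share a single `=` column, consistent with `AlignConsecutiveAssignments` being enabled in the undelivered configuration. A compact sketch:

    // Consecutive simple assignments share one '=' column; the alignment group
    // ends at the first blank line or non-assignment statement.
    void compute_extents(unsigned int left, unsigned int right, unsigned int width) // hypothetical helper
    {
        const unsigned int valid_width       = width;
        const unsigned int valid_height      = width / 2;
        const unsigned int total_valid_width = left + valid_width + right;

        (void)valid_height; // silence unused-variable warnings in this sketch
        (void)total_valid_width;
    }
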
diff --git a/src/core/CL/kernels/CLFillBorderKernel.h b/src/core/CL/kernels/CLFillBorderKernel.h
index 7951f48..5782143 100644
--- a/src/core/CL/kernels/CLFillBorderKernel.h
+++ b/src/core/CL/kernels/CLFillBorderKernel.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -57,7 +58,11 @@
      * @param[in]     border_mode           Border mode to use for the convolution.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    void configure(const CLCompileContext &compile_context,
+                   ICLTensor              *tensor,
+                   BorderSize              border_size,
+                   BorderMode              border_mode,
+                   const PixelValue       &constant_border_value = PixelValue());
     /** Initialise the kernel's input, output and border mode.
      *
      * @param[in,out] tensor                Tensor to process. Data types supported: U8/QASYMM8/S8/QASYMM8_SIGNED/U16/S16/U32/S32/F16/F32.
@@ -65,7 +70,10 @@
      * @param[in]     border_mode           Border mode to use for the convolution.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      */
-    void configure(ICLTensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    void configure(ICLTensor        *tensor,
+                   BorderSize        border_size,
+                   BorderMode        border_mode,
+                   const PixelValue &constant_border_value = PixelValue());
     /** Initialise the kernel's input, output and border mode.
      *
      * @param[in]     compile_context       The compile context to be used.
@@ -74,7 +82,11 @@
      * @param[in]     border_mode           Border mode to use for the convolution.
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *tensor,
+                   BorderSize              border_size,
+                   BorderMode              border_mode,
+                   const PixelValue       &constant_border_value = PixelValue());
 
     /** Function to set the constant value on fill border kernel depending on type.
      *
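Editor's note: this header keeps the optional trailing parameters and their defaults while adopting the one-parameter-per-line layout; where several trailing parameters carry defaults (as in CLDepthwiseConvolutionLayerNativeKernel.h earlier), their `=` signs align as well. A trimmed sketch with stub types:

    struct PixelValue {};
    struct BorderSize {};
    struct ICLTensor {};
    enum class BorderMode
    {
        CONSTANT,
        REPLICATE
    };

    // Defaulted trailing parameters keep their defaults; parameter names align.
    void configure(ICLTensor        *tensor,
                   BorderSize        border_size,
                   BorderMode        border_mode,
                   const PixelValue &constant_border_value = PixelValue());
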
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
index 68fe324..7da0679 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.cpp
@@ -30,20 +30,26 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                          const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                          const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
-                          float epsilon, FuseBatchNormalizationType fbn_type)
+Status validate_arguments(const ITensorInfo         *input_weights,
+                          const ITensorInfo         *bn_mean,
+                          const ITensorInfo         *bn_var,
+                          const ITensorInfo         *fused_weights,
+                          const ITensorInfo         *fused_bias,
+                          const ITensorInfo         *input_bias,
+                          const ITensorInfo         *bn_beta,
+                          const ITensorInfo         *bn_gamma,
+                          float                      epsilon,
+                          FuseBatchNormalizationType fbn_type)
 {
     ARM_COMPUTE_UNUSED(epsilon);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -54,43 +60,44 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr);
     ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1);
 
-    if(fbn_type == FuseBatchNormalizationType::CONVOLUTION)
+    if (fbn_type == FuseBatchNormalizationType::CONVOLUTION)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0));
     }
     else
     {
-        const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t channel_idx =
+            get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
         ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0));
     }
 
     // Validate bias
-    if(input_bias != nullptr)
+    if (input_bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias);
     }
     // Validate beta
-    if(bn_beta != nullptr)
+    if (bn_beta != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta);
     }
     // Validate gamma
-    if(bn_gamma != nullptr)
+    if (bn_gamma != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma);
     }
     // Validate output weights
-    if(fused_weights != nullptr && fused_weights->total_size() != 0)
+    if (fused_weights != nullptr && fused_weights->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights);
     }
     // Validate output bias
-    if(fused_bias != nullptr && fused_bias->total_size() != 0)
+    if (fused_bias != nullptr && fused_bias->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias);
@@ -101,28 +108,52 @@
 } // namespace
 
 CLFuseBatchNormalizationKernel::CLFuseBatchNormalizationKernel()
-    : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
-      _run_in_place_weights(false), _run_in_place_bias(false)
+    : _input_weights(nullptr),
+      _input_bias(nullptr),
+      _bn_mean(nullptr),
+      _bn_var(nullptr),
+      _bn_gamma(nullptr),
+      _bn_beta(nullptr),
+      _fused_weights(nullptr),
+      _fused_bias(nullptr),
+      _epsilon(),
+      _run_in_place_weights(false),
+      _run_in_place_bias(false)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLFuseBatchNormalizationKernel::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
-                                               ICLTensor *fused_weights, ICLTensor *fused_bias,
-                                               const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
-                                               float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const ICLTensor           *input_weights,
+                                               const ICLTensor           *bn_mean,
+                                               const ICLTensor           *bn_var,
+                                               ICLTensor                 *fused_weights,
+                                               ICLTensor                 *fused_bias,
+                                               const ICLTensor           *input_bias,
+                                               const ICLTensor           *bn_beta,
+                                               const ICLTensor           *bn_gamma,
+                                               float                      epsilon,
+                                               FuseBatchNormalizationType fbn_type)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+              input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
-void CLFuseBatchNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
-                                               ICLTensor *fused_weights, ICLTensor *fused_bias,
-                                               const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
-                                               float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalizationKernel::configure(const CLCompileContext    &compile_context,
+                                               const ICLTensor           *input_weights,
+                                               const ICLTensor           *bn_mean,
+                                               const ICLTensor           *bn_var,
+                                               ICLTensor                 *fused_weights,
+                                               ICLTensor                 *fused_bias,
+                                               const ICLTensor           *input_bias,
+                                               const ICLTensor           *bn_beta,
+                                               const ICLTensor           *bn_gamma,
+                                               float                      epsilon,
+                                               FuseBatchNormalizationType fbn_type)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
 
-    auto padding_info = get_padding_info({ input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma });
+    auto padding_info =
+        get_padding_info({input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma});
 
     _input_weights = input_weights;
     _input_bias    = input_bias;
@@ -135,28 +166,28 @@
     _epsilon       = epsilon;
 
     _run_in_place_weights = (fused_weights == nullptr) || (fused_weights == input_weights);
-    _run_in_place_bias    = (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
+    _run_in_place_bias =
+        (input_bias != nullptr && fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
 
     // Auto initialize outputs
-    if(_fused_weights != nullptr)
+    if (_fused_weights != nullptr)
     {
         // Output tensor auto initialization if not yet initialized
         auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone());
     }
-    if(_fused_bias != nullptr)
+    if (_fused_bias != nullptr)
     {
         // Output tensor auto initialization if not yet initialized
         auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
     }
 
     // Validate arguments
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(),
-                                                  (fused_weights != nullptr) ? fused_weights->info() : nullptr,
-                                                  (fused_bias != nullptr) ? fused_bias->info() : nullptr,
-                                                  (input_bias != nullptr) ? input_bias->info() : nullptr,
-                                                  (bn_beta != nullptr) ? bn_beta->info() : nullptr,
-                                                  (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
-                                                  epsilon, fbn_type));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+        input_weights->info(), bn_mean->info(), bn_var->info(),
+        (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+        (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr,
+        (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, epsilon,
+        fbn_type));
 
     // Configure kernel window
     Window win = calculate_max_window(*input_weights->info());
@@ -165,7 +196,8 @@
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input_weights->info()->data_type()));
-    build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION, "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
+    build_opts.add_option_if(fbn_type == FuseBatchNormalizationType::CONVOLUTION,
+                             "-DDIM2=" + support::cpp11::to_string(input_weights->info()->dimension(2)));
     build_opts.add_option("-DEPSILON=" + float_to_string_with_full_precision(epsilon));
     build_opts.add_option_if(_input_weights->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
     build_opts.add_option_if(_run_in_place_weights, "-DIN_PLACE_W");
@@ -180,12 +212,19 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                                                const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                                                const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
-                                                float epsilon, FuseBatchNormalizationType fbn_type)
+Status CLFuseBatchNormalizationKernel::validate(const ITensorInfo         *input_weights,
+                                                const ITensorInfo         *bn_mean,
+                                                const ITensorInfo         *bn_var,
+                                                const ITensorInfo         *fused_weights,
+                                                const ITensorInfo         *fused_bias,
+                                                const ITensorInfo         *input_bias,
+                                                const ITensorInfo         *bn_beta,
+                                                const ITensorInfo         *bn_gamma,
+                                                float                      epsilon,
+                                                FuseBatchNormalizationType fbn_type)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+                                                   input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
     return Status{};
 }
 
@@ -202,25 +241,25 @@
     // Add kernel arguments
     unsigned int idx = 0;
     add_3D_tensor_argument(idx, _input_weights, slice_3d);
-    if(_input_bias != nullptr)
+    if (_input_bias != nullptr)
     {
         add_1D_tensor_argument(idx, _input_bias, slice_1d);
     }
     add_1D_tensor_argument(idx, _bn_mean, slice_1d);
     add_1D_tensor_argument(idx, _bn_var, slice_1d);
-    if(!_run_in_place_weights)
+    if (!_run_in_place_weights)
     {
         add_3D_tensor_argument(idx, _fused_weights, slice_3d);
     }
-    if(!_run_in_place_bias)
+    if (!_run_in_place_bias)
     {
         add_1D_tensor_argument(idx, _fused_bias, slice_1d);
     }
-    if(_bn_beta != nullptr)
+    if (_bn_beta != nullptr)
     {
         add_1D_tensor_argument(idx, _bn_beta, slice_1d);
     }
-    if(_bn_gamma != nullptr)
+    if (_bn_gamma != nullptr)
     {
         add_1D_tensor_argument(idx, _bn_gamma, slice_1d);
     }
diff --git a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
index 78b1e74..76ec7a7 100644
--- a/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
+++ b/src/core/CL/kernels/CLFuseBatchNormalizationKernel.h
@@ -62,9 +62,16 @@
      * @param[in]  epsilon       (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
      * @param[in]  fbn_type      (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
      */
-    void configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
-                   const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
-                   float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+    void configure(const ICLTensor           *input_weights,
+                   const ICLTensor           *bn_mean,
+                   const ICLTensor           *bn_var,
+                   ICLTensor                 *fused_weights,
+                   ICLTensor                 *fused_bias,
+                   const ICLTensor           *input_bias = nullptr,
+                   const ICLTensor           *bn_beta    = nullptr,
+                   const ICLTensor           *bn_gamma   = nullptr,
+                   float                      epsilon    = 0.001f,
+                   FuseBatchNormalizationType fbn_type   = FuseBatchNormalizationType::CONVOLUTION);
     /** Set the source, destination of the kernel
      *
      * @param[in]  compile_context The compile context to be used.
@@ -81,9 +88,17 @@
      * @param[in]  epsilon         (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
      * @param[in]  fbn_type        (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var, ICLTensor *fused_weights, ICLTensor *fused_bias,
-                   const ICLTensor *input_bias = nullptr, const ICLTensor *bn_beta = nullptr, const ICLTensor *bn_gamma = nullptr,
-                   float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+    void configure(const CLCompileContext    &compile_context,
+                   const ICLTensor           *input_weights,
+                   const ICLTensor           *bn_mean,
+                   const ICLTensor           *bn_var,
+                   ICLTensor                 *fused_weights,
+                   ICLTensor                 *fused_bias,
+                   const ICLTensor           *input_bias = nullptr,
+                   const ICLTensor           *bn_beta    = nullptr,
+                   const ICLTensor           *bn_gamma   = nullptr,
+                   float                      epsilon    = 0.001f,
+                   FuseBatchNormalizationType fbn_type   = FuseBatchNormalizationType::CONVOLUTION);
     /** Static function to check if given info will lead to a valid configuration of @ref CLFuseBatchNormalizationKernel
      *
      * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -101,10 +116,16 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                           const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                           const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
-                           float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+    static Status validate(const ITensorInfo         *input_weights,
+                           const ITensorInfo         *bn_mean,
+                           const ITensorInfo         *bn_var,
+                           const ITensorInfo         *fused_weights,
+                           const ITensorInfo         *fused_bias,
+                           const ITensorInfo         *input_bias = nullptr,
+                           const ITensorInfo         *bn_beta    = nullptr,
+                           const ITensorInfo         *bn_gamma   = nullptr,
+                           float                      epsilon    = 0.001f,
+                           FuseBatchNormalizationType fbn_type   = FuseBatchNormalizationType::CONVOLUTION);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
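
The static `validate()` overload above mirrors `configure()` so callers can check a configuration before building the kernel. A hedged sketch of that validate-then-configure pattern, relying on the defaulted optional arguments and invented tensor names:

    // Sketch only; conv_weights, bn_mean, bn_var, fused_weights and fused_bias
    // are assumed to be CLTensor objects set up elsewhere.
    const Status st = CLFuseBatchNormalizationKernel::validate(
        conv_weights.info(), bn_mean.info(), bn_var.info(), fused_weights.info(), fused_bias.info());
    if (st.error_code() == ErrorCode::OK)
    {
        CLFuseBatchNormalizationKernel fuse_bn;
        fuse_bn.configure(&conv_weights, &bn_mean, &bn_var, &fused_weights, &fused_bias);
    }
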
diff --git a/src/core/CL/kernels/CLGatherKernel.cpp b/src/core/CL/kernels/CLGatherKernel.cpp
index 5495023..c11a189 100644
--- a/src/core/CL/kernels/CLGatherKernel.cpp
+++ b/src/core/CL/kernels/CLGatherKernel.cpp
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "src/core/CL/kernels/CLGatherKernel.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -34,7 +36,8 @@
 {
 namespace
 {
-inline Status validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+inline Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
     const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
@@ -43,11 +46,12 @@
     ARM_COMPUTE_RETURN_ERROR_ON(actual_axis >= input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-        TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+        TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+            input->tensor_shape(), indices->tensor_shape(), actual_axis);
         ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
     }
 
@@ -56,12 +60,14 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *indices, ITensorInfo *output, int axis)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
     const uint32_t actual_axis = wrap_around(axis, static_cast<int>(input->num_dimensions()));
     // Output auto initialization if not yet initialized
-    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), actual_axis);
+    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+        input->tensor_shape(), indices->tensor_shape(), actual_axis);
     auto_init_if_empty((*output), output_shape, 1, input->data_type());
 
     // Create window
@@ -72,8 +78,7 @@
 
 } // namespace
 
-CLGatherKernel::CLGatherKernel()
-    : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
+CLGatherKernel::CLGatherKernel() : _input(nullptr), _indices(nullptr), _output(nullptr), _axis(0)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
@@ -83,10 +88,14 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
 }
 
-void CLGatherKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+void CLGatherKernel::configure(const CLCompileContext &compile_context,
+                               const ICLTensor        *input,
+                               const ICLTensor        *indices,
+                               ICLTensor              *output,
+                               int                     axis)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, indices);
-    auto padding_info = get_padding_info({ input, output, indices });
+    auto padding_info = get_padding_info({input, output, indices});
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), indices->info(), output->info(), axis));
 
     // Configure kernel window
@@ -100,7 +109,8 @@
 
     // Set build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
     build_opts.add_option("-DOUTPUT_DIM_Z=" + support::cpp11::to_string(output->info()->dimension(2)));
     build_opts.add_option("-DINDICES_DIM_Z=" + support::cpp11::to_string(indices->info()->dimension(2)));
     build_opts.add_option("-DINPUT_DIM_Z=" + support::cpp11::to_string(input->info()->dimension(2)));
@@ -114,10 +124,12 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+Status
+CLGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), indices->clone().get(), output->clone().get(), axis).first);
     return Status{};
 }
 
diff --git a/src/core/CL/kernels/CLGatherKernel.h b/src/core/CL/kernels/CLGatherKernel.h
index 8f472a4..db4b49d 100644
--- a/src/core/CL/kernels/CLGatherKernel.h
+++ b/src/core/CL/kernels/CLGatherKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLGATHERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -63,7 +64,11 @@
      * @param[out] output          Destination tensor. Data type supported: Same as @p input
      * @param[in]  axis            (Optional) The axis in @p input to gather @p indices from. Negative values wrap around. Defaults to 0
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis = 0);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const ICLTensor        *indices,
+                   ICLTensor              *output,
+                   int                     axis = 0);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLGatherKernel
      *
@@ -74,7 +79,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis = 0);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
index 088c454..b9ff72b 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -47,7 +48,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
-    if(all_anchors->total_size() > 0)
+    if (all_anchors->total_size() > 0)
     {
         size_t feature_height = info.feat_height();
         size_t feature_width  = info.feat_width();
@@ -57,7 +58,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
         ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
 
-        if(is_data_type_quantized(anchors->data_type()))
+        if (is_data_type_quantized(anchors->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors);
         }
@@ -66,21 +67,25 @@
 }
 } // namespace
 
-CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel()
-    : _anchors(nullptr), _all_anchors(nullptr)
+CLComputeAllAnchorsKernel::CLComputeAllAnchorsKernel() : _anchors(nullptr), _all_anchors(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLComputeAllAnchorsKernel::configure(const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const ICLTensor          *anchors,
+                                          ICLTensor                *all_anchors,
+                                          const ComputeAnchorsInfo &info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), anchors, all_anchors, info);
 }
 
-void CLComputeAllAnchorsKernel::configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info)
+void CLComputeAllAnchorsKernel::configure(const CLCompileContext   &compile_context,
+                                          const ICLTensor          *anchors,
+                                          ICLTensor                *all_anchors,
+                                          const ComputeAnchorsInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(anchors, all_anchors);
-    auto padding_info = get_padding_info({ anchors, all_anchors });
+    auto padding_info = get_padding_info({anchors, all_anchors});
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(anchors->info(), all_anchors->info(), info));
 
     // Metadata
@@ -91,7 +96,8 @@
 
     // Initialize the output if empty
     const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
-    auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
+    auto_init_if_empty(*all_anchors->info(),
+                       TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
 
     // Set instance variables
     _anchors     = anchors;
@@ -108,7 +114,7 @@
     build_opts.add_option("-DNUM_ANCHORS=" + support::cpp11::to_string(num_anchors));
     build_opts.add_option("-DNUM_ROI_FIELDS=" + support::cpp11::to_string(info.values_per_roi()));
 
-    if(is_quantized)
+    if (is_quantized)
     {
         const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform();
         build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
@@ -116,8 +122,9 @@
     }
 
     // Create kernel
-    const std::string kernel_name = (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
-    _kernel                       = create_kernel(compile_context, kernel_name, build_opts.options());
+    const std::string kernel_name =
+        (is_quantized) ? "generate_proposals_compute_all_anchors_quantized" : "generate_proposals_compute_all_anchors";
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // The tensor all_anchors can be interpreted as an array of structs (each structs has values_per_roi fields).
     // This means we don't need to pad on the X dimension, as we know in advance how many fields
@@ -127,7 +134,9 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+Status CLComputeAllAnchorsKernel::validate(const ITensorInfo        *anchors,
+                                           const ITensorInfo        *all_anchors,
+                                           const ComputeAnchorsInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
     return Status{};
diff --git a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
index d26795a..e08f281 100644
--- a/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
+++ b/src/core/CL/kernels/CLGenerateProposalsLayerKernel.h
@@ -62,7 +62,10 @@
      * @param[in]  info            Contains Compute Anchors operation information described in @ref ComputeAnchorsInfo
      *
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *anchors, ICLTensor *all_anchors, const ComputeAnchorsInfo &info);
+    void configure(const CLCompileContext   &compile_context,
+                   const ICLTensor          *anchors,
+                   ICLTensor                *all_anchors,
+                   const ComputeAnchorsInfo &info);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLComputeAllAnchorsKernel
      *
@@ -81,5 +84,5 @@
     const ICLTensor *_anchors;
     ICLTensor       *_all_anchors;
 };
-} // arm_compute
+} // namespace arm_compute
 #endif // ARM_COMPUTE_CLGENERATEPROSPOSALSLAYERKERNEL_H
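
Besides whitespace, the hunk above carries one textual normalization: the closing-namespace comment is rewritten from `} // arm_compute` to `} // namespace arm_compute`, the canonical form produced by clang-format's `FixNamespaceComments` option (assumed enabled in the revised configuration):

    // With FixNamespaceComments, the closing comment takes this canonical form.
    namespace arm_compute
    {
    } // namespace arm_compute
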
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
index 7ed323c..b13eb16 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -39,17 +40,20 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status validate_arguments(const ITensorInfo                          *input,
+                          const ITensorInfo                          *output,
+                          const InstanceNormalizationLayerKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.epsilon == 0.f, "Epsilon must be different than 0");
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
 
-    if(output != nullptr && output->total_size() != 0)
+    if (output != nullptr && output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+                                        "Input and output have different number of channels");
     }
 
     return Status{};
@@ -59,27 +63,30 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
 
-    if(output != nullptr && output->total_size() != 0)
+    if (output != nullptr && output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+                                        "Input and output have different number of channels");
     }
 
     return Status{};
 }
 } // namespace
 
-CLComputeMeanVariance::CLComputeMeanVariance()
-    : _input(nullptr), _output(nullptr)
+CLComputeMeanVariance::CLComputeMeanVariance() : _input(nullptr), _output(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLComputeMeanVariance::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision)
+void CLComputeMeanVariance::configure(const CLCompileContext &compile_context,
+                                      ICLTensor              *input,
+                                      ICLTensor              *output,
+                                      bool                    use_mixed_precision)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input  = input;
     _output = output == nullptr ? input : output;
@@ -88,7 +95,8 @@
     const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size();
 
     CLBuildOptions build_opts;
-    build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.add_option("-DINTERNAL_DATA_TYPE=" +
+                          (use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
@@ -108,7 +116,7 @@
     const TensorShape  out_shape(input_channel, 2u, input_batches);
 
     // Output auto initialization if not yet initialized
-    if(use_mixed_precision)
+    if (use_mixed_precision)
     {
         auto_init_if_empty(*_output->info(), out_shape, 1, DataType::F32);
     }
@@ -134,7 +142,7 @@
     Window collapsed_window = window.collapse(window, Window::DimZ);
 
     // We will process the planes together
-    if(_input->info()->data_layout() == DataLayout::NCHW)
+    if (_input->info()->data_layout() == DataLayout::NCHW)
     {
         collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
         collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -157,10 +165,14 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info)
+void CLInstanceNormalizationLayerKernel::configure(const CLCompileContext                     &compile_context,
+                                                   ICLTensor                                  *input,
+                                                   ICLTensor                                  *mean_var,
+                                                   ICLTensor                                  *output,
+                                                   const InstanceNormalizationLayerKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input  = input;
     _output = output == nullptr ? input : output;
@@ -172,7 +184,9 @@
 
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision ? "float" : get_cl_type_from_data_type(input->info()->data_type())));
+    build_opts.add_option("-DINTERNAL_DATA_TYPE=" + (info.use_mixed_precision
+                                                         ? "float"
+                                                         : get_cl_type_from_data_type(input->info()->data_type())));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DDIM_X=" + support::cpp11::to_string(input->info()->dimension(0)));
     build_opts.add_option("-DDIM_Y=" + support::cpp11::to_string(input->info()->dimension(1)));
@@ -188,7 +202,7 @@
 
     // Configure kernel window
     Window win = calculate_max_window(*input->info(), Steps(1));
-    if(output != nullptr)
+    if (output != nullptr)
     {
         auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type());
     }
@@ -197,7 +211,9 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status CLInstanceNormalizationLayerKernel::validate(const ITensorInfo                          *input,
+                                                    const ITensorInfo                          *output,
+                                                    const InstanceNormalizationLayerKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info));
     return Status{};
@@ -211,7 +227,7 @@
     Window collapsed_window = window.collapse(window, Window::DimZ);
 
     // We will process the planes together
-    if(_input->info()->data_layout() == DataLayout::NCHW)
+    if (_input->info()->data_layout() == DataLayout::NCHW)
     {
         collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
         collapsed_window.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -226,7 +242,7 @@
     add_4D_tensor_argument(idx, _input, collapsed_window);
     add_3D_tensor_argument(idx, _mean, collapsed_window);
 
-    if(!_run_in_place)
+    if (!_run_in_place)
     {
         add_4D_tensor_argument(idx, _output, collapsed_window);
     }
diff --git a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
index 2f9014a..9f436da 100644
--- a/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
 #define ARM_COMPUTE_CLINSTANCENORMALIZATIONLAYERKERNEL_H
 
-#include "src/core/CL/ICLKernel.h"
-
 #include "arm_compute/core/KernelDescriptors.h"
 
+#include "src/core/CL/ICLKernel.h"
+
 namespace arm_compute
 {
 // Forward declarations
@@ -59,7 +59,11 @@
      * @param[out]     output          Destination tensor. Data types and data layouts supported: same as @p input.
      * @param[in]      info            Kernel meta-data descriptor
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *mean_var, ICLTensor *output, const InstanceNormalizationLayerKernelInfo &info);
+    void configure(const CLCompileContext                     &compile_context,
+                   ICLTensor                                  *input,
+                   ICLTensor                                  *mean_var,
+                   ICLTensor                                  *output,
+                   const InstanceNormalizationLayerKernelInfo &info);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
      *
@@ -69,7 +73,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
@@ -106,7 +111,8 @@
      * @param[out]     output              Destination tensor. Data types and data layouts supported: same as @p input.
      * @param[in]      use_mixed_precision Use mixed precision in case of FP16 execution
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision);
+    void
+    configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, bool use_mixed_precision);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLInstanceNormalizationLayer.
      *
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
index 542d380..9ed9d7c 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.cpp
@@ -31,10 +31,10 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -43,7 +43,8 @@
 {
 constexpr int max_input_tensor_dim = 3;
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
 {
     ARM_COMPUTE_UNUSED(epsilon);
 
@@ -53,14 +54,15 @@
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions,
+                                    "Actual normalization axis greater than max number of dimensions");
 
     // Reduce shape on axis
     TensorShape sum_shape = input->tensor_shape();
     sum_shape.set(actual_axis, 1);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -78,16 +80,22 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLL2NormalizeLayerKernel::configure(const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(
+    const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, sum, output, axis, epsilon);
 }
 
-void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayerKernel::configure(const CLCompileContext &compile_context,
+                                         const ICLTensor        *input,
+                                         const ICLTensor        *sum,
+                                         ICLTensor              *output,
+                                         int                     axis,
+                                         float                   epsilon)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
-    auto padding_info = get_padding_info({ input, sum, output });
+    auto padding_info = get_padding_info({input, sum, output});
 
     _input       = input;
     _sum         = sum;
@@ -95,8 +103,9 @@
     _actual_axis = wrap_around(axis, max_input_tensor_dim);
     _epsilon     = epsilon;
 
-    const unsigned int vec_size_x           = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
-    const int          vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+    const unsigned int vec_size_x =
+        adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+    const int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
 
     // Set build options
     CLBuildOptions build_opts;
@@ -107,7 +116,7 @@
     // Create kernel
     std::string  kernel_name;
     unsigned int idx = 0;
-    switch(_actual_axis)
+    switch (_actual_axis)
     {
         case 0:
             kernel_name = "l2_normalize_x";
@@ -127,7 +136,7 @@
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Set epsilon argument
-    if(input->info()->data_type() == DataType::F32)
+    if (input->info()->data_type() == DataType::F32)
     {
         _kernel.setArg<cl_float>(idx, _epsilon);
     }
@@ -146,7 +155,8 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status CLL2NormalizeLayerKernel::validate(
+    const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
     return Status{};
@@ -159,7 +169,7 @@
 
     Window window_sum(window);
 
-    switch(_actual_axis)
+    switch (_actual_axis)
     {
         case 0:
         {
@@ -173,8 +183,7 @@
                 add_2D_tensor_argument(idx, _sum, sum_slice);
                 add_2D_tensor_argument(idx, _output, in_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+            } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
         }
         break;
         case 1:
@@ -189,8 +198,7 @@
                 add_2D_tensor_argument(idx, _sum, sum_slice);
                 add_2D_tensor_argument(idx, _output, in_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
+            } while (window.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(sum_slice));
         }
         break;
         case 2:
@@ -205,8 +213,7 @@
                 add_3D_tensor_argument(idx, _sum, sum_slice);
                 add_3D_tensor_argument(idx, _output, in_slice);
                 enqueue(queue, *this, in_slice, lws_hint());
-            }
-            while(window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
+            } while (window.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(sum_slice));
         }
         break;
         default:
diff --git a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
index edc0585..5c9ab94 100644
--- a/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
+++ b/src/core/CL/kernels/CLL2NormalizeLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLL2NORMALIZELAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -70,7 +71,12 @@
      * @param[in]  axis            Axis along which to reduce. Negative values wrap around. Maximum supported actual reduction axis: 2
      * @param[in]  epsilon         Lower bound value for the normalization.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *sum, ICLTensor *output, int axis, float epsilon);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const ICLTensor        *sum,
+                   ICLTensor              *output,
+                   int                     axis,
+                   float                   epsilon);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLL2NormalizeLayerKernel.
      *
@@ -84,7 +90,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
index dc9d686..e560f1d 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -42,26 +43,31 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status validate_arguments(const ITensorInfo      *input,
+                          const ITensorInfo      *output,
+                          const PoolingLayerInfo &pool_info,
+                          const ITensorInfo      *indices)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, indices);
 
-    int                 pool_stride_x   = 0;
-    int                 pool_stride_y   = 0;
-    PoolingType         pool_type       = pool_info.pool_type;
-    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+    int                 pool_stride_x      = 0;
+    int                 pool_stride_y      = 0;
+    PoolingType         pool_type          = pool_info.pool_type;
+    const PadStrideInfo pad_stride_info    = pool_info.pad_stride_info;
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-    const int    pool_size_x = pool_info.pool_size.width;
-    const int    pool_size_y = pool_info.pool_size.height;
+    const int    pool_size_x               = pool_info.pool_size.width;
+    const int    pool_size_y               = pool_info.pool_size.height;
     const Size2D pool_size(pool_size_x, pool_size_y);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+                                    "Pooling indices only supported for MAX pooling method");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -71,17 +77,20 @@
 }
 } // namespace
 
-CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel()
-    : _input(nullptr), _output(nullptr), _indices(nullptr)
+CLMaxUnpoolingLayerKernel::CLMaxUnpoolingLayerKernel() : _input(nullptr), _output(nullptr), _indices(nullptr)
 {
     _type = CLKernelType::POOL;
 }
 
-void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          const ICLTensor        *indices,
+                                          ICLTensor              *output,
+                                          const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), pool_info, indices->info()));
-    auto padding_info = get_padding_info({ input, indices, output });
+    auto padding_info = get_padding_info({input, indices, output});
 
     _input   = input;
     _output  = output;
@@ -119,7 +128,10 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLMaxUnpoolingLayerKernel::validate(const ITensorInfo      *input,
+                                           const ITensorInfo      *indices,
+                                           const ITensorInfo      *output,
+                                           const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, pool_info, indices));
@@ -140,7 +152,6 @@
         add_3D_tensor_argument(idx, _output, slice);
         add_3D_tensor_argument(idx, _indices, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
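
Reviewer note, not part of the patch: the `} while (...)` reflow above recurs in every run() override this commit touches. Reconstructed around the visible context (the slice setup and the first argument line fall outside the hunk and are assumed to take their usual form), the idiom reads:

    Window slice = window.first_slice_window_3D();
    do
    {
        unsigned int idx = 0;
        add_3D_tensor_argument(idx, _input, slice);
        add_3D_tensor_argument(idx, _output, slice);
        add_3D_tensor_argument(idx, _indices, slice);
        enqueue(queue, *this, slice, lws_hint());
    } while (window.slide_window_slice_3D(slice));

Only the brace and `while` placement change; the enqueue-per-3D-slice behaviour is untouched.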
diff --git a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
index 45481d0..eb18a46 100644
--- a/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h
@@ -59,7 +59,11 @@
      * @param[out] output          Destination tensor. Data types supported: Same as @p input.
      * @param[in]  pool_info       Contains pooling operation information described in @ref PoolingLayerInfo.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const ICLTensor        *indices,
+                   ICLTensor              *output,
+                   const PoolingLayerInfo &pool_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLMaxUnpoolingLayerKernel
      *
      * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -72,7 +76,10 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo      *input,
+                           const ITensorInfo      *indices,
+                           const ITensorInfo      *output,
+                           const PoolingLayerInfo &pool_info);
 
     // Inherited methods overridden
     void run(const Window &window, cl::CommandQueue &queue) override;
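
For reference while reviewing the re-wrapped declarations above, a minimal sketch of calling the static validate(); it is illustrative only and not part of this patch, and the shapes, the U32 indices type, and the PoolingLayerInfo constructor form are assumptions of the sketch:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"

    #include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"

    using namespace arm_compute;

    Status validate_max_unpool_2x2()
    {
        // Pooled source, its max indices, and the unpooled destination (shapes assumed).
        const TensorInfo src(TensorShape(4U, 4U, 3U), 1, DataType::F32);
        const TensorInfo indices(TensorShape(4U, 4U, 3U), 1, DataType::U32);
        const TensorInfo dst(TensorShape(8U, 8U, 3U), 1, DataType::F32);

        // Only PoolingType::MAX with a 2x2 pool size passes validate_arguments().
        const PoolingLayerInfo pool_info(PoolingType::MAX, Size2D(2, 2), DataLayout::NCHW, PadStrideInfo(2, 2, 0, 0));

        return CLMaxUnpoolingLayerKernel::validate(&src, &indices, &dst, pool_info);
    }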
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
index ac33468..8632bdf 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -49,7 +50,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -69,15 +70,19 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
 }
 
-void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
+void CLMeanStdDevNormalizationKernel::configure(const CLCompileContext &compile_context,
+                                                ICLTensor              *input,
+                                                ICLTensor              *output,
+                                                float                   epsilon)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
     _run_in_place = (output == nullptr) || (output == input);
 
-    ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+    ARM_COMPUTE_ERROR_THROW_ON(CLMeanStdDevNormalizationKernel::validate(
+        input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
 
-    if(output != nullptr)
+    if (output != nullptr)
     {
         auto_init_if_empty(*output->info(), *input->info());
     }
@@ -85,7 +90,8 @@
     _input  = input;
     _output = output;
 
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0));
 
     // Set build options
     CLBuildOptions build_opts;
@@ -134,7 +140,6 @@
         add_2D_tensor_argument_if((!_run_in_place), idx, _output, slice);
 
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
+    } while (window.slide_window_slice_2D(slice));
 }
 } // namespace arm_compute
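
A short sketch of the in-place path made explicit by `_run_in_place` above; illustrative only, with the 2-D shape chosen arbitrarily:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"

    #include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"

    using namespace arm_compute;

    Status validate_in_place_msd_norm()
    {
        // A 2-D [width, batch] tensor; F16 and F32 are the supported types.
        const TensorInfo src(TensorShape(128U, 16U), 1, DataType::F32);

        // A nullptr output selects in-place computation; epsilon keeps the
        // division defined for zero-variance rows.
        return CLMeanStdDevNormalizationKernel::validate(&src, nullptr, 1e-8f);
    }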
diff --git a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
index a1ba2b9..e02a3c5 100644
--- a/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
+++ b/src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h
@@ -66,7 +66,10 @@
      * @param[out]     output          (Optional) Destination tensor. It can be nullptr in case of in-place computation. Data type supported: same as @p input
      * @param[in]      epsilon         (Optional) Small float to avoid division by zero in case of zero standard deviation. Defaults to 1e-8.
      */
-    void configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output = nullptr, float epsilon = 1e-8f);
+    void configure(const CLCompileContext &compile_context,
+                   ICLTensor              *input,
+                   ICLTensor              *output  = nullptr,
+                   float                   epsilon = 1e-8f);
     /** Static function to check if given info will lead to a valid configuration of @ref CLMeanStdDevNormalizationKernel
      *
      * @param[in] input   Source tensor info with 2 dimensions. In case of @p output tensor info = nullptr,
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
index c6c4229..b636c48 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -53,7 +54,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -63,7 +64,8 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, NormalizationLayerInfo norm_info)
 {
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output, *input->clone());
@@ -71,9 +73,10 @@
     bool             window_changed = false;
     Window           win;
     const DataLayout data_layout = input->data_layout();
-    if(data_layout == DataLayout::NCHW)
+    if (data_layout == DataLayout::NCHW)
     {
-        const unsigned int vec_size_x           = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
+        const unsigned int vec_size_x =
+            adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
         const unsigned int norm_idx             = get_normalization_dimension_index(input->data_layout(), norm_info);
         const bool         is_norm_across_width = norm_idx == 0;
 
@@ -87,15 +90,16 @@
         // The output has 1 right padding because of the vec_size_x.
         // The input has 1 left padding because radius = 1.
         // The input has 2 right padding because of radius = 1 AND because of the extra output padding
-        const unsigned int border_width_left  = is_norm_across_width ? norm_radius : 0;
-        const unsigned int border_width_right = is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0;
-        const BorderSize   border_size        = BorderSize(0, border_width_right, 0, border_width_left);
+        const unsigned int border_width_left = is_norm_across_width ? norm_radius : 0;
+        const unsigned int border_width_right =
+            is_norm_across_width ? norm_radius + (vec_size_x - input->dimension(0) % vec_size_x) : 0;
+        const BorderSize border_size = BorderSize(0, border_width_right, 0, border_width_left);
 
         win = calculate_max_window(*input, Steps(vec_size_x));
 
         // We do not use a Rectangle window for IN_MAP_2D as we clamp the top and bottom accesses inside the kernel, avoiding padding
         // Reads can occur within the valid region of the input
-        if(is_norm_across_width)
+        if (is_norm_across_width)
         {
             AccessWindowStatic input_access(input, -border_size.left, 0, input->dimension(0) + border_size.right, 0);
             window_changed = window_changed || update_window_and_padding(win, input_access);
@@ -112,13 +116,14 @@
     else
     {
         unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / input->element_size(), input->dimension(0));
-        if(norm_info.is_cross_map())
+        if (norm_info.is_cross_map())
         {
             vec_size_x = 1;
         }
         win = calculate_max_window(*input, Steps(vec_size_x));
     }
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
@@ -139,10 +144,13 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
 }
 
-void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info)
+void CLNormalizationLayerKernel::configure(const CLCompileContext &compile_context,
+                                           const ICLTensor        *input,
+                                           ICLTensor              *output,
+                                           NormalizationLayerInfo  norm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), norm_info));
@@ -152,16 +160,17 @@
     _input  = input;
     _output = output;
 
-    const DataLayout data_layout          = input->info()->data_layout();
-    unsigned int     vec_size_x           = adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
-    int              vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
-    if(norm_info.is_cross_map() && data_layout == DataLayout::NHWC)
+    const DataLayout data_layout = input->info()->data_layout();
+    unsigned int     vec_size_x =
+        adjust_vec_size(max_cl_vector_width / input->info()->element_size(), input->info()->dimension(0));
+    int vec_size_x_leftovers = input->info()->dimension(0) % vec_size_x;
+    if (norm_info.is_cross_map() && data_layout == DataLayout::NHWC)
     {
         vec_size_x           = 1;
         vec_size_x_leftovers = 0;
     }
 
-    if(data_layout == DataLayout::NCHW)
+    if (data_layout == DataLayout::NCHW)
     {
         const unsigned int norm_idx    = get_normalization_dimension_index(data_layout, norm_info);
         _is_norm_across_width          = norm_idx == 0;
@@ -175,9 +184,10 @@
         // The output has 1 right padding because of the vec_size_x.
         // The input has 1 left padding because radius = 1.
         // The input has 2 right padding because of radius = 1 AND the extra output padding
-        const unsigned int border_width_left  = _is_norm_across_width ? norm_radius : 0;
-        const unsigned int border_width_right = _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0;
-        _border_size                          = BorderSize(0, border_width_right, 0, border_width_left);
+        const unsigned int border_width_left = _is_norm_across_width ? norm_radius : 0;
+        const unsigned int border_width_right =
+            _is_norm_across_width ? norm_radius + (vec_size_x - input->info()->dimension(0) % vec_size_x) : 0;
+        _border_size = BorderSize(0, border_width_right, 0, border_width_left);
     }
 
     const bool is_in_map_2D = (norm_info.type() == NormType::IN_MAP_2D);
@@ -193,12 +203,14 @@
     build_opts.add_option(("-DRADIUS=" + support::cpp11::to_string(norm_info.norm_size() / 2)));
     build_opts.add_option(("-DNUM_SLICES=" + support::cpp11::to_string(input->info()->dimension(2))));
     build_opts.add_option_if(is_in_map_2D, "-DIN_MAP_2D");
-    build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()), "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
-    build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC, "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1)));
+    build_opts.add_option_if(norm_info.is_in_map() || (data_layout == DataLayout::NHWC && norm_info.is_cross_map()),
+                             "-DWIDTH_SIZE=" + support::cpp11::to_string(input->info()->dimension(0)));
+    build_opts.add_option_if(norm_info.is_in_map() && data_layout == DataLayout::NHWC,
+                             "-DDIM1_SIZE=" + support::cpp11::to_string(input->info()->dimension(1)));
 
     // Create kernel
     std::string kernel_name;
-    if(norm_info.is_in_map())
+    if (norm_info.is_in_map())
     {
         kernel_name = "normalization_layer_in_map_" + lower_string(string_from_data_layout(data_layout));
     }
@@ -222,16 +234,19 @@
     _config_id += support::cpp11::to_string(input->info()->dimension(0));
     _config_id += "_";
     _config_id += support::cpp11::to_string(input->info()->dimension(1));
-    if(data_layout == DataLayout::NHWC)
+    if (data_layout == DataLayout::NHWC)
     {
         ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
     }
 }
 
-Status CLNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info)
+Status CLNormalizationLayerKernel::validate(const ITensorInfo     *input,
+                                            const ITensorInfo     *output,
+                                            NormalizationLayerInfo norm_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, norm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), output->clone().get(), norm_info).first);
 
     return Status{};
 }
@@ -251,7 +266,6 @@
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
+    } while (window_collapsed.slide_window_slice_3D(slice));
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
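
Illustrative sketch (not part of this patch) of driving the re-wrapped validate() above; the NormalizationLayerInfo constructor form is an assumption:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"

    #include "src/core/CL/kernels/CLNormalizationLayerKernel.h"

    using namespace arm_compute;

    Status validate_cross_map_norm()
    {
        const TensorInfo src(TensorShape(16U, 16U, 8U), 1, DataType::F32);
        const TensorInfo dst(TensorShape(16U, 16U, 8U), 1, DataType::F32);

        // norm_size must be odd, as validate_arguments() above enforces.
        const NormalizationLayerInfo norm_info(NormType::CROSS_MAP, 5);

        return CLNormalizationLayerKernel::validate(&src, &dst, norm_info);
    }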
diff --git a/src/core/CL/kernels/CLNormalizationLayerKernel.h b/src/core/CL/kernels/CLNormalizationLayerKernel.h
index 739a2ae..5517ba6 100644
--- a/src/core/CL/kernels/CLNormalizationLayerKernel.h
+++ b/src/core/CL/kernels/CLNormalizationLayerKernel.h
@@ -63,7 +63,10 @@
      *                             Data layouts supported: same as @p input.
      * @param[in]  norm_info       Normalization layer information like the normalization type, normalization size and other parameters.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, NormalizationLayerInfo norm_info);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   NormalizationLayerInfo  norm_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizationLayerKernel
      *
      * @param[in] input     Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -77,7 +80,7 @@
     static Status validate(const ITensorInfo *input, const ITensorInfo *output, NormalizationLayerInfo norm_info);
 
     // Inherited methods overridden:
-    void run(const Window &window, cl::CommandQueue &queue) override;
+    void       run(const Window &window, cl::CommandQueue &queue) override;
     BorderSize border_size() const override;
 
 private:
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
index 6b0400d..59352a8 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.cpp
@@ -31,32 +31,35 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, std);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, std);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(mean->num_dimensions() > 1, "mean and std must be vectors");
 
-    const unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
+    const unsigned int channel_idx =
+        get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != mean->dimension(0));
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -77,7 +80,8 @@
 
     bool window_changed = update_window_and_padding(win, input_access, output_access);
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
@@ -88,12 +92,19 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const ICLTensor *input,
+                                                ICLTensor       *output,
+                                                const ICLTensor *mean,
+                                                const ICLTensor *std)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
 }
 
-void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayerKernel::configure(const CLCompileContext &compile_context,
+                                                const ICLTensor        *input,
+                                                ICLTensor              *output,
+                                                const ICLTensor        *mean,
+                                                const ICLTensor        *std)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, mean, std);
@@ -102,7 +113,7 @@
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), *input->info()->clone());
 
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input  = input;
     _output = output;
@@ -112,9 +123,10 @@
     const DataLayout data_layout = input->info()->data_layout();
 
     // Get number of elements to process per iterations
-    const unsigned int num_elems_processed_per_iteration = (data_layout == DataLayout::NHWC) ? adjust_vec_size(16 / input->info()->element_size(),
-                                                                                                               input->info()->dimension(0)) :
-                                                           (16 / input->info()->element_size());
+    const unsigned int num_elems_processed_per_iteration =
+        (data_layout == DataLayout::NHWC)
+            ? adjust_vec_size(16 / input->info()->element_size(), input->info()->dimension(0))
+            : (16 / input->info()->element_size());
     const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
     const DataType     dt          = input->info()->data_type();
 
@@ -122,11 +134,12 @@
     CLBuildOptions build_opts;
     build_opts.add_option(("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)));
     build_opts.add_option(("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)));
-    build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)));
+    build_opts.add_option(("-DVEC_SIZE_LEFTOVER=" +
+                           support::cpp11::to_string(input->info()->dimension(0) % num_elems_processed_per_iteration)));
     build_opts.add_option(("-DNUM_CHANNELS=" + support::cpp11::to_string(input->info()->dimension(channel_idx))));
 
     std::string kernel_name = "normalize_planar_yuv_layer_";
-    if(is_data_type_quantized(dt))
+    if (is_data_type_quantized(dt))
     {
         const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform();
         build_opts.add_option(("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)));
@@ -139,7 +152,7 @@
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Configure kernel window
-    if(data_layout == DataLayout::NHWC)
+    if (data_layout == DataLayout::NHWC)
     {
         Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
         ICLKernel::configure_internal(win);
@@ -165,12 +178,16 @@
     _config_id += support::cpp11::to_string(input->info()->dimension(2));
 }
 
-Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std)
+Status CLNormalizePlanarYUVLayerKernel::validate(const ITensorInfo *input,
+                                                 const ITensorInfo *output,
+                                                 const ITensorInfo *mean,
+                                                 const ITensorInfo *std)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, std));
-    if(input->data_layout() == DataLayout::NCHW)
+    if (input->data_layout() == DataLayout::NCHW)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_and_configure_window_nchw(input->clone().get(), output->clone().get()).first);
     }
     return Status{};
 }
@@ -196,7 +213,6 @@
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
index 6db4433..341b404 100644
--- a/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
+++ b/src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h
@@ -67,7 +67,11 @@
      * @param[in]  std             Standard deviation values tensor. 1 dimension with size equal to the number of input channels.
      *                             Data types supported: same as @p input
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   const ICLTensor        *mean,
+                   const ICLTensor        *std);
     /** Static function to check if given info will lead to a valid configuration of @ref CLNormalizePlanarYUVLayerKernel
      *
      * @param[in]  input  Source tensor info. 3 lower dimensions represent a single input with dimensions [width, height, channels].
@@ -79,7 +83,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *std);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
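
Illustrative sketch of satisfying the mean/std channel checks in validate_arguments() above; the shapes and layout are assumptions:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"

    #include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"

    using namespace arm_compute;

    Status validate_normalize_planar_yuv()
    {
        // NHWC: channels sit on dimension 0; mean and std are 1-D vectors
        // carrying one value per channel.
        TensorInfo src(TensorShape(3U, 224U, 224U), 1, DataType::F32);
        src.set_data_layout(DataLayout::NHWC);
        const TensorInfo dst = src;

        const TensorInfo mean(TensorShape(3U), 1, DataType::F32);
        const TensorInfo std(TensorShape(3U), 1, DataType::F32);

        return CLNormalizePlanarYUVLayerKernel::validate(&src, &dst, &mean, &std);
    }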
diff --git a/src/core/CL/kernels/CLPadLayerKernel.cpp b/src/core/CL/kernels/CLPadLayerKernel.cpp
index 53f313c..0ac2850 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPadLayerKernel.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -35,25 +36,29 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *output,
+                          const PaddingList &padding,
+                          PixelValue         constant_value,
+                          PaddingMode        mode)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_UNUSED(constant_value);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON((padding.size() < 1) || (padding.size() > input->num_dimensions()));
-    if(mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
+    if (mode == PaddingMode::REFLECT || mode == PaddingMode::SYMMETRIC)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 3);
 
         const auto is_reflect = static_cast<unsigned int>(mode == PaddingMode::REFLECT);
-        for(size_t i = 0; i < padding.size(); ++i)
+        for (size_t i = 0; i < padding.size(); ++i)
         {
             ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).first > (input->dimension(i) - is_reflect));
             ARM_COMPUTE_RETURN_ERROR_ON(padding.at(i).second > (input->dimension(i) - is_reflect));
         }
     }
 
-    if(output->total_size() > 0)
+    if (output->total_size() > 0)
     {
         TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
 
@@ -65,41 +70,51 @@
 }
 } // namespace
 
-CLPadLayerKernel::CLPadLayerKernel()
-    : _input(nullptr), _output(nullptr), _4d_enabled(false)
+CLPadLayerKernel::CLPadLayerKernel() : _input(nullptr), _output(nullptr), _4d_enabled(false)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLPadLayerKernel::configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(
+    const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
 }
 
-void CLPadLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayerKernel::configure(const CLCompileContext &compile_context,
+                                 const ICLTensor        *input,
+                                 ICLTensor              *output,
+                                 const PaddingList      &padding,
+                                 PixelValue              constant_value,
+                                 PaddingMode             mode)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding)));
+    auto_init_if_empty(*output->info(),
+                       input->info()->clone()->set_tensor_shape(
+                           misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding)));
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), padding, constant_value, mode));
 
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input      = input;
     _output     = output;
     _4d_enabled = (mode == PaddingMode::CONSTANT) && (padding.size() > 3);
 
     // Set build options
-    const DataType    &data_type               = input->info()->data_type();
-    const unsigned int input_width             = input->info()->dimension(0);
-    const unsigned int input_height            = input->info()->dimension(1);
-    const unsigned int input_depth             = input->info()->dimension(2);
-    const unsigned int pad_x_before            = padding.at(0).first;
-    const unsigned int pad_y_before            = padding.size() > 1 ? padding.at(1).first : 0;
-    const unsigned int pad_z_before            = padding.size() > 2 ? padding.at(2).first : 0;
-    const unsigned int vec_size                = adjust_vec_size(std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))), input_width);
-    const unsigned int pad_right_start         = input_width + pad_x_before;
-    const unsigned int pad_x_before_remainder  = pad_x_before % vec_size;
-    const unsigned int vec_size_leftover_write = vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0));
+    const DataType    &data_type    = input->info()->data_type();
+    const unsigned int input_width  = input->info()->dimension(0);
+    const unsigned int input_height = input->info()->dimension(1);
+    const unsigned int input_depth  = input->info()->dimension(2);
+    const unsigned int pad_x_before = padding.at(0).first;
+    const unsigned int pad_y_before = padding.size() > 1 ? padding.at(1).first : 0;
+    const unsigned int pad_z_before = padding.size() > 2 ? padding.at(2).first : 0;
+    const unsigned int vec_size     = adjust_vec_size(
+            std::min(16U, 32U / static_cast<unsigned int>(element_size_from_data_type(input->info()->data_type()))),
+            input_width);
+    const unsigned int pad_right_start        = input_width + pad_x_before;
+    const unsigned int pad_x_before_remainder = pad_x_before % vec_size;
+    const unsigned int vec_size_leftover_write =
+        vec_size - (ceil_to_multiple(output->info()->dimension(0), vec_size) - output->info()->dimension(0));
 
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
@@ -108,12 +123,12 @@
     build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width));
     build_opts.add_option("-DPAD_X_BEFORE_REMAINDER=" + support::cpp11::to_string(pad_x_before_remainder));
     build_opts.add_option("-DVEC_SIZE_LEFTOVER_WRITE=" + support::cpp11::to_string(vec_size_leftover_write));
-    if(padding.size() > 1)
+    if (padding.size() > 1)
     {
         build_opts.add_option("-DPAD_Y_BEFORE=" + support::cpp11::to_string(pad_y_before));
         build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height));
 
-        if(padding.size() > 2)
+        if (padding.size() > 2)
         {
             build_opts.add_option("-DPAD_Z_BEFORE=" + support::cpp11::to_string(pad_z_before));
             build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_depth));
@@ -121,23 +136,25 @@
     }
 
     std::string kernel_name = "pad_layer_";
-    switch(mode)
+    switch (mode)
     {
         case PaddingMode::CONSTANT:
         {
             kernel_name += "constant";
 
-            const unsigned int vec_size_leftover_read = vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start);
+            const unsigned int vec_size_leftover_read =
+                vec_size - (ceil_to_multiple(pad_right_start, vec_size) - pad_right_start);
 
             build_opts.add_option("-DCONST_VAL=" + string_from_pixel_value(constant_value, data_type));
             build_opts.add_option("-DVEC_SIZE_LEFTOVER_READ=" + support::cpp11::to_string(vec_size_leftover_read));
 
-            if(pad_x_before >= vec_size)
+            if (pad_x_before >= vec_size)
             {
                 build_opts.add_option("-DTHREADS_TO_SKIP_BEFORE=" + support::cpp11::to_string(pad_x_before / vec_size));
-                build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" + support::cpp11::to_string(pad_right_start / vec_size));
+                build_opts.add_option("-DTHREADS_TO_SKIP_AFTER=" +
+                                      support::cpp11::to_string(pad_right_start / vec_size));
             }
-            if(_4d_enabled)
+            if (_4d_enabled)
             {
                 build_opts.add_option("-DPAD_W_BEFORE=" + support::cpp11::to_string(padding.at(3).first));
                 build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(input->info()->dimension(3)));
@@ -154,14 +171,17 @@
 
             const unsigned int pad_x_after_remainder = pad_right_start % vec_size;
             const unsigned int after_pad_fact_x      = (2 * input_width + pad_x_before) - is_reflect;
-            const unsigned int output_last_x         = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
+            const unsigned int output_last_x = ceil_to_multiple(pad_right_start + padding.at(0).second, vec_size);
 
             build_opts.add_option("-DIS_REFLECT=" + support::cpp11::to_string(is_reflect));
             build_opts.add_option("-DPAD_X_AFTER_REMAINDER=" + support::cpp11::to_string(pad_x_after_remainder));
-            build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
-            build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" + support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
+            build_opts.add_option("-DPAD_X_BEFORE_REMAINDER_REFL=" +
+                                  support::cpp11::to_string((pad_x_before_remainder + is_reflect) % vec_size));
+            build_opts.add_option("-DPAD_X_AFTER_REMAINDER_REFL=" +
+                                  support::cpp11::to_string((pad_x_after_remainder - is_reflect) % vec_size));
             build_opts.add_option("-DAFTER_PAD_FACT_X=" + support::cpp11::to_string(after_pad_fact_x));
-            build_opts.add_option_if(after_pad_fact_x < output_last_x, "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
+            build_opts.add_option_if(after_pad_fact_x < output_last_x,
+                                     "-DAFTER_PAD_REM=" + support::cpp11::to_string(after_pad_fact_x % vec_size));
 
             break;
         }
@@ -179,7 +199,11 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status CLPadLayerKernel::validate(const ITensorInfo *input,
+                                  const ITensorInfo *output,
+                                  const PaddingList &padding,
+                                  PixelValue         constant_value,
+                                  PaddingMode        mode)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, constant_value, mode));
     return Status{};
@@ -197,13 +221,12 @@
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
-        if(_4d_enabled)
+        if (_4d_enabled)
         {
             add_argument<unsigned int>(idx, batch++);
         }
 
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLPadLayerKernel.h b/src/core/CL/kernels/CLPadLayerKernel.h
index 90af337..dca121b 100644
--- a/src/core/CL/kernels/CLPadLayerKernel.h
+++ b/src/core/CL/kernels/CLPadLayerKernel.h
@@ -56,7 +56,11 @@
      * @param[in]  mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
      *                            or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
      */
-    void configure(const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+    void configure(const ICLTensor   *input,
+                   ICLTensor         *output,
+                   const PaddingList &padding,
+                   PixelValue         constant_value = PixelValue(),
+                   PaddingMode        mode           = PaddingMode::CONSTANT);
     /** Set the input and output tensor.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -68,8 +72,12 @@
      * @param[in]  mode            (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
      *                             or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value = PixelValue(),
-                   PaddingMode mode = PaddingMode::CONSTANT);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   const PaddingList      &padding,
+                   PixelValue              constant_value = PixelValue(),
+                   PaddingMode             mode           = PaddingMode::CONSTANT);
     /** Static function to check if given info will lead to a valid configuration of @ref CLPadLayerKernel
      *
      * @param[in] input          Source tensor info. Data types supported: All.
@@ -80,7 +88,11 @@
      * @param[in] mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT,
      *                            or reflect the input, either including the border values (SYMMETRIC) or not (REFLECT).
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value = PixelValue(), PaddingMode mode = PaddingMode::CONSTANT);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           const PaddingList &padding,
+                           PixelValue         constant_value = PixelValue(),
+                           PaddingMode        mode           = PaddingMode::CONSTANT);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
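
Illustrative sketch of the CONSTANT-mode contract documented above; the source shape is an assumption, while compute_padded_shape() is the same helper the kernel's configure() uses:

    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"

    #include "src/core/CL/kernels/CLPadLayerKernel.h"

    using namespace arm_compute;

    Status validate_constant_pad()
    {
        const TensorInfo src(TensorShape(27U, 13U, 2U), 1, DataType::F32);

        // One (before, after) pair per padded dimension: x, then y.
        const PaddingList padding = {{1U, 1U}, {2U, 2U}};

        // The destination must carry the padded shape.
        const TensorInfo dst(misc::shape_calculator::compute_padded_shape(src.tensor_shape(), padding), 1,
                             DataType::F32);

        // validate_arguments() ignores constant_value, so the default PixelValue() suffices.
        return CLPadLayerKernel::validate(&src, &dst, padding, PixelValue(), PaddingMode::CONSTANT);
    }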
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
index bf1b874..7dcdf1d 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.cpp
@@ -30,10 +30,10 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +42,10 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status validate_arguments(const ITensorInfo       *input1,
+                          const ITensorInfo       *input2,
+                          const ITensorInfo       *output,
+                          const PriorBoxLayerInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
@@ -51,10 +54,10 @@
 
     // Check variances
     const int var_size = info.variances().size();
-    if(var_size > 1)
+    if (var_size > 1)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
-        for(int i = 0; i < var_size; ++i)
+        for (int i = 0; i < var_size; ++i)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0");
         }
@@ -62,17 +65,19 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0");
 
-    if(!info.max_sizes().empty())
+    if (!info.max_sizes().empty())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(),
+                                        "Max and min sizes dimensions should match");
     }
 
-    for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+    for (unsigned int i = 0; i < info.max_sizes().size(); ++i)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i],
+                                        "Max size should be greater than min size");
     }
 
-    if(output != nullptr && output->total_size() != 0)
+    if (output != nullptr && output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
     }
@@ -80,7 +85,11 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, const PriorBoxLayerInfo &info, int num_priors)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo       *input1,
+                                                        const ITensorInfo       *input2,
+                                                        ITensorInfo             *output,
+                                                        const PriorBoxLayerInfo &info,
+                                                        int                      num_priors)
 {
     ARM_COMPUTE_UNUSED(input2);
     // Output tensor auto initialization if not yet initialized
@@ -88,10 +97,11 @@
     auto_init_if_empty(*output, output_shape, 1, input1->data_type());
 
     const unsigned int     num_elems_processed_per_iteration = 4 * num_priors;
-    Window                 win                               = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
+    Window                 win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration));
     AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
     bool                   window_changed = update_window_and_padding(win, output_access);
-    Status                 err            = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status                 err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
@@ -102,13 +112,25 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLPriorBoxLayerKernel::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const ICLTensor         *input1,
+                                      const ICLTensor         *input2,
+                                      ICLTensor               *output,
+                                      const PriorBoxLayerInfo &info,
+                                      cl::Buffer              *min,
+                                      cl::Buffer              *max,
+                                      cl::Buffer              *aspect_ratios)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info, min, max, aspect_ratios);
 }
 
-void CLPriorBoxLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min,
-                                      cl::Buffer *max, cl::Buffer *aspect_ratios)
+void CLPriorBoxLayerKernel::configure(const CLCompileContext  &compile_context,
+                                      const ICLTensor         *input1,
+                                      const ICLTensor         *input2,
+                                      ICLTensor               *output,
+                                      const PriorBoxLayerInfo &info,
+                                      cl::Buffer              *min,
+                                      cl::Buffer              *max,
+                                      cl::Buffer              *aspect_ratios)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
 
@@ -135,7 +157,7 @@
 
     int img_width  = info.img_size().x;
     int img_height = info.img_size().y;
-    if(img_width == 0 || img_height == 0)
+    if (img_width == 0 || img_height == 0)
     {
         img_width  = input2->info()->dimension(width_idx);
         img_height = input2->info()->dimension(height_idx);
@@ -143,7 +165,7 @@
 
     float step_x = info.steps()[0];
     float step_y = info.steps()[0];
-    if(step_x == 0.f || step_y == 0.f)
+    if (step_x == 0.f || step_y == 0.f)
     {
         step_x = static_cast<float>(img_width) / layer_width;
         step_y = static_cast<float>(img_height) / layer_height;
@@ -162,18 +184,20 @@
     build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(info.offset()));
     build_opts.add_option_if(info.clip(), "-DIN_PLACE");
 
-    if(info.variances().size() > 1)
+    if (info.variances().size() > 1)
     {
-        for(unsigned int i = 0; i < info.variances().size(); ++i)
+        for (unsigned int i = 0; i < info.variances().size(); ++i)
         {
-            build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(i)));
+            build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+                                  support::cpp11::to_string(info.variances().at(i)));
         }
     }
     else
     {
-        for(unsigned int i = 0; i < 4; ++i)
+        for (unsigned int i = 0; i < 4; ++i)
         {
-            build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(info.variances().at(0)));
+            build_opts.add_option("-DVARIANCE_" + support::cpp11::to_string(i) + "=" +
+                                  support::cpp11::to_string(info.variances().at(0)));
         }
     }
 
@@ -194,13 +218,17 @@
     ICLKernel::configure_internal(win_config.second);
 }
 
-Status CLPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status CLPriorBoxLayerKernel::validate(const ITensorInfo       *input1,
+                                       const ITensorInfo       *input2,
+                                       const ITensorInfo       *output,
+                                       const PriorBoxLayerInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
     const int num_priors = info.aspect_ratios().size() * info.min_sizes().size() + info.max_sizes().size();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(), output->clone().get(), info, num_priors)
-                                .first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input1->clone().get(), input2->clone().get(),
+                                                              output->clone().get(), info, num_priors)
+                                    .first);
 
     return Status{};
 }
@@ -211,8 +239,9 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
     queue.enqueueWriteBuffer(*_min, CL_TRUE, 0, _info.min_sizes().size() * sizeof(float), _info.min_sizes().data());
-    queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float), _info.aspect_ratios().data());
-    if(!_info.max_sizes().empty())
+    queue.enqueueWriteBuffer(*_aspect_ratios, CL_TRUE, 0, _info.aspect_ratios().size() * sizeof(float),
+                             _info.aspect_ratios().data());
+    if (!_info.max_sizes().empty())
     {
         queue.enqueueWriteBuffer(*_max, CL_TRUE, 0, _info.max_sizes().size() * sizeof(float), _info.max_sizes().data());
     }
diff --git a/src/core/CL/kernels/CLPriorBoxLayerKernel.h b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
index 6c369a7..a50e0c5 100644
--- a/src/core/CL/kernels/CLPriorBoxLayerKernel.h
+++ b/src/core/CL/kernels/CLPriorBoxLayerKernel.h
@@ -57,7 +57,13 @@
      * @param[in]  max           Maximum prior box values
      * @param[in]  aspect_ratios Aspect ratio values
      */
-    void configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max, cl::Buffer *aspect_ratios);
+    void configure(const ICLTensor         *input1,
+                   const ICLTensor         *input2,
+                   ICLTensor               *output,
+                   const PriorBoxLayerInfo &info,
+                   cl::Buffer              *min,
+                   cl::Buffer              *max,
+                   cl::Buffer              *aspect_ratios);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -69,8 +75,14 @@
      * @param[in]  max             Maximum prior box values
      * @param[in]  aspect_ratios   Aspect ratio values
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info, cl::Buffer *min, cl::Buffer *max,
-                   cl::Buffer *aspect_ratios);
+    void configure(const CLCompileContext  &compile_context,
+                   const ICLTensor         *input1,
+                   const ICLTensor         *input2,
+                   ICLTensor               *output,
+                   const PriorBoxLayerInfo &info,
+                   cl::Buffer              *min,
+                   cl::Buffer              *max,
+                   cl::Buffer              *aspect_ratios);
     /** Static function to check if given info will lead to a valid configuration of @ref CLPriorBoxLayerKernel
      *
      * @param[in] input1 First source tensor info. Data types supported: F32. Data layouts supported: NCHW/NHWC.
@@ -80,14 +92,17 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+    static Status validate(const ITensorInfo       *input1,
+                           const ITensorInfo       *input2,
+                           const ITensorInfo       *output,
+                           const PriorBoxLayerInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    const ICLTensor *_input1;
-    const ICLTensor *_input2;
+    const ICLTensor  *_input1;
+    const ICLTensor  *_input2;
     ICLTensor        *_output;
     PriorBoxLayerInfo _info;
     int               _num_priors;
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
index bd573e5..731fcb8 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.cpp
@@ -22,10 +22,12 @@
  * SOFTWARE.
  */
 #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -49,14 +51,19 @@
     const uint32_t temp_num_elems_processed_per_iteration = max_cl_vector_width / input->element_size();
     /* If width is less than step, make step the same as width to avoid the global size being step instead of actual width. */
     /* Alternatively, this should be fixed in arm_compute::enqueue() or arm_compute::calculate_max_window(). */
-    const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration) ? input->dimension(0) : temp_num_elems_processed_per_iteration;
+    const uint32_t num_elems_processed_per_iteration = (input->dimension(0) < temp_num_elems_processed_per_iteration)
+                                                           ? input->dimension(0)
+                                                           : temp_num_elems_processed_per_iteration;
 
     // This kernel doesn't need padding
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
     return std::make_pair(Status{}, win);
 }
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *output,
+                          const ITensorInfo *weight,
+                          const ITensorInfo *bias)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weight, bias, output);
 
@@ -72,7 +79,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias);
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -87,10 +94,14 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const CLCompileContext &compile_context,
+                                                const ICLTensor        *input,
+                                                ICLTensor              *output,
+                                                const ICLTensor        *weight,
+                                                const ICLTensor        *bias)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output);
-    auto padding_info = get_padding_info({ input, weight, bias, output });
+    auto padding_info = get_padding_info({input, weight, bias, output});
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), weight->info(), bias->info()));
 
@@ -104,7 +115,8 @@
     int32_t                       output_multiplier{};
     int32_t                       output_shift{};
     const UniformQuantizationInfo quan_info = _weight->info()->quantization_info().uniform();
-    const Status                  status    = quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
+    const Status                  status =
+        quantization::calculate_quantized_multiplier(quan_info.scale, &output_multiplier, &output_shift);
     output_shift *= -1;
 
     // Set build options
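
The multiplier/shift pair computed in the statement above follows the usual gemmlowp-style decomposition of a real-valued scale into a 31-bit fixed-point multiplier and a power-of-two shift. A self-contained sketch of that decomposition (assumed behaviour, not the library's exact implementation):

    #include <cmath>
    #include <cstdint>

    // Express 'scale' as (q_fixed / 2^31) * 2^shift, where q_fixed is a
    // signed 31-bit fixed-point multiplier.
    void quantize_multiplier(double scale, std::int32_t *q_fixed_out, int *shift)
    {
        const double q = std::frexp(scale, shift); // q in [0.5, 1), scale = q * 2^shift
        auto q_fixed   = static_cast<std::int64_t>(std::llround(q * (1LL << 31)));
        if (q_fixed == (1LL << 31)) // rounding pushed q up to exactly 1.0
        {
            q_fixed /= 2;
            ++(*shift);
        }
        *q_fixed_out = static_cast<std::int32_t>(q_fixed);
    }

The kernel then negates the shift (output_shift *= -1) so that a positive value denotes a right shift.
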
@@ -114,8 +126,12 @@
     build_opts.add_option("-DWIDTH=" + support::cpp11::to_string(input->info()->dimension(0)));
     build_opts.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
     build_opts.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
-    build_opts.add_option("-DMIN_BOUND=" + support::cpp11::to_string(std::get<0>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
-    build_opts.add_option("-DMAX_BOUND=" + support::cpp11::to_string(std::get<1>(quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+    build_opts.add_option("-DMIN_BOUND=" +
+                          support::cpp11::to_string(std::get<0>(
+                              quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
+    build_opts.add_option("-DMAX_BOUND=" +
+                          support::cpp11::to_string(std::get<1>(
+                              quantization::get_min_max_values_from_quantized_data_type(input->info()->data_type()))));
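
For the QSYMM16 input this kernel operates on, the MIN_BOUND/MAX_BOUND options above resolve to the representable range of the 16-bit storage type; a sketch of the assumed lookup:

    #include <cstdint>
    #include <limits>
    #include <utility>

    // Assumed result of get_min_max_values_from_quantized_data_type for
    // QSYMM16: the representable range of the underlying int16 storage.
    std::pair<int, int> qsymm16_bounds()
    {
        return {std::numeric_limits<std::int16_t>::min(),  // -32768
                std::numeric_limits<std::int16_t>::max()}; //  32767
    }
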
 
     // Create kernel
     _kernel = create_kernel(compile_context, "qlstm_layer_normalization", build_opts.options());
@@ -135,12 +151,18 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias)
+void CLQLSTMLayerNormalizationKernel::configure(const ICLTensor *input,
+                                                ICLTensor       *output,
+                                                const ICLTensor *weight,
+                                                const ICLTensor *bias)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, weight, bias);
 }
 
-Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status CLQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input,
+                                                 const ITensorInfo *output,
+                                                 const ITensorInfo *weight,
+                                                 const ITensorInfo *bias)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, weight, bias));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get()).first);
@@ -171,7 +193,6 @@
         add_2D_tensor_argument(idx, _output, slice);
 
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
+    } while (window.slide_window_slice_2D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
index 31085c3..ba912e1 100644
--- a/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
+++ b/src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h
@@ -63,7 +63,11 @@
      * @param[in]  weight          Weight tensor. Data types supported: Same as @p input.
      * @param[in]  bias            Bias tensor. Data types supported: S32.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *weight, const ICLTensor *bias);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   const ICLTensor        *weight,
+                   const ICLTensor        *bias);
     /** Static function to check if given info will lead to a valid configuration of @ref CLQLSTMLayerNormalizationKernel
      *
      * @param[in] input  Source tensor info with 2 dimensions. Data types supported: QSYMM16.
@@ -73,7 +77,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
index 69a6fa5..c97910e 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -42,24 +43,29 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo         *input,
+                          const ITensorInfo         *rois,
+                          ITensorInfo               *output,
+                          const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
     ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
     ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F32, DataType::F16);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
     ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info),
+                                                           output->tensor_shape());
     }
 
-    if(is_data_type_quantized_asymmetric(input->data_type()))
+    if (is_data_type_quantized_asymmetric(input->data_type()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16);
 
@@ -82,12 +88,19 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLROIAlignLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const ICLTensor           *input,
+                                      const ICLTensor           *rois,
+                                      ICLTensor                 *output,
+                                      const ROIPoolingLayerInfo &pool_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
 }
 
-void CLROIAlignLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayerKernel::configure(const CLCompileContext    &compile_context,
+                                      const ICLTensor           *input,
+                                      const ICLTensor           *rois,
+                                      ICLTensor                 *output,
+                                      const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
@@ -97,7 +110,7 @@
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
     output->info()->set_data_layout(input->info()->data_layout());
 
-    auto padding_info = get_padding_info({ input, rois, output });
+    auto padding_info = get_padding_info({input, rois, output});
 
     _input     = input;
     _output    = output;
@@ -111,16 +124,23 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DDATA_SIZE=" + get_data_size_from_data_type(input->info()->data_type()));
-    build_opts.add_option("-DMAX_DIM_X=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH))));
-    build_opts.add_option("-DMAX_DIM_Y=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
-    build_opts.add_option("-DMAX_DIM_Z=" + support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
+    build_opts.add_option("-DMAX_DIM_X=" +
+                          support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+                              input->info()->data_layout(), DataLayoutDimension::WIDTH))));
+    build_opts.add_option("-DMAX_DIM_Y=" +
+                          support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+                              input->info()->data_layout(), DataLayoutDimension::HEIGHT))));
+    build_opts.add_option("-DMAX_DIM_Z=" +
+                          support::cpp11::to_string(_input->info()->dimension(get_data_layout_dimension_index(
+                              input->info()->data_layout(), DataLayoutDimension::CHANNEL))));
     build_opts.add_option("-DPOOLED_DIM_X=" + support::cpp11::to_string(pool_info.pooled_width()));
     build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
     build_opts.add_option("-DSPATIAL_SCALE=" + float_to_string_with_full_precision(pool_info.spatial_scale()));
     build_opts.add_option_if(input->info()->data_layout() == DataLayout::NHWC, "-DNHWC");
-    build_opts.add_option_if(pool_info.sampling_ratio() > 0, "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
+    build_opts.add_option_if(pool_info.sampling_ratio() > 0,
+                             "-DSAMPLING_RATIO=" + support::cpp11::to_string(pool_info.sampling_ratio()));
 
-    if(is_qasymm)
+    if (is_qasymm)
     {
         const UniformQuantizationInfo iq_info    = input->info()->quantization_info().uniform();
         const UniformQuantizationInfo roisq_info = rois->info()->quantization_info().uniform();
@@ -144,7 +164,10 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayerKernel::validate(const ITensorInfo         *input,
+                                       const ITensorInfo         *rois,
+                                       ITensorInfo               *output,
+                                       const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
     return Status{};
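
The MAX_DIM_X/Y/Z options set in configure() above depend on how get_data_layout_dimension_index maps logical axes to tensor dimensions. The assumed mapping, with dimension 0 as the fastest-moving axis:

    // Assumed axis mapping behind the MAX_DIM_* build options (sketch):
    // NCHW -> W=0, H=1, C=2; NHWC -> C=0, W=1, H=2.
    enum class Axis { Width, Height, Channel };

    int dim_index(bool nhwc, Axis a)
    {
        if (nhwc)
            return a == Axis::Channel ? 0 : (a == Axis::Width ? 1 : 2);
        return a == Axis::Width ? 0 : (a == Axis::Height ? 1 : 2);
    }
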
diff --git a/src/core/CL/kernels/CLROIAlignLayerKernel.h b/src/core/CL/kernels/CLROIAlignLayerKernel.h
index 5284a59..2e84e5d 100644
--- a/src/core/CL/kernels/CLROIAlignLayerKernel.h
+++ b/src/core/CL/kernels/CLROIAlignLayerKernel.h
@@ -61,7 +61,8 @@
      * @note The z dimensions of @p output tensor and @p input tensor must be the same.
      * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      */
-    void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+    void
+    configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -77,7 +78,11 @@
      * @note The z dimensions of @p output tensor and @p input tensor must be the same.
      * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+    void configure(const CLCompileContext    &compile_context,
+                   const ICLTensor           *input,
+                   const ICLTensor           *rois,
+                   ICLTensor                 *output,
+                   const ROIPoolingLayerInfo &pool_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLROIAlignLayerKernel
      *
      * @param[in] input     Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -93,7 +98,10 @@
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *rois,
+                           ITensorInfo               *output,
+                           const ROIPoolingLayerInfo &pool_info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue);
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
index f6933c6..1b2c414 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -48,7 +49,10 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-Status CLROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIPoolingLayerKernel::validate(const ITensorInfo         *input,
+                                         const ITensorInfo         *rois,
+                                         const ITensorInfo         *output,
+                                         const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
 
@@ -61,10 +65,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8);
     ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
+        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) ||
+                                    (output->dimension(1) != pool_info.pooled_height()));
         ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
         ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
     }
@@ -72,20 +77,30 @@
     return Status{};
 }
 
-void CLROIPoolingLayerKernel::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const ICLTensor           *input,
+                                        const ICLTensor           *rois,
+                                        ICLTensor                 *output,
+                                        const ROIPoolingLayerInfo &pool_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
 }
 
-void CLROIPoolingLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayerKernel::configure(const CLCompileContext    &compile_context,
+                                        const ICLTensor           *input,
+                                        const ICLTensor           *rois,
+                                        const ICLTensor           *output,
+                                        const ROIPoolingLayerInfo &pool_info)
 {
-    ARM_COMPUTE_ERROR_THROW_ON(CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        CLROIPoolingLayerKernel::validate(input->info(), rois->info(), output->info(), pool_info));
 
-    auto padding_info = get_padding_info({ input, rois, output });
+    auto padding_info = get_padding_info({input, rois, output});
 
     // Output auto initialization if not yet initialized
-    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1));
-    auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(), output->info()->quantization_info());
+    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2),
+                             rois->info()->dimension(1));
+    auto_init_if_empty(*(output->info()), output_shape, 1, input->info()->data_type(),
+                       output->info()->quantization_info());
 
     // Set instance variables
     _input     = input;
@@ -107,11 +122,12 @@
     build_opts.add_option("-DPOOLED_DIM_Y=" + support::cpp11::to_string(pool_info.pooled_height()));
     build_opts.add_option("-DSPATIAL_SCALE=" + support::cpp11::to_string(pool_info.spatial_scale()));
 
-    if(is_qasymm)
+    if (is_qasymm)
     {
         // Determine the requantization scale and offset
         UniformQuantizationInfo uqinfo = UniformQuantizationInfo();
-        uqinfo                         = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(), _output->info()->quantization_info().uniform());
+        uqinfo = compute_requantization_scale_offset(_input->info()->quantization_info().uniform(),
+                                                     _output->info()->quantization_info().uniform());
         build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(uqinfo.offset));
         build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(uqinfo.scale));
 
diff --git a/src/core/CL/kernels/CLROIPoolingLayerKernel.h b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
index 7b7b457..80bfb63 100644
--- a/src/core/CL/kernels/CLROIPoolingLayerKernel.h
+++ b/src/core/CL/kernels/CLROIPoolingLayerKernel.h
@@ -59,7 +59,8 @@
      * @note The z dimensions of @p output tensor and @p input tensor must be the same.
      * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      */
-    void configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+    void
+    configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
     /** Set the input and output tensors.
      *
      * @param[in]  compile_context The compile context to be used.
@@ -74,7 +75,11 @@
      * @note The z dimensions of @p output tensor and @p input tensor must be the same.
      * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info);
+    void configure(const CLCompileContext    &compile_context,
+                   const ICLTensor           *input,
+                   const ICLTensor           *rois,
+                   const ICLTensor           *output,
+                   const ROIPoolingLayerInfo &pool_info);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
@@ -92,7 +97,10 @@
      * @note The z dimensions of @p output tensor and @p input tensor must be the same.
      * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois array.
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *rois,
+                           const ITensorInfo         *output,
+                           const ROIPoolingLayerInfo &pool_info);
 
 private:
     const ICLTensor    *_input;
diff --git a/src/core/CL/kernels/CLRangeKernel.cpp b/src/core/CL/kernels/CLRangeKernel.cpp
index a06c2ee..622f621 100644
--- a/src/core/CL/kernels/CLRangeKernel.cpp
+++ b/src/core/CL/kernels/CLRangeKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -42,11 +43,8 @@
 Status validate_arguments(const ITensorInfo *output, const float start, const float end, const float step)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output,
-                                                         1,
-                                                         DataType::U8, DataType::S8, DataType::QASYMM8,
-                                                         DataType::U16, DataType::S16,
-                                                         DataType::U32, DataType::S32,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::U16, DataType::S16, DataType::U32, DataType::S32,
                                                          DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(output);
 
@@ -56,19 +54,22 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()), "start value is outside the range of the data type");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()), "end value is outside the range of the data type");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()), "step value is outside the range of the data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output->data_type(), output->quantization_info()),
+                                    "start value is outside the range of the data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output->data_type(), output->quantization_info()),
+                                    "end value is outside the range of the data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output->data_type(), output->quantization_info()),
+                                    "step value is outside the range of the data type");
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->num_dimensions() != 1, "Output has to be a 1-D tensor");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->tensor_shape().total_size() < num_of_elements_in_range(start, end, step),
+                                    "Output tensor size is incorrect");
 
     return Status{};
 }
 } // namespace
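
The size check above compares the output length against an arange-style element count. A minimal sketch of the assumed semantics of num_of_elements_in_range:

    #include <cmath>
    #include <cstddef>

    // Number of values produced by the half-open sequence [start, end)
    // advancing by 'step' (sketch; assumes a valid, non-zero step).
    std::size_t elements_in_range(float start, float end, float step)
    {
        return static_cast<std::size_t>(std::ceil((end - start) / step));
    }
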
 
-CLRangeKernel::CLRangeKernel()
-    : _start(0), _end(1), _step(1), _output(nullptr)
+CLRangeKernel::CLRangeKernel() : _start(0), _end(1), _step(1), _output(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
@@ -78,16 +79,18 @@
     configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
 }
 
-void CLRangeKernel::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+void CLRangeKernel::configure(
+    const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(output->info(), start, end, step));
 
     // Configure kernel window
-    unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0));
-    Window       win                               = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
+    unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / output->info()->element_size(), output->info()->dimension(0));
+    Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration));
 
-    auto padding_info = get_padding_info({ output });
+    auto padding_info = get_padding_info({output});
 
     _start  = start;
     _end    = end;
@@ -100,10 +103,11 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type()));
     build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(output->info()->dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DSTART=" + support::cpp11::to_string(start));
     build_opts.add_option("-DSTEP=" + support::cpp11::to_string(step));
-    if(is_data_type_quantized_asymmetric(output->info()->data_type()))
+    if (is_data_type_quantized_asymmetric(output->info()->data_type()))
     {
         const UniformQuantizationInfo qinfo = output->info()->quantization_info().uniform();
         build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(qinfo.offset));
diff --git a/src/core/CL/kernels/CLRangeKernel.h b/src/core/CL/kernels/CLRangeKernel.h
index 1b94a09..65251a1 100644
--- a/src/core/CL/kernels/CLRangeKernel.h
+++ b/src/core/CL/kernels/CLRangeKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLRANGEKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.cpp b/src/core/CL/kernels/CLReductionOperationKernel.cpp
index e5cfb99..70875a2 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.cpp
+++ b/src/core/CL/kernels/CLReductionOperationKernel.cpp
@@ -28,15 +28,15 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -47,23 +47,28 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    if(input->num_channels() == 1)
+    if (input->num_channels() == 1)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                             DataType::S32, DataType::F16, DataType::F32);
     }
     else
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F16, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON(axis == 0);
     }
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8, "Not supported reduction operation for QASYMM8");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op == ReductionOperation::SUM_SQUARE && input->data_type() == DataType::QASYMM8,
+                                    "Not supported reduction operation for QASYMM8");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+                                    "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
-    ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) && (input->data_type() != DataType::QASYMM8)
-                                && (input->data_type() != DataType::QASYMM8_SIGNED));
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN), "Not supported reduction operation, use CLArgMinMaxLayer");
+    ARM_COMPUTE_RETURN_ERROR_ON((op == ReductionOperation::MEAN_SUM) && (axis == 0) && (input->dimension(0) == 0) &&
+                                (input->data_type() != DataType::QASYMM8) &&
+                                (input->data_type() != DataType::QASYMM8_SIGNED));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN),
+                                    "Not supported reduction operation, use CLArgMinMaxLayer");
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -79,33 +84,42 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLReductionOperationKernel::configure(const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLReductionOperationKernel::configure(const ICLTensor   *input,
+                                           ICLTensor         *output,
+                                           unsigned int       axis,
+                                           ReductionOperation op)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op);
 }
 
-void CLReductionOperationKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op)
+void CLReductionOperationKernel::configure(const CLCompileContext &compile_context,
+                                           const ICLTensor        *input,
+                                           ICLTensor              *output,
+                                           unsigned int            axis,
+                                           ReductionOperation      op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), axis, op));
 
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input          = input;
     _output         = output;
     _reduction_axis = axis;
     _op             = op;
 
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true);
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true));
+    const TensorShape output_shape =
+        arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, true);
+    auto_init_if_empty(*output->info(),
+                       input->info()->clone()->set_tensor_shape(output_shape).reset_padding().set_is_resizable(true));
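
With keep_dims semantics (the trailing 'true' argument above), compute_reduced_shape collapses the reduced axis to extent 1 and leaves the remaining extents untouched; as a sketch of that assumption:

    #include <cstddef>
    #include <vector>

    // Assumed effect of compute_reduced_shape(shape, axis, /*keep_dims=*/true).
    std::vector<std::size_t> reduced_shape(std::vector<std::size_t> shape, std::size_t axis)
    {
        shape[axis] = 1; // reduced axis collapses to a single element
        return shape;
    }
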
 
     // Set build options
     CLBuildOptions build_opts;
     DataType       data_type = input->info()->data_type();
     std::string    data_type_promoted{};
 
-    if(is_data_type_quantized(data_type))
+    if (is_data_type_quantized(data_type))
     {
         data_type_promoted = "int";
     }
@@ -130,10 +144,14 @@
     build_opts.add_option_if(op == ReductionOperation::PROD, "-DPROD");
     build_opts.add_option_if(op == ReductionOperation::MIN, "-DMIN");
     build_opts.add_option_if(op == ReductionOperation::MAX, "-DMAX");
-    build_opts.add_option_if(is_data_type_quantized(data_type), "-DOFFSET=" + support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
-    build_opts.add_option_if(is_data_type_quantized(data_type), "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
+    build_opts.add_option_if(is_data_type_quantized(data_type),
+                             "-DOFFSET=" +
+                                 support::cpp11::to_string(input->info()->quantization_info().uniform().offset));
+    build_opts.add_option_if(
+        is_data_type_quantized(data_type),
+        "-DSCALE=" + float_to_string_with_full_precision(input->info()->quantization_info().uniform().scale));
 
-    switch(op)
+    switch (op)
     {
         case ReductionOperation::SUM_SQUARE:
             build_opts.add_option(("-DOPERATION=square_sum"));
@@ -159,7 +177,7 @@
     std::string kernel_axis_name;
     const bool  is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
 
-    switch(axis)
+    switch (axis)
     {
         case 0:
         {
@@ -187,13 +205,17 @@
 
     // Configure kernel window
     Window win = calculate_max_window(*input->info(), Steps(vec_size));
-    win.set(Window::DimX, Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step()));
+    win.set(Window::DimX,
+            Window::Dimension(win.x().start(), win.x().end() * _input->info()->num_channels(), win.x().step()));
     ICLKernel::configure_internal(win);
 
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status CLReductionOperationKernel::validate(const ITensorInfo *input,
+                                            const ITensorInfo *output,
+                                            unsigned int       axis,
+                                            ReductionOperation op)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
     return Status{};
@@ -205,18 +227,19 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
     const bool is_serial_op = needs_serialized_reduction(_op, _input->info()->data_type(), _reduction_axis);
-    switch(_reduction_axis)
+    switch (_reduction_axis)
     {
         case 0:
         {
             // We use parallel reduction only for non-quantized types
-            if(is_serial_op)
+            if (is_serial_op)
             {
                 // Get first input and output slices
-                Window window_in{ window };
-                window_in.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+                Window window_in{window};
+                window_in.set(Window::DimX,
+                              Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
 
-                Window out_window{ window };
+                Window out_window{window};
                 out_window.set(Window::DimX, Window::Dimension(0, 0, 0));
 
                 Window in_slice  = window_in.first_slice_window_1D();
@@ -228,8 +251,7 @@
                     add_1D_tensor_argument(idx, _input, in_slice);
                     add_1D_tensor_argument(idx, _output, out_slice);
                     enqueue(queue, *this, in_slice);
-                }
-                while(window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
+                } while (window_in.slide_window_slice_1D(in_slice) && out_window.slide_window_slice_1D(out_slice));
             }
             else
             {
@@ -251,8 +273,9 @@
         case 1:
         {
             // Get first input and output slices
-            Window window_in{ window };
-            window_in.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+            Window window_in{window};
+            window_in.set(Window::DimY,
+                          Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
             Window in_slice  = window_in.first_slice_window_2D();
             Window out_slice = window.first_slice_window_2D();
 
@@ -262,15 +285,15 @@
                 add_2D_tensor_argument(idx, _input, in_slice);
                 add_2D_tensor_argument(idx, _output, out_slice);
                 enqueue(queue, *this, in_slice);
-            }
-            while(window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
+            } while (window_in.slide_window_slice_2D(in_slice) && window.slide_window_slice_2D(out_slice));
         }
         break;
         case 2:
         {
             // Get first input and output slices
-            Window window_in{ window };
-            window_in.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+            Window window_in{window};
+            window_in.set(Window::DimZ,
+                          Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
             Window in_slice  = window_in.first_slice_window_3D();
             Window out_slice = window.first_slice_window_3D();
 
@@ -280,14 +303,13 @@
                 add_3D_tensor_argument(idx, _input, in_slice);
                 add_3D_tensor_argument(idx, _output, out_slice);
                 enqueue(queue, *this, in_slice);
-            }
-            while(window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
+            } while (window_in.slide_window_slice_3D(in_slice) && window.slide_window_slice_3D(out_slice));
         }
         break;
         case 3:
         {
             // Get first input and output slices
-            Window window_in{ window };
+            Window window_in{window};
             window_in.set(3, Window::Dimension(0, 1, 1));
             Window in_slice  = window_in.first_slice_window_4D();
             Window out_slice = window.first_slice_window_4D();
@@ -298,8 +320,7 @@
                 add_4D_tensor_argument(idx, _input, in_slice);
                 add_4D_tensor_argument(idx, _output, out_slice);
                 enqueue(queue, *this, in_slice);
-            }
-            while(window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
+            } while (window_in.slide_window_slice_4D(in_slice) && window.slide_window_slice_4D(out_slice));
         }
         break;
         default:
diff --git a/src/core/CL/kernels/CLReductionOperationKernel.h b/src/core/CL/kernels/CLReductionOperationKernel.h
index b456378..2f94b2a 100644
--- a/src/core/CL/kernels/CLReductionOperationKernel.h
+++ b/src/core/CL/kernels/CLReductionOperationKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLREDUCTIONOPERATIONKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -67,7 +68,11 @@
      * @param[in]  axis            Axis along which to reduce. Supported reduction axis : 0,1,2,3
      * @param[in]  op              Reduction operation to perform. Operations supported: MEAN_SUM, PROD, SUM_SQUARE, SUM, MIN, MAX
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   unsigned int            axis,
+                   ReductionOperation      op);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReductionOperationKernel.
      *
@@ -79,7 +84,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.cpp b/src/core/CL/kernels/CLReorgLayerKernel.cpp
index 3c74e80..9fd2194 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.cpp
+++ b/src/core/CL/kernels/CLReorgLayerKernel.cpp
@@ -28,9 +28,10 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -51,13 +52,16 @@
     const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
     ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0,
+                                    "The width of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0,
+                                    "The height of the input tensor must be a multiple of stride");
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+        const TensorInfo tensor_info_output =
+            output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -66,8 +70,7 @@
 }
 } // namespace
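
The divisibility checks above pin down what compute_reorg_output_shape must produce; the assumed shape transform for a stride s:

    #include <cstddef>

    // Assumed semantics of compute_reorg_output_shape for stride s: spatial
    // extents shrink by s while channels grow by s*s, preserving the total
    // element count.
    struct Shape3 { std::size_t w, h, c; };

    Shape3 reorg_shape(Shape3 in, std::size_t s)
    {
        return {in.w / s, in.h / s, in.c * s * s};
    }
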
 
-CLReorgLayerKernel::CLReorgLayerKernel()
-    : _input(nullptr), _output(nullptr)
+CLReorgLayerKernel::CLReorgLayerKernel() : _input(nullptr), _output(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
@@ -77,17 +80,22 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, stride);
 }
 
-void CLReorgLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t stride)
+void CLReorgLayerKernel::configure(const CLCompileContext &compile_context,
+                                   const ICLTensor        *input,
+                                   ICLTensor              *output,
+                                   int32_t                 stride)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), stride));
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     _input  = input;
     _output = output;
 
-    std::string  kernel_name = std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
-    const size_t idx_channel = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+    std::string kernel_name =
+        std::string("reorg_layer_") + lower_string(string_from_data_layout(input->info()->data_layout()));
+    const size_t idx_channel =
+        get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
 
     // Create kernel
     CLBuildOptions build_opts;
@@ -98,7 +106,9 @@
 
     // Configure window
     // Auto-initialize the output tensor if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
+    auto_init_if_empty(*output->info(),
+                       input->info()->clone()->set_tensor_shape(
+                           misc::shape_calculator::compute_reorg_output_shape(*input->info(), stride)));
 
     Window win = calculate_max_window(*output->info(), Steps());
 
@@ -119,7 +129,9 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input, const arm_compute::ITensorInfo *output, int32_t stride)
+Status CLReorgLayerKernel::validate(const arm_compute::ITensorInfo *input,
+                                    const arm_compute::ITensorInfo *output,
+                                    int32_t                         stride)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, stride));
 
@@ -139,7 +151,6 @@
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLReorgLayerKernel.h b/src/core/CL/kernels/CLReorgLayerKernel.h
index 455a617..f335071 100644
--- a/src/core/CL/kernels/CLReorgLayerKernel.h
+++ b/src/core/CL/kernels/CLReorgLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLREORGLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
diff --git a/src/core/CL/kernels/CLReverseKernel.cpp b/src/core/CL/kernels/CLReverseKernel.cpp
index 0d70ff4..79a0f03 100644
--- a/src/core/CL/kernels/CLReverseKernel.cpp
+++ b/src/core/CL/kernels/CLReverseKernel.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -49,7 +50,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -60,8 +61,7 @@
 }
 } // namespace
 
-CLReverseKernel::CLReverseKernel()
-    : _input(nullptr), _output(nullptr), _axis(nullptr)
+CLReverseKernel::CLReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
@@ -71,10 +71,13 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, axis);
 }
 
-void CLReverseKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverseKernel::configure(const CLCompileContext &compile_context,
+                                const ICLTensor        *input,
+                                ICLTensor              *output,
+                                const ICLTensor        *axis)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, axis);
-    auto padding_info = get_padding_info({ input, output, axis });
+    auto padding_info = get_padding_info({input, output, axis});
 
     _input  = input;
     _output = output;
@@ -138,7 +141,6 @@
         add_1D_tensor_argument(idx, _axis, axis_slice);
         add_4D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_4D(slice));
+    } while (collapsed.slide_window_slice_4D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLReverseKernel.h b/src/core/CL/kernels/CLReverseKernel.h
index 4a21e4f..fbd99dc 100644
--- a/src/core/CL/kernels/CLReverseKernel.h
+++ b/src/core/CL/kernels/CLReverseKernel.h
@@ -60,7 +60,10 @@
      * @param[out] output          Output tensor. Data type supported: Same as @p input
      * @param[in]  axis            Axis tensor. Contains the indices of the dimensions to reverse. Data type supported: U32
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   const ICLTensor        *axis);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLReverseKernel
      *
diff --git a/src/core/CL/kernels/CLSelectKernel.cpp b/src/core/CL/kernels/CLSelectKernel.cpp
index c0e014e..703c64d 100644
--- a/src/core/CL/kernels/CLSelectKernel.cpp
+++ b/src/core/CL/kernels/CLSelectKernel.cpp
@@ -30,10 +30,10 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -51,9 +51,11 @@
 
     const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
-    ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+    ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank &&
+                                ((c->tensor_shape().num_dimensions() > 1) ||
+                                 (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
@@ -63,13 +65,16 @@
 }
 } // namespace
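
The rank check above admits two layouts for the condition tensor c: either it matches x exactly, or it is 1-D and indexed by x's outermost dimension. As a sketch (shapes modelled as dimension vectors):

    #include <vector>

    // Assumed shape rule enforced by validate_arguments above.
    bool condition_shape_ok(const std::vector<int> &c, const std::vector<int> &x)
    {
        if (c.size() == x.size())
            return c == x;                        // same rank: identical shapes
        return c.size() == 1 && c[0] == x.back(); // 1-D over x's last dimension
    }
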
 
-CLSelectKernel::CLSelectKernel()
-    : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
+CLSelectKernel::CLSelectKernel() : _c(nullptr), _x(nullptr), _y(nullptr), _output(nullptr), _has_same_rank(false)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLSelectKernel::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+void CLSelectKernel::configure(const CLCompileContext &compile_context,
+                               const ICLTensor        *c,
+                               const ICLTensor        *x,
+                               const ICLTensor        *y,
+                               ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(c, x, y, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(c->info(), x->info(), y->info(), output->info()));
@@ -80,7 +85,7 @@
     _output        = output;
     _has_same_rank = (c->info()->tensor_shape().num_dimensions() == x->info()->tensor_shape().num_dimensions());
 
-    auto               padding_info         = get_padding_info({ c, x, y, output });
+    auto               padding_info         = get_padding_info({c, x, y, output});
     const unsigned int vec_size_x           = adjust_vec_size(16 / x->info()->element_size(), x->info()->dimension(0));
     const int          vec_size_x_leftovers = output->info()->dimension(0) % vec_size_x;
 
@@ -92,14 +97,14 @@
 
     // Create kernel
     std::string kernel_name = "select";
-    if(_has_same_rank)
+    if (_has_same_rank)
     {
         kernel_name += "_same_rank";
     }
     else
     {
         const bool is_input_rank_greater_than_two = x->info()->tensor_shape().num_dimensions() > 2;
-        if(is_input_rank_greater_than_two)
+        if (is_input_rank_greater_than_two)
         {
             const size_t width      = x->info()->tensor_shape().x();
             const size_t height     = x->info()->tensor_shape().y();
@@ -128,7 +133,8 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+Status
+CLSelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(c, x, y, output));
     return Status{};
@@ -142,7 +148,7 @@
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice     = collapsed.first_slice_window_3D();
 
-    if(!_has_same_rank)
+    if (!_has_same_rank)
     {
         Window vector_slice = window.first_slice_window_1D();
         vector_slice.set(Window::DimX, Window::Dimension(0, 0, 0));
@@ -153,7 +159,7 @@
     do
     {
         unsigned int idx = _has_same_rank ? 0 : num_arguments_per_1D_tensor();
-        if(_has_same_rank)
+        if (_has_same_rank)
         {
             add_3D_tensor_argument(idx, _c, slice);
         }
@@ -162,7 +168,6 @@
         add_3D_tensor_argument(idx, _output, slice);
 
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLSelectKernel.h b/src/core/CL/kernels/CLSelectKernel.h
index b8c10cd..c4256fd 100644
--- a/src/core/CL/kernels/CLSelectKernel.h
+++ b/src/core/CL/kernels/CLSelectKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLSELECTKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -60,7 +61,11 @@
      * @param[in]  y               Second input tensor. Data types supported: Same as @p x.
      * @param[out] output          Output tensor. Data types supported: Same as @p x.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *c,
+                   const ICLTensor        *x,
+                   const ICLTensor        *y,
+                   ICLTensor              *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLSelectKernel
      *
      * @param[in] c      Condition input tensor. Data types supported: U8.
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index 3632ae2..f4c0839 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -38,19 +39,22 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *block_info,
+                          const ITensorInfo *paddings,
+                          const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
     ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 });
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2});
     ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 });
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2});
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const DataLayout data_layout = input->data_layout();
         const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
@@ -61,7 +65,11 @@
 
     return Status{};
 }
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status validate_arguments_static(const ITensorInfo *input,
+                                 const int          block_shape_x,
+                                 const int          block_shape_y,
+                                 const Size2D      &padding_left,
+                                 const Size2D      &padding_right,
                                  const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -70,9 +78,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right);
+        TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+            input, block_shape_x, block_shape_y, padding_left, padding_right);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -88,16 +97,24 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+                                          const ICLTensor *block_shape,
+                                          const ICLTensor *paddings,
+                                          ICLTensor       *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output);
 }
 
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          const ICLTensor        *block_shape,
+                                          const ICLTensor        *paddings,
+                                          ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
-    auto padding_info = get_padding_info({ input, block_shape, paddings, output });
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+    auto padding_info = get_padding_info({input, block_shape, paddings, output});
 
     _input       = input;
     _block_shape = block_shape;
@@ -111,14 +128,17 @@
 
     // Create kernel
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
     build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
     build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
     build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
     build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
     build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
     build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch)));
-    _kernel = create_kernel(compile_context, "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+    _kernel = create_kernel(compile_context,
+                            "space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+                            build_opts.options());
 
     // Configure kernel window
     Window win = calculate_max_window(*output->info(), Steps());
@@ -126,22 +146,34 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
-                                          ICLTensor *output)
+void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input,
+                                          const int        block_shape_x,
+                                          const int        block_shape_y,
+                                          const Size2D    &padding_left,
+                                          const Size2D    &padding_right,
+                                          ICLTensor       *output)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left,
+              padding_right, output);
 }
 
-void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left,
-                                          const Size2D &padding_right,
-                                          ICLTensor    *output)
+void CLSpaceToBatchLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          const int               block_shape_x,
+                                          const int               block_shape_y,
+                                          const Size2D           &padding_left,
+                                          const Size2D           &padding_right,
+                                          ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+    TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+        input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+                       input->info()->quantization_info());
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left,
+                                                         padding_right, output->info()));
 
     _input  = input;
     _output = output;
@@ -153,7 +185,8 @@
 
     // Create kernel
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(input->info()->data_type())));
     build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
     build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
     build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
@@ -166,22 +199,32 @@
     build_opts.add_option("-DPAD_RIGHT_X=" + support::cpp11::to_string(padding_right.x()));
     build_opts.add_option("-DPAD_LEFT_Y=" + support::cpp11::to_string(padding_left.y()));
     build_opts.add_option("-DPAD_RIGHT_Y=" + support::cpp11::to_string(padding_right.y()));
-    _kernel = create_kernel(compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+    _kernel = create_kernel(
+        compile_context, "space_to_batch_static_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+        build_opts.options());
 
     // Configure kernel window
     Window win = calculate_max_window(*output->info(), Steps());
     ICLKernel::configure_internal(win);
 }
 
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+                                           const ITensorInfo *block_shape,
+                                           const ITensorInfo *paddings,
+                                           const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
     return Status{};
 }
-Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+                                           const int          block_shape_x,
+                                           const int          block_shape_y,
+                                           const Size2D      &padding_left,
+                                           const Size2D      &padding_right,
                                            const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
     return Status{};
 }
 
@@ -218,7 +261,6 @@
         add_3D_tensor_argument(idx, _output, slice_out);
         enqueue(queue, *this, slice_out, lws_hint());
         ++batch_id;
-    }
-    while(window.slide_window_slice_3D(slice_out));
+    } while (window.slide_window_slice_3D(slice_out));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
index 4817cfe..f9dce9d 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLSPACETOBATCHLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -63,7 +64,11 @@
      * @param[in]  paddings        2-D tensor with shape [2, M] (first dimension is the fastest-changing dimension). Supported M: 2. Data types supported: S32
      * @param[out] output          Tensor output. Data types supported: same as @p input
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const ICLTensor        *block_shape,
+                   const ICLTensor        *paddings,
+                   ICLTensor              *output);
     /** Initialise the kernel's input and output. (Static block shape and paddings)
      *
      * @param[in]  input         Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -73,7 +78,12 @@
      * @param[in]  padding_right The padding at the end of every dimension of the output tensor.
      * @param[out] output        Tensor output. Data types supported: same as @p input
      */
-    void configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output);
+    void configure(const ICLTensor *input,
+                   const int        block_shape_x,
+                   const int        block_shape_y,
+                   const Size2D    &padding_left,
+                   const Size2D    &padding_right,
+                   ICLTensor       *output);
     /** Initialise the kernel's input and output. (Static block shape and paddings)
      *
      * @param[in]  compile_context The compile context to be used.
@@ -84,8 +94,13 @@
      * @param[in]  padding_right   The padding at the end of every dimension of the output tensor.
      * @param[out] output          Tensor output. Data types supported: same as @p input
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
-                   ICLTensor *output);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   const int               block_shape_x,
+                   const int               block_shape_y,
+                   const Size2D           &padding_left,
+                   const Size2D           &padding_right,
+                   ICLTensor              *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel
      *
      * @param[in] input       Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -95,7 +110,10 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *block_shape,
+                           const ITensorInfo *paddings,
+                           const ITensorInfo *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToBatchLayerKernel (Static block shape and paddings)
      *
      * @param[in] input         Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -107,7 +125,12 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input,
+                           const int          block_shape_x,
+                           const int          block_shape_y,
+                           const Size2D      &padding_left,
+                           const Size2D      &padding_right,
+                           const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
index c5ffdb5..25662b5 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -45,7 +46,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const DataLayout data_layout = input->data_layout();
         const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -64,8 +65,7 @@
 }
 } // namespace
 
-CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel()
-    : _input(nullptr), _output(nullptr), _block_shape()
+CLSpaceToDepthLayerKernel::CLSpaceToDepthLayerKernel() : _input(nullptr), _output(nullptr), _block_shape()
 {
     _type = CLKernelType::ELEMENTWISE;
 }
@@ -75,10 +75,13 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
 }
 
-void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLSpaceToDepthLayerKernel::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          ICLTensor              *output,
+                                          int32_t                 block_shape)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    auto padding_info = get_padding_info({ input, output });
+    auto padding_info = get_padding_info({input, output});
 
     TensorShape output_shape = compute_space_to_depth_shape(input->info(), block_shape);
     auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type());
@@ -94,11 +97,14 @@
 
     // Create kernel
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(output->info()->data_type())));
     build_opts.add_option("-DCHANNEL_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_channel)));
     build_opts.add_option("-DBLOCK_SHAPE=" + support::cpp11::to_string(block_shape));
     build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
-    _kernel = create_kernel(compile_context, "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options());
+    _kernel = create_kernel(compile_context,
+                            "space_to_depth_" + lower_string(string_from_data_layout(input->info()->data_layout())),
+                            build_opts.options());
 
     // Configure kernel window
     Window win = calculate_max_window(*output->info(), Steps());
@@ -136,7 +142,6 @@
         enqueue(queue, *this, slice_out, lws_hint());
 
         ++batch_id;
-    }
-    while(window.slide_window_slice_3D(slice_out));
+    } while (window.slide_window_slice_3D(slice_out));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
index bb1ac5f..d093291 100644
--- a/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
+++ b/src/core/CL/kernels/CLSpaceToDepthLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CLSPACETODEPTHLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -61,7 +62,8 @@
      * @param[out] output          Tensor output. Data types supported: same as @p input
      * @param[in]  block_shape     Block shape value.
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
+    void
+    configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape);
     /** Static function to check if given info will lead to a valid configuration of @ref CLSpaceToDepthLayerKernel.
      *
      * @param[in] input       Tensor input info. Supported tensor rank: 4. Data types supported: All.
diff --git a/src/core/CL/kernels/CLStackLayerKernel.cpp b/src/core/CL/kernels/CLStackLayerKernel.cpp
index 075c93a..23e2671 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.cpp
+++ b/src/core/CL/kernels/CLStackLayerKernel.cpp
@@ -30,10 +30,10 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/StringSupport.h"
 
 using namespace arm_compute::misc::shape_calculator;
@@ -42,7 +42,11 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+                          unsigned int       axis,
+                          unsigned int       idx_input,
+                          unsigned int       num_tensors,
+                          const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
@@ -51,9 +55,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+                                                           compute_stack_shape(*input, axis, num_tensors));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
@@ -61,7 +66,8 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
 {
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
@@ -73,18 +79,23 @@
 }
 } // namespace
 
-CLStackLayerKernel::CLStackLayerKernel()
-    : _input(nullptr), _output(nullptr)
+CLStackLayerKernel::CLStackLayerKernel() : _input(nullptr), _output(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLStackLayerKernel::configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(
+    const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, axis, idx_input, num_tensors, output);
 }
 
-void CLStackLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output)
+void CLStackLayerKernel::configure(const CLCompileContext &compile_context,
+                                   const ICLTensor        *input,
+                                   unsigned int            axis,
+                                   unsigned int            idx_input,
+                                   unsigned int            num_tensors,
+                                   ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
@@ -112,10 +123,15 @@
     _kernel.setArg<cl_uint>(idx, idx_input);
 }
 
-Status CLStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status CLStackLayerKernel::validate(const ITensorInfo *input,
+                                    unsigned int       axis,
+                                    unsigned int       idx_input,
+                                    unsigned int       num_tensors,
+                                    const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
     return Status{};
 }
 
diff --git a/src/core/CL/kernels/CLStackLayerKernel.h b/src/core/CL/kernels/CLStackLayerKernel.h
index 2865127..d3c17f5 100644
--- a/src/core/CL/kernels/CLStackLayerKernel.h
+++ b/src/core/CL/kernels/CLStackLayerKernel.h
@@ -26,6 +26,7 @@
 #define ARM_COMPUTE_CLSTACKLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -60,7 +61,8 @@
      * @param[out] output      Output tensor. Data types supported: Same as @p input.
      *
      */
-    void configure(const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+    void configure(
+        const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
     /** Initialise the kernel's inputs and output
      *
      * @note Supported input tensor rank: up to 4
@@ -74,7 +76,12 @@
      * @param[out] output          Output tensor. Data types supported: Same as @p input.
      *
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ICLTensor *output);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   unsigned int            axis,
+                   unsigned int            idx_input,
+                   unsigned int            num_tensors,
+                   ICLTensor              *output);
     /** Static function to check if given info will lead to a valid configuration of @ref CLStackLayerKernel
      *
      * @note Supported input tensor rank: up to 4
@@ -88,7 +95,11 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input,
+                           unsigned int       axis,
+                           unsigned int       idx_input,
+                           unsigned int       num_tensors,
+                           const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.cpp b/src/core/CL/kernels/CLStridedSliceKernel.cpp
index 9acbafd..a8f6112 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.cpp
+++ b/src/core/CL/kernels/CLStridedSliceKernel.cpp
@@ -22,11 +22,13 @@
  * SOFTWARE.
  */
 #include "src/core/CL/kernels/CLStridedSliceKernel.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/helpers/bit_ops.h"
@@ -37,9 +39,14 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                          int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *output,
+                          const Coordinates &starts,
+                          const Coordinates &ends,
+                          const BiStrides   &strides,
+                          int32_t            begin_mask,
+                          int32_t            end_mask,
+                          int32_t            shrink_axis_mask)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -48,19 +55,16 @@
     ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
-    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
-    {
-        return i == 0;
-    }));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; }));
 
     // Get expected output shape
-    const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
-                                                                                                          starts, ends, strides,
-                                                                                                          begin_mask, end_mask, shrink_axis_mask);
+    const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+        *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
     ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
 
     // Check output if configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
@@ -76,28 +80,33 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void CLStridedSliceKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
-                                     const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                     int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSliceKernel::configure(const CLCompileContext &compile_context,
+                                     const ITensorInfo      *input,
+                                     ITensorInfo            *output,
+                                     const Coordinates      &starts,
+                                     const Coordinates      &ends,
+                                     const BiStrides        &strides,
+                                     int32_t                 begin_mask,
+                                     int32_t                 end_mask,
+                                     int32_t                 shrink_axis_mask)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    auto padding_info = get_padding_info({ input, output });
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+    auto padding_info = get_padding_info({input, output});
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
 
     const TensorShape &input_shape = input->tensor_shape();
 
     Coordinates starts_abs;
     Coordinates ends_abs;
     Coordinates final_strides;
-    std::tie(starts_abs, ends_abs, final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
-                                                        input_shape,
-                                                        starts, ends, strides,
-                                                        begin_mask, end_mask, shrink_axis_mask);
+    std::tie(starts_abs, ends_abs, final_strides) =
+        arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides,
+                                                                               begin_mask, end_mask, shrink_axis_mask);
 
     // Configure kernel window
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
-                                                                                                      starts, ends, strides,
-                                                                                                      begin_mask, end_mask, shrink_axis_mask);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+        *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
     Window win = calculate_max_window(*output, Steps());
 
@@ -108,29 +117,33 @@
     const bool multi_access_x = !is_shrink_on_x && (final_strides.x() == 1) && (output_width_x / vec_size_x > 0);
 
     // Update window if needed
-    if(multi_access_x)
+    if (multi_access_x)
     {
         Window &updated_window = win;
         updated_window.set(Window::DimX,
-                           Window::Dimension(updated_window.x().start(), ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
+                           Window::Dimension(updated_window.x().start(),
+                                             ceil_to_multiple(updated_window.x().end(), vec_size_x), vec_size_x));
     }
     ICLKernel::configure_internal(win);
 
     // Create build options
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
-    for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(input->data_type())));
+    for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
     {
         const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
-        build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(starts_abs[i]));
-        build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" + support::cpp11::to_string(final_strides[i]));
+        build_opts.add_option("-DSTART_" + support::cpp11::to_string(i) + "=" +
+                              support::cpp11::to_string(starts_abs[i]));
+        build_opts.add_option("-DSTRIDE_" + support::cpp11::to_string(i) + "=" +
+                              support::cpp11::to_string(final_strides[i]));
         build_opts.add_option_if(is_shrink, "-DSHRINK_" + support::cpp11::to_string(i));
     }
-    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+                                                                        std::max<int>(output_width_x - vec_size_x, 0)));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
     build_opts.add_option_if_else(input_shape.num_dimensions() > 2,
-                                  "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()),
-                                  "-DSRC_DEPTH=1");
+                                  "-DSRC_DEPTH=" + support::cpp11::to_string(input_shape.z()), "-DSRC_DEPTH=1");
     build_opts.add_option_if_else(output->num_dimensions() > 2,
                                   "-DDST_DEPTH=" + support::cpp11::to_string(output->tensor_shape().z()),
                                   "-DDST_DEPTH=1");
@@ -142,7 +155,7 @@
     _config_id = "strided_slice";
     _config_id += "_";
     _config_id += lower_string(string_from_data_type(input->data_type()));
-    for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
     {
         _config_id += "_";
         _config_id += support::cpp11::to_string(input->dimension(i));
@@ -156,11 +169,17 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status CLStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                      const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                      int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSliceKernel::validate(const ITensorInfo *input,
+                                      const ITensorInfo *output,
+                                      const Coordinates &starts,
+                                      const Coordinates &ends,
+                                      const BiStrides   &strides,
+                                      int32_t            begin_mask,
+                                      int32_t            end_mask,
+                                      int32_t            shrink_axis_mask)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
 
     return Status{};
 }
@@ -170,8 +189,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice            = window_collapsed.first_slice_window_4D();
@@ -182,7 +202,6 @@
         add_4D_tensor_argument(idx, src, slice);
         add_4D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_4D(slice));
+    } while (window_collapsed.slide_window_slice_4D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLStridedSliceKernel.h b/src/core/CL/kernels/CLStridedSliceKernel.h
index 4c20150..1cf5bca 100644
--- a/src/core/CL/kernels/CLStridedSliceKernel.h
+++ b/src/core/CL/kernels/CLStridedSliceKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_STRIDED_SLICE_KERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 #include <cstdint>
@@ -53,9 +54,15 @@
      * @param[in]  shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
      *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
-                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                   int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *input,
+                   ITensorInfo            *output,
+                   const Coordinates      &starts,
+                   const Coordinates      &ends,
+                   const BiStrides        &strides,
+                   int32_t                 begin_mask,
+                   int32_t                 end_mask,
+                   int32_t                 shrink_axis_mask);
 
     /** Static function to check if given info will lead to a valid configuration of @ref CLStridedSliceKernel
      *
@@ -71,9 +78,14 @@
      * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
      *                             A slice of size 1 starting from starts[i] in the dimension must be preserved.
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                           int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           const Coordinates &starts,
+                           const Coordinates &ends,
+                           const BiStrides   &strides,
+                           int32_t            begin_mask,
+                           int32_t            end_mask,
+                           int32_t            shrink_axis_mask);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/core/CL/kernels/CLTileKernel.cpp b/src/core/CL/kernels/CLTileKernel.cpp
index 3e7015c..fa996c4 100644
--- a/src/core/CL/kernels/CLTileKernel.cpp
+++ b/src/core/CL/kernels/CLTileKernel.cpp
@@ -22,9 +22,11 @@
  * SOFTWARE.
  */
 #include "src/core/CL/kernels/CLTileKernel.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"
@@ -39,15 +41,13 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
     ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
-    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
-    {
-        return e == 0;
-    }));
+    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; }));
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
@@ -55,8 +55,7 @@
 }
 } // namespace
 
-CLTileKernel::CLTileKernel()
-    : _input(nullptr), _output(nullptr)
+CLTileKernel::CLTileKernel() : _input(nullptr), _output(nullptr)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
@@ -66,7 +65,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
 }
 
-void CLTileKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+void CLTileKernel::configure(const CLCompileContext &compile_context,
+                             const ICLTensor        *input,
+                             ICLTensor              *output,
+                             const Multiples        &multiples)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -104,15 +106,14 @@
     // Configure window without padding
     Window win = calculate_max_window(*output->info());
 
-    if(multi_access_x)
+    if (multi_access_x)
     {
         // If multi-access is enabled, no thread should cross the tile boundaries. This means we need
         // as many threads as are required to cover a single tile, times multiples[0]. Note that if threads
         // do not cross the boundaries of the tiles, they won't cross the boundaries of the last tile, and
         // we don't need to pad the output.
         const unsigned int size_win_x = ceil_to_multiple(input->info()->dimension(0), vec_size_x) * multiples[0];
-        win.set(Window::DimX,
-                Window::Dimension(win.x().start(), size_win_x, vec_size_x));
+        win.set(Window::DimX, Window::Dimension(win.x().start(), size_win_x, vec_size_x));
     }
 
     ICLKernel::configure_internal(win);
@@ -121,7 +122,7 @@
     _config_id = "tile";
     _config_id += "_";
     _config_id += lower_string(string_from_data_type(input->info()->data_type()));
-    for(unsigned int i = 0; i < multiples.size(); ++i)
+    for (unsigned int i = 0; i < multiples.size(); ++i)
     {
         _config_id += "_";
         _config_id += support::cpp11::to_string(input->info()->dimension(i));
@@ -150,7 +151,6 @@
         add_4D_tensor_argument(idx, _input, slice);
         add_4D_tensor_argument(idx, _output, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_4D(slice));
+    } while (collapsed.slide_window_slice_4D(slice));
 }
 } // namespace arm_compute
diff --git a/src/core/CL/kernels/CLTileKernel.h b/src/core/CL/kernels/CLTileKernel.h
index 41752ca..c3486ae 100644
--- a/src/core/CL/kernels/CLTileKernel.h
+++ b/src/core/CL/kernels/CLTileKernel.h
@@ -64,7 +64,10 @@
      * @param[out] output          Destination tensor. Same as @p input
      *
      */
-    void configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples);
+    void configure(const CLCompileContext &compile_context,
+                   const ICLTensor        *input,
+                   ICLTensor              *output,
+                   const Multiples        &multiples);
     /** Static function to check if given info will lead to a valid configuration of @ref CLTileKernel
      *
      * @param[in] input     Source tensor info. Data type supported: All.
diff --git a/src/core/CPP/CPPTypes.cpp b/src/core/CPP/CPPTypes.cpp
index 6a3f66f..9980db4 100644
--- a/src/core/CPP/CPPTypes.cpp
+++ b/src/core/CPP/CPPTypes.cpp
@@ -25,6 +25,7 @@
 #include "arm_compute/core/CPP/CPPTypes.h"
 
 #include "arm_compute/core/Error.h"
+
 #include "src/common/cpuinfo/CpuInfo.h"
 #include "src/common/cpuinfo/CpuIsaInfo.h"
 
@@ -43,8 +44,7 @@
     return _cpuinfo;
 }
 
-CPUInfo::CPUInfo()
-    : _impl(std::make_unique<Impl>())
+CPUInfo::CPUInfo() : _impl(std::make_unique<Impl>())
 {
     _impl->info = cpuinfo::CpuInfo::build();
 }
diff --git a/src/core/CPP/Validate.h b/src/core/CPP/Validate.h
index df192b5..fe25350 100644
--- a/src/core/CPP/Validate.h
+++ b/src/core/CPP/Validate.h
@@ -38,8 +38,8 @@
  *
  * @return Status
  */
-inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
-                                            const ITensorInfo *tensor_info)
+inline Status
+error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info)
 {
     bool fp16_kernels_enabled = false;
 #if defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS)
@@ -47,8 +47,9 @@
 #endif /* defined(ARM_COMPUTE_ENABLE_FP16) && defined(ENABLE_FP16_KERNELS) */
 
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled),
-                                        function, file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above");
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+        (tensor_info->data_type() == DataType::F16) && (!CPUInfo::get().has_fp16() || !fp16_kernels_enabled), function,
+        file, line, "This CPU architecture does not support F16 data type, you need v8.2 or above");
     return Status{};
 }
 
@@ -61,8 +62,8 @@
  *
  * @return Status
  */
-inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
-                                            const ITensorInfo *tensor_info)
+inline Status
+error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensorInfo *tensor_info)
 {
     bool bf16_kernels_enabled = false;
 #if defined(ARM_COMPUTE_ENABLE_BF16)
@@ -70,8 +71,9 @@
 #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
 
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor_info == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG((tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled),
-                                        function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above");
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(
+        (tensor_info->data_type() == DataType::BFLOAT16) && (!CPUInfo::get().has_bf16() || !bf16_kernels_enabled),
+        function, file, line, "This CPU architecture does not support BFloat16 data type, you need v8.6 or above");
     return Status{};
 }
 
@@ -84,8 +86,8 @@
  *
  * @return Status
  */
-inline Status error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line,
-                                            const ITensor *tensor)
+inline Status
+error_on_unsupported_cpu_fp16(const char *function, const char *file, const int line, const ITensor *tensor)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
     ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_fp16(function, file, line, tensor->info()));
@@ -101,8 +103,8 @@
  *
  * @return Status
  */
-inline Status error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line,
-                                            const ITensor *tensor)
+inline Status
+error_on_unsupported_cpu_bf16(const char *function, const char *file, const int line, const ITensor *tensor)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
     ARM_COMPUTE_RETURN_ON_ERROR(::arm_compute::error_on_unsupported_cpu_bf16(function, file, line, tensor->info()));
diff --git a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
index 0f405d8..02686eb 100644
--- a/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
+++ b/src/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/core/CPP/kernels/CPPBoxWithNonMaximaSuppressionLimitKernel.h"
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/helpers/WindowHelpers.h"
 
 #include <algorithm>
@@ -34,7 +35,11 @@
 namespace
 {
 template <typename T>
-std::vector<int> SoftNMS(const ITensor *proposals, std::vector<std::vector<T>> &scores_in, std::vector<int> inds, const BoxNMSLimitInfo &info, int class_id)
+std::vector<int> SoftNMS(const ITensor               *proposals,
+                         std::vector<std::vector<T>> &scores_in,
+                         std::vector<int>             inds,
+                         const BoxNMSLimitInfo       &info,
+                         int                          class_id)
 {
     std::vector<int> keep;
     const int        proposals_width = proposals->info()->dimension(1);
@@ -45,7 +50,7 @@
     std::vector<T> y2(proposals_width);
     std::vector<T> areas(proposals_width);
 
-    for(int i = 0; i < proposals_width; ++i)
+    for (int i = 0; i < proposals_width; ++i)
     {
         x1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
         y1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
@@ -56,13 +61,13 @@
 
     // Note: Soft NMS scores have already been initialized with input scores
 
-    while(!inds.empty())
+    while (!inds.empty())
     {
         // Find proposal with max score among remaining proposals
         int max_pos = 0;
-        for(unsigned int i = 1; i < inds.size(); ++i)
+        for (unsigned int i = 1; i < inds.size(); ++i)
         {
-            if(scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
+            if (scores_in[class_id][inds.at(i)] > scores_in[class_id][inds.at(max_pos)])
             {
                 max_pos = i;
             }
@@ -75,7 +80,7 @@
         inds.erase(inds.begin());
 
         std::vector<int> sorted_indices_temp;
-        for(auto idx : inds)
+        for (auto idx : inds)
         {
             const auto xx1 = std::max(x1[idx], x1[element]);
             const auto yy1 = std::max(y1[idx], y1[element]);
@@ -89,7 +94,7 @@
 
             // Update scores based on computed IoU, overlap threshold and NMS method
             T weight;
-            switch(info.soft_nms_method())
+            switch (info.soft_nms_method())
             {
                 case NMSType::LINEAR:
                     weight = (ovr > info.nms()) ? (1.f - ovr) : 1.f;
@@ -106,7 +111,7 @@
 
             // Discard boxes with new scores below min threshold and update pending indices
             scores_in[class_id][idx] *= weight;
-            if(scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
+            if (scores_in[class_id][idx] >= info.soft_nms_min_score_thres())
             {
                 sorted_indices_temp.push_back(idx);
             }
@@ -118,7 +123,10 @@
 }
 
 template <typename T>
-std::vector<int> NonMaximaSuppression(const ITensor *proposals, std::vector<int> sorted_indices, const BoxNMSLimitInfo &info, int class_id)
+std::vector<int> NonMaximaSuppression(const ITensor         *proposals,
+                                      std::vector<int>       sorted_indices,
+                                      const BoxNMSLimitInfo &info,
+                                      int                    class_id)
 {
     std::vector<int> keep;
 
@@ -130,7 +138,7 @@
     std::vector<T> y2(proposals_width);
     std::vector<T> areas(proposals_width);
 
-    for(int i = 0; i < proposals_width; ++i)
+    for (int i = 0; i < proposals_width; ++i)
     {
         x1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4, i)));
         y1[i]    = *reinterpret_cast<T *>(proposals->ptr_to_element(Coordinates(class_id * 4 + 1, i)));
@@ -139,7 +147,7 @@
         areas[i] = (x2[i] - x1[i] + 1.0) * (y2[i] - y1[i] + 1.0);
     }
 
-    while(!sorted_indices.empty())
+    while (!sorted_indices.empty())
     {
         int i = sorted_indices.at(0);
         keep.push_back(i);
@@ -148,7 +156,7 @@
         std::vector<int> new_indices;
         sorted_indices_temp.erase(sorted_indices_temp.begin());
 
-        for(unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
+        for (unsigned int j = 0; j < sorted_indices_temp.size(); ++j)
         {
             const float xx1 = std::max(x1[sorted_indices_temp.at(j)], x1[i]);
             const float yy1 = std::max(y1[sorted_indices_temp.at(j)], y1[i]);
@@ -163,8 +171,9 @@
             const float ctr_y = yy1 + (h / 2);
 
             // If suppress_size is specified, filter the boxes based on their size and position
-            const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() && ctr_x < info.im_width() && ctr_y < info.im_height());
-            if(ovr <= info.nms() && keep_size)
+            const bool keep_size = !info.suppress_size() || (w >= info.min_size() && h >= info.min_size() &&
+                                                             ctr_x < info.im_width() && ctr_y < info.im_height());
+            if (ovr <= info.nms() && keep_size)
             {
                 new_indices.push_back(j);
             }
@@ -172,7 +181,7 @@
 
         const unsigned int new_indices_size = new_indices.size();
         std::vector<int>   new_sorted_indices(new_indices_size);
-        for(unsigned int i = 0; i < new_indices_size; ++i)
+        for (unsigned int i = 0; i < new_indices_size; ++i)
         {
             new_sorted_indices[i] = sorted_indices[new_indices[i] + 1];
         }
@@ -184,7 +193,15 @@
 } // namespace
 
 CPPBoxWithNonMaximaSuppressionLimitKernel::CPPBoxWithNonMaximaSuppressionLimitKernel()
-    : _scores_in(nullptr), _boxes_in(nullptr), _batch_splits_in(nullptr), _scores_out(nullptr), _boxes_out(nullptr), _classes(nullptr), _batch_splits_out(nullptr), _keeps(nullptr), _keeps_size(nullptr),
+    : _scores_in(nullptr),
+      _boxes_in(nullptr),
+      _batch_splits_in(nullptr),
+      _scores_out(nullptr),
+      _boxes_out(nullptr),
+      _classes(nullptr),
+      _batch_splits_out(nullptr),
+      _keeps(nullptr),
+      _keeps_size(nullptr),
       _info()
 {
 }
@@ -197,7 +214,7 @@
 template <typename T>
 void CPPBoxWithNonMaximaSuppressionLimitKernel::run_nmslimit()
 {
-    const int                     batch_size   = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
+    const int                     batch_size = _batch_splits_in == nullptr ? 1 : _batch_splits_in->info()->dimension(0);
     const int                     num_classes  = _scores_in->info()->dimension(0);
     const int                     scores_count = _scores_in->info()->dimension(1);
     std::vector<int>              total_keep_per_batch(batch_size);
@@ -205,51 +222,48 @@
     int                           total_keep_count = 0;
 
     std::vector<std::vector<T>> in_scores(num_classes, std::vector<T>(scores_count));
-    for(int i = 0; i < scores_count; ++i)
+    for (int i = 0; i < scores_count; ++i)
     {
-        for(int j = 0; j < num_classes; ++j)
+        for (int j = 0; j < num_classes; ++j)
         {
             in_scores[j][i] = *reinterpret_cast<const T *>(_scores_in->ptr_to_element(Coordinates(j, i)));
         }
     }
 
     int cur_start_idx = 0;
-    for(int b = 0; b < batch_size; ++b)
+    for (int b = 0; b < batch_size; ++b)
     {
         // Skip the first class unless there is only one class.
         const int j_start = (num_classes == 1 ? 0 : 1);
-        for(int j = j_start; j < num_classes; ++j)
+        for (int j = j_start; j < num_classes; ++j)
         {
             std::vector<T>   cur_scores(scores_count);
             std::vector<int> inds;
-            for(int i = 0; i < scores_count; ++i)
+            for (int i = 0; i < scores_count; ++i)
             {
                 const T score = in_scores[j][i];
                 cur_scores[i] = score;
 
-                if(score > _info.score_thresh())
+                if (score > _info.score_thresh())
                 {
                     inds.push_back(i);
                 }
             }
-            if(_info.soft_nms_enabled())
+            if (_info.soft_nms_enabled())
             {
                 keeps[j] = SoftNMS(_boxes_in, in_scores, inds, _info, j);
             }
             else
             {
                 std::sort(inds.data(), inds.data() + inds.size(),
-                          [&cur_scores](int lhs, int rhs)
-                {
-                    return cur_scores[lhs] > cur_scores[rhs];
-                });
+                          [&cur_scores](int lhs, int rhs) { return cur_scores[lhs] > cur_scores[rhs]; });
 
                 keeps[j] = NonMaximaSuppression<T>(_boxes_in, inds, _info, j);
             }
             total_keep_count += keeps[j].size();
         }
 
-        if(_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
+        if (_info.detections_per_im() > 0 && total_keep_count > _info.detections_per_im())
         {
             // merge all scores (represented by indices) together and sort
             auto get_all_scores_sorted = [&in_scores, &keeps, total_keep_count]()
@@ -257,10 +271,10 @@
                 std::vector<T> ret(total_keep_count);
 
                 int ret_idx = 0;
-                for(unsigned int i = 1; i < keeps.size(); ++i)
+                for (unsigned int i = 1; i < keeps.size(); ++i)
                 {
                     auto &cur_keep = keeps[i];
-                    for(auto &ckv : cur_keep)
+                    for (auto &ckv : cur_keep)
                     {
                         ret[ret_idx++] = in_scores[i][ckv];
                     }
@@ -273,13 +287,13 @@
 
             auto    all_scores_sorted = get_all_scores_sorted();
             const T image_thresh      = all_scores_sorted[all_scores_sorted.size() - _info.detections_per_im()];
-            for(int j = 1; j < num_classes; ++j)
+            for (int j = 1; j < num_classes; ++j)
             {
                 auto            &cur_keep = keeps[j];
                 std::vector<int> new_keeps_j;
-                for(auto &k : cur_keep)
+                for (auto &k : cur_keep)
                 {
-                    if(in_scores[j][k] >= image_thresh)
+                    if (in_scores[j][k] >= image_thresh)
                     {
                         new_keeps_j.push_back(k);
                     }
@@ -293,40 +307,52 @@
 
         // Write results
         int cur_out_idx = 0;
-        for(int j = j_start; j < num_classes; ++j)
+        for (int j = j_start; j < num_classes; ++j)
         {
-            auto     &cur_keep        = keeps[j];
-            auto      cur_out_scores  = reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
-            auto      cur_out_classes = reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
-            const int box_column      = (cur_start_idx + cur_out_idx) * 4;
+            auto &cur_keep = keeps[j];
+            auto  cur_out_scores =
+                reinterpret_cast<T *>(_scores_out->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+            auto cur_out_classes =
+                reinterpret_cast<T *>(_classes->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx)));
+            const int box_column = (cur_start_idx + cur_out_idx) * 4;
 
-            for(unsigned int k = 0; k < cur_keep.size(); ++k)
+            for (unsigned int k = 0; k < cur_keep.size(); ++k)
             {
-                cur_out_scores[k]     = in_scores[j][cur_keep[k]];
-                cur_out_classes[k]    = static_cast<T>(j);
-                auto cur_out_box_row0 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
-                auto cur_out_box_row1 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
-                auto cur_out_box_row2 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
-                auto cur_out_box_row3 = reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
-                *cur_out_box_row0     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
-                *cur_out_box_row1     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
-                *cur_out_box_row2     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
-                *cur_out_box_row3     = *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
+                cur_out_scores[k]  = in_scores[j][cur_keep[k]];
+                cur_out_classes[k] = static_cast<T>(j);
+                auto cur_out_box_row0 =
+                    reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 0, k)));
+                auto cur_out_box_row1 =
+                    reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 1, k)));
+                auto cur_out_box_row2 =
+                    reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 2, k)));
+                auto cur_out_box_row3 =
+                    reinterpret_cast<T *>(_boxes_out->ptr_to_element(Coordinates(box_column + 3, k)));
+                *cur_out_box_row0 =
+                    *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 0, cur_keep[k])));
+                *cur_out_box_row1 =
+                    *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 1, cur_keep[k])));
+                *cur_out_box_row2 =
+                    *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 2, cur_keep[k])));
+                *cur_out_box_row3 =
+                    *reinterpret_cast<const T *>(_boxes_in->ptr_to_element(Coordinates(j * 4 + 3, cur_keep[k])));
             }
 
             cur_out_idx += cur_keep.size();
         }
 
-        if(_keeps != nullptr)
+        if (_keeps != nullptr)
         {
             cur_out_idx = 0;
-            for(int j = 0; j < num_classes; ++j)
+            for (int j = 0; j < num_classes; ++j)
             {
-                for(unsigned int i = 0; i < keeps[j].size(); ++i)
+                for (unsigned int i = 0; i < keeps[j].size(); ++i)
                 {
-                    *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) = static_cast<T>(keeps[j].at(i));
+                    *reinterpret_cast<T *>(_keeps->ptr_to_element(Coordinates(cur_start_idx + cur_out_idx + i))) =
+                        static_cast<T>(keeps[j].at(i));
                 }
-                *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) = keeps[j].size();
+                *reinterpret_cast<uint32_t *>(_keeps_size->ptr_to_element(Coordinates(j + b * num_classes))) =
+                    keeps[j].size();
                 cur_out_idx += keeps[j].size();
             }
         }
@@ -334,17 +360,25 @@
         cur_start_idx += total_keep_count;
     }
 
-    if(_batch_splits_out != nullptr)
+    if (_batch_splits_out != nullptr)
     {
-        for(int b = 0; b < batch_size; ++b)
+        for (int b = 0; b < batch_size; ++b)
         {
             *reinterpret_cast<float *>(_batch_splits_out->ptr_to_element(Coordinates(b))) = total_keep_per_batch[b];
         }
     }
 }
 
-void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in, ITensor *scores_out, ITensor *boxes_out, ITensor *classes,
-                                                          ITensor *batch_splits_out, ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+void CPPBoxWithNonMaximaSuppressionLimitKernel::configure(const ITensor        *scores_in,
+                                                          const ITensor        *boxes_in,
+                                                          const ITensor        *batch_splits_in,
+                                                          ITensor              *scores_out,
+                                                          ITensor              *boxes_out,
+                                                          ITensor              *classes,
+                                                          ITensor              *batch_splits_out,
+                                                          ITensor              *keeps,
+                                                          ITensor              *keeps_size,
+                                                          const BoxNMSLimitInfo info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::F16, DataType::F32);
@@ -352,25 +386,28 @@
     const unsigned int num_classes = scores_in->info()->dimension(0);
 
     ARM_COMPUTE_UNUSED(num_classes);
-    ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0), "First dimension of input boxes must be of size 4*num_classes");
-    ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1), "Input scores and input boxes must have the same number of rows");
+    ARM_COMPUTE_ERROR_ON_MSG((4 * num_classes) != boxes_in->info()->dimension(0),
+                             "First dimension of input boxes must be of size 4*num_classes");
+    ARM_COMPUTE_ERROR_ON_MSG(scores_in->info()->dimension(1) != boxes_in->info()->dimension(1),
+                             "Input scores and input boxes must have the same number of rows");
 
     ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != boxes_out->info()->dimension(1));
     ARM_COMPUTE_ERROR_ON(boxes_out->info()->dimension(0) != 4);
     ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != classes->info()->dimension(0));
-    if(keeps != nullptr)
+    if (keeps != nullptr)
     {
-        ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr, "keeps_size cannot be nullptr if keeps has to be provided as output");
+        ARM_COMPUTE_ERROR_ON_MSG(keeps_size == nullptr,
+                                 "keeps_size cannot be nullptr if keeps has to be provided as output");
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, keeps);
         ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(keeps_size, 1, DataType::U32);
         ARM_COMPUTE_ERROR_ON(scores_out->info()->dimension(0) != keeps->info()->dimension(0));
         ARM_COMPUTE_ERROR_ON(num_classes != keeps_size->info()->dimension(0));
     }
-    if(batch_splits_in != nullptr)
+    if (batch_splits_in != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_in);
     }
-    if(batch_splits_out != nullptr)
+    if (batch_splits_out != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(scores_in, batch_splits_out);
     }
@@ -399,7 +436,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IKernel::window(), window);
 
-    switch(_scores_in->info()->data_type())
+    switch (_scores_in->info()->data_type())
     {
         case DataType::F32:
             run_nmslimit<float>();
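
The SoftNMS hunk above reflows the per-box weight update without changing it. For reference, a self-contained sketch of the score decay it applies; only the LINEAR branch is visible in the hunk, so the GAUSSIAN branch (the standard Soft-NMS decay) and the enum's other members are assumptions:

#include <cmath>

enum class NMSType { LINEAR, GAUSSIAN, ORIGINAL }; // mirrors the library enum; only LINEAR appears in the hunk

// Sketch: weight applied to a box's score given its IoU (ovr) with the current
// top-scoring box.
template <typename T>
T soft_nms_weight(NMSType method, T ovr, T nms_threshold, T sigma)
{
    switch (method)
    {
        case NMSType::LINEAR: // matches the branch shown above
            return (ovr > nms_threshold) ? (T(1) - ovr) : T(1);
        case NMSType::GAUSSIAN: // assumed: conventional Soft-NMS decay
            return std::exp(-(ovr * ovr) / sigma);
        default: // other methods not sketched here
            return T(1);
    }
}
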
diff --git a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
index c1187ff..1224ec1 100644
--- a/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
+++ b/src/core/CPP/kernels/CPPNonMaximumSuppressionKernel.cpp
@@ -35,15 +35,22 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices, unsigned int max_output_size,
-                          const float score_threshold, const float iou_threshold)
+Status validate_arguments(const ITensorInfo *bboxes,
+                          const ITensorInfo *scores,
+                          const ITensorInfo *output_indices,
+                          unsigned int       max_output_size,
+                          const float        score_threshold,
+                          const float        iou_threshold)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(bboxes, scores, output_indices);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bboxes, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_indices, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2, "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1, "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1, "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(bboxes->num_dimensions() > 2,
+                                    "The bboxes tensor must be a 2-D float tensor of shape [4, num_boxes].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(scores->num_dimensions() > 1,
+                                    "The scores tensor must be a 1-D float tensor of shape [num_boxes].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->num_dimensions() > 1,
+                                    "The indices must be 1-D integer tensor of shape [M], where max_output_size <= M");
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bboxes, scores);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_indices->dimension(0) == 0, "Indices tensor must be bigger than 0");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(max_output_size == 0, "Max size cannot be 0");
@@ -55,15 +62,26 @@
 } // namespace
 
 CPPNonMaximumSuppressionKernel::CPPNonMaximumSuppressionKernel()
-    : _input_bboxes(nullptr), _input_scores(nullptr), _output_indices(nullptr), _max_output_size(0), _score_threshold(0.f), _iou_threshold(0.f), _num_boxes(0)
+    : _input_bboxes(nullptr),
+      _input_scores(nullptr),
+      _output_indices(nullptr),
+      _max_output_size(0),
+      _score_threshold(0.f),
+      _iou_threshold(0.f),
+      _num_boxes(0)
 {
 }
 
-void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes, const ITensor *input_scores, ITensor *output_indices,
-                                               unsigned int max_output_size, const float score_threshold, const float iou_threshold)
+void CPPNonMaximumSuppressionKernel::configure(const ITensor *input_bboxes,
+                                               const ITensor *input_scores,
+                                               ITensor       *output_indices,
+                                               unsigned int   max_output_size,
+                                               const float    score_threshold,
+                                               const float    iou_threshold)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input_bboxes, input_scores, output_indices);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(), max_output_size, score_threshold, iou_threshold));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_bboxes->info(), input_scores->info(), output_indices->info(),
+                                                  max_output_size, score_threshold, iou_threshold));
 
     auto_init_if_empty(*output_indices->info(), TensorShape(max_output_size), 1, DataType::U8, QuantizationInfo());
 
@@ -82,10 +100,15 @@
     ICPPKernel::configure(win);
 }
 
-Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *output_indices,
-                                                unsigned int max_output_size, const float score_threshold, const float iou_threshold)
+Status CPPNonMaximumSuppressionKernel::validate(const ITensorInfo *bboxes,
+                                                const ITensorInfo *scores,
+                                                const ITensorInfo *output_indices,
+                                                unsigned int       max_output_size,
+                                                const float        score_threshold,
+                                                const float        iou_threshold)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(bboxes, scores, output_indices, max_output_size, score_threshold, iou_threshold));
     return Status{};
 }
 
@@ -99,10 +122,10 @@
     // Auxiliary tensors
     std::vector<int>   indices_above_thd;
     std::vector<float> scores_above_thd;
-    for(unsigned int i = 0; i < _num_boxes; ++i)
+    for (unsigned int i = 0; i < _num_boxes; ++i)
     {
         const float score_i = *(reinterpret_cast<float *>(_input_scores->ptr_to_element(Coordinates(i))));
-        if(score_i >= _score_threshold)
+        if (score_i >= _score_threshold)
         {
             scores_above_thd.emplace_back(score_i);
             indices_above_thd.emplace_back(i);
@@ -114,12 +137,9 @@
     std::vector<unsigned int> sorted_indices;
     sorted_indices.resize(num_above_thd);
     std::iota(sorted_indices.data(), sorted_indices.data() + num_above_thd, 0);
-    std::sort(std::begin(sorted_indices),
-              std::end(sorted_indices),
+    std::sort(std::begin(sorted_indices), std::end(sorted_indices),
               [&](unsigned int first, unsigned int second)
-    {
-        return scores_above_thd[first] > scores_above_thd[second];
-    });
+              { return scores_above_thd[first] > scores_above_thd[second]; });
 
     // The number of outputs is the minimum between max_output_size and the number of scores above the threshold
     const unsigned int num_output = std::min(_max_output_size, num_above_thd);
@@ -127,19 +147,20 @@
     std::vector<bool>  visited(num_above_thd, false);
 
     // Keep only boxes with small IoU
-    for(unsigned int i = 0; i < num_above_thd; ++i)
+    for (unsigned int i = 0; i < num_above_thd; ++i)
     {
         // Check if the output is full
-        if(output_idx >= num_output)
+        if (output_idx >= num_output)
         {
             break;
         }
 
         // If it has not been visited yet, add it to the output and advance the output index
-        if(!visited[sorted_indices[i]])
+        if (!visited[sorted_indices[i]])
         {
-            *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = indices_above_thd[sorted_indices[i]];
-            visited[sorted_indices[i]]                                                           = true;
+            *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) =
+                indices_above_thd[sorted_indices[i]];
+            visited[sorted_indices[i]] = true;
             ++output_idx;
         }
         else
@@ -148,28 +169,36 @@
         }
 
         // Once an element has been added to the output, check whether the following ones overlap and can be skipped
-        for(unsigned int j = i + 1; j < num_above_thd; ++j)
+        for (unsigned int j = i + 1; j < num_above_thd; ++j)
         {
-            if(!visited[sorted_indices[j]])
+            if (!visited[sorted_indices[j]])
             {
                 // Calculate IoU
                 const unsigned int i_index = indices_above_thd[sorted_indices[i]];
                 const unsigned int j_index = indices_above_thd[sorted_indices[j]];
                 // Box-corner format: xmin, ymin, xmax, ymax
-                const auto box_i_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index))));
-                const auto box_i_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index))));
-                const auto box_i_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, i_index))));
-                const auto box_i_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, i_index))));
+                const auto box_i_xmin =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, i_index))));
+                const auto box_i_ymin =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, i_index))));
+                const auto box_i_xmax =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, i_index))));
+                const auto box_i_ymax =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, i_index))));
 
-                const auto box_j_xmin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, j_index))));
-                const auto box_j_ymin = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, j_index))));
-                const auto box_j_xmax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, j_index))));
-                const auto box_j_ymax = *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, j_index))));
+                const auto box_j_xmin =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(0, j_index))));
+                const auto box_j_ymin =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(1, j_index))));
+                const auto box_j_xmax =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(2, j_index))));
+                const auto box_j_ymax =
+                    *(reinterpret_cast<float *>(_input_bboxes->ptr_to_element(Coordinates(3, j_index))));
 
                 const float area_i = (box_i_xmax - box_i_xmin) * (box_i_ymax - box_i_ymin);
                 const float area_j = (box_j_xmax - box_j_xmin) * (box_j_ymax - box_j_ymin);
                 float       overlap;
-                if(area_i <= 0 || area_j <= 0)
+                if (area_i <= 0 || area_j <= 0)
                 {
                     overlap = 0.0f;
                 }
@@ -179,11 +208,12 @@
                     const auto x_min_intersection = std::max<float>(box_i_xmin, box_j_xmin);
                     const auto y_max_intersection = std::min<float>(box_i_ymax, box_j_ymax);
                     const auto x_max_intersection = std::min<float>(box_i_xmax, box_j_xmax);
-                    const auto area_intersection  = std::max<float>(y_max_intersection - y_min_intersection, 0.0f) * std::max<float>(x_max_intersection - x_min_intersection, 0.0f);
-                    overlap                       = area_intersection / (area_i + area_j - area_intersection);
+                    const auto area_intersection  = std::max<float>(y_max_intersection - y_min_intersection, 0.0f) *
+                                                   std::max<float>(x_max_intersection - x_min_intersection, 0.0f);
+                    overlap = area_intersection / (area_i + area_j - area_intersection);
                 }
 
-                if(overlap > _iou_threshold)
+                if (overlap > _iou_threshold)
                 {
                     visited[sorted_indices[j]] = true;
                 }
@@ -192,7 +222,7 @@
     }
     // The output indices tensor can be larger than the number of selected boxes;
     // fill the remaining entries with -1 to mark them as invalid
-    for(; output_idx < _max_output_size; ++output_idx)
+    for (; output_idx < _max_output_size; ++output_idx)
     {
         *(reinterpret_cast<int *>(_output_indices->ptr_to_element(Coordinates(output_idx)))) = -1;
     }
diff --git a/src/core/CPP/kernels/CPPPermuteKernel.cpp b/src/core/CPP/kernels/CPPPermuteKernel.cpp
index 054c7bf..e68090d 100644
--- a/src/core/CPP/kernels/CPPPermuteKernel.cpp
+++ b/src/core/CPP/kernels/CPPPermuteKernel.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -43,7 +44,7 @@
     const TensorShape output_shape = misc::shape_calculator::compute_permutation_output_shape(*input, perm);
 
     // Validate configured output
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -65,7 +66,7 @@
     // Create output window
     Window                  window_out(window);
     const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
-    for(size_t d = 0; d <= _perm.num_dimensions(); ++d)
+    for (size_t d = 0; d <= _perm.num_dimensions(); ++d)
     {
         window_out.set(d, zero_window);
     }
@@ -74,28 +75,32 @@
     Iterator in(_input, window);
     Iterator out(_output, window_out);
 
-    if(_input->info()->num_dimensions() <= 3)
+    if (_input->info()->num_dimensions() <= 3)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const int idx                             = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2];
-            *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
-        },
-        in, out);
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2];
+                *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+            },
+            in, out);
     }
-    else if(_input->info()->num_dimensions() >= 4)
+    else if (_input->info()->num_dimensions() >= 4)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const int idx                             = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_strides[3];
-            *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
-        },
-        in, out);
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const int idx = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] +
+                                id[3] * perm_strides[3];
+                *(reinterpret_cast<T *>(out.ptr() + idx)) = *(reinterpret_cast<const T *>(in.ptr()));
+            },
+            in, out);
     }
 }
 
-CPPPermuteKernel::CPPPermuteKernel()
-    : _func(), _input(nullptr), _output(nullptr), _perm()
+CPPPermuteKernel::CPPPermuteKernel() : _func(), _input(nullptr), _output(nullptr), _perm()
 {
 }
 
@@ -113,7 +118,7 @@
     _output = output;
     _perm   = perm;
 
-    switch(input->info()->element_size())
+    switch (input->info()->element_size())
     {
         case 1:
             _func = &CPPPermuteKernel::run_permute<uint8_t>;
@@ -152,7 +157,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
 
-    if(_func != nullptr)
+    if (_func != nullptr)
     {
         (this->*_func)(window);
     }
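
The re-indented lambdas above compute the flattened destination offset of each element under the permutation, with perm_strides holding the output strides reordered by perm. A minimal sketch of the 3-D case; the free-function form is illustrative:

// Sketch: offset of element (x, y, z) in the permuted output, matching the
// idx computation in the 3-D lambda above.
inline int permuted_offset(int x, int y, int z, const int perm_strides[3])
{
    return x * perm_strides[0] + y * perm_strides[1] + z * perm_strides[2];
}
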
diff --git a/src/core/CPP/kernels/CPPTopKVKernel.cpp b/src/core/CPP/kernels/CPPTopKVKernel.cpp
index d2b54e4..6ffb68e 100644
--- a/src/core/CPP/kernels/CPPTopKVKernel.cpp
+++ b/src/core/CPP/kernels/CPPTopKVKernel.cpp
@@ -34,32 +34,34 @@
 {
 namespace
 {
-template <typename T,
-          typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0>
+template <typename T, typename std::enable_if<utils::traits::is_floating_point<T>::value, int>::type = 0>
 inline bool greater_than(T a, T b)
 {
     const T epsilon = std::numeric_limits<T>::epsilon();
     return (a - b > epsilon);
 }
 
-template < typename T,
-           typename std::enable_if < !utils::traits::is_floating_point<T>::value, int >::type = 0 >
+template <typename T, typename std::enable_if<!utils::traits::is_floating_point<T>::value, int>::type = 0>
 inline bool greater_than(T a, T b)
 {
     return (a > b);
 }
 
-Status validate_arguments(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status validate_arguments(const ITensorInfo *predictions,
+                          const ITensorInfo *targets,
+                          ITensorInfo       *output,
+                          const unsigned int k)
 {
     ARM_COMPUTE_UNUSED(k);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(predictions, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(targets, 1, DataType::U32);
 
     ARM_COMPUTE_RETURN_ERROR_ON(predictions->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(targets->num_dimensions() > 1);
     ARM_COMPUTE_RETURN_ERROR_ON(targets->dimension(0) != predictions->dimension(1));
     // Validate configured output
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), targets->tensor_shape());
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8);
@@ -72,22 +74,23 @@
 template <typename T>
 void CPPTopKVKernel::run_topkv()
 {
-    for(unsigned int i = 0; i < _batch_size; ++i)
+    for (unsigned int i = 0; i < _batch_size; ++i)
     {
-        const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{ i }));
-        const auto predicted_value = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ target_class_id, i }));
+        const auto target_class_id = *reinterpret_cast<uint32_t *>(_targets->ptr_to_element(Coordinates{i}));
+        const auto predicted_value =
+            *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{target_class_id, i}));
 
         // The variable rank counts how many predictions are greater than the one at target_class_id
         unsigned int rank = 0;
-        for(unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j)
+        for (unsigned int j = 0; (j < _num_classes) && (rank < _k); ++j)
         {
-            const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{ j, i }));
-            if(greater_than(current_prediction, predicted_value))
+            const auto current_prediction = *reinterpret_cast<T *>(_predictions->ptr_to_element(Coordinates{j, i}));
+            if (greater_than(current_prediction, predicted_value))
             {
                 rank++;
             }
         }
-        *(_output->ptr_to_element(Coordinates{ i })) = static_cast<uint8_t>(rank < _k);
+        *(_output->ptr_to_element(Coordinates{i})) = static_cast<uint8_t>(rank < _k);
     }
 }
 
@@ -96,7 +99,10 @@
 {
 }
 
-void CPPTopKVKernel::configure(const ITensor *predictions, const ITensor *targets, ITensor *output, const unsigned int k)
+void CPPTopKVKernel::configure(const ITensor     *predictions,
+                               const ITensor     *targets,
+                               ITensor           *output,
+                               const unsigned int k)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(predictions, targets, output);
 
@@ -115,7 +121,10 @@
     ICPPKernel::configure(Window()); // Default 1 iteration window
 }
 
-Status CPPTopKVKernel::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status CPPTopKVKernel::validate(const ITensorInfo *predictions,
+                                const ITensorInfo *targets,
+                                ITensorInfo       *output,
+                                const unsigned int k)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(predictions, targets, output, k));
     return Status{};
@@ -129,7 +138,7 @@
 void CPPTopKVKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(window, info);
-    switch(_predictions->info()->data_type())
+    switch (_predictions->info()->data_type())
     {
         case DataType::F32:
             run_topkv<float>();
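
run_topkv(), reflowed above, marks a sample as a hit when fewer than k predictions are strictly greater than the prediction for its target class. A standalone sketch of that rank test; only the floating-point flavour of greater_than is reproduced here, and the helper name in_top_k is illustrative:

#include <cstdint>
#include <limits>

// Sketch: epsilon-guarded comparison, as in the SFINAE pair above.
template <typename T>
bool greater_than(T a, T b)
{
    return (a - b > std::numeric_limits<T>::epsilon());
}

// Sketch: 1 when the target class ranks inside the top k predictions.
template <typename T>
uint8_t in_top_k(const T *predictions, unsigned int num_classes, unsigned int target_class_id, unsigned int k)
{
    const T      predicted_value = predictions[target_class_id];
    unsigned int rank            = 0;
    for (unsigned int j = 0; (j < num_classes) && (rank < k); ++j)
    {
        if (greater_than(predictions[j], predicted_value))
        {
            ++rank;
        }
    }
    return static_cast<uint8_t>(rank < k);
}
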
diff --git a/src/core/CPP/kernels/CPPUpsampleKernel.cpp b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
index 7ef83fb..b1efe32 100644
--- a/src/core/CPP/kernels/CPPUpsampleKernel.cpp
+++ b/src/core/CPP/kernels/CPPUpsampleKernel.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/core/CPP/kernels/CPPUpsampleKernel.h"
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/helpers/WindowHelpers.h"
 
 #include <cstddef>
@@ -31,8 +32,7 @@
 
 namespace arm_compute
 {
-CPPUpsampleKernel::CPPUpsampleKernel()
-    : _input(nullptr), _output(nullptr), _info()
+CPPUpsampleKernel::CPPUpsampleKernel() : _input(nullptr), _output(nullptr), _info()
 {
 }
 
@@ -82,7 +82,7 @@
     const size_t element_size  = _input->info()->element_size();
 
     // The fill value is normally 0, but for quantized types '0' corresponds to the offset
-    switch(_output->info()->data_type())
+    switch (_output->info()->data_type())
     {
         case DataType::QASYMM8:
         {
@@ -102,7 +102,7 @@
 
     // Create window
     Window window_out(window);
-    if(data_layout == DataLayout::NCHW)
+    if (data_layout == DataLayout::NCHW)
     {
         window_out.set(Window::DimX, Window::Dimension(start_width, end_width, stride_width));
         window_out.set(Window::DimY, Window::Dimension(start_height, end_height, stride_height));
@@ -117,10 +117,7 @@
     Iterator in(_input, window);
     Iterator out(_output, window_out);
 
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        memcpy(out.ptr(), in.ptr(), element_size);
-    },
-    in, out);
+    execute_window_loop(
+        window, [&](const Coordinates &) { memcpy(out.ptr(), in.ptr(), element_size); }, in, out);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
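
The switch above selects the border fill value: for affine-quantized types, real 0 maps to the zero-point rather than to raw 0, since q = round(x / scale) + offset gives q = offset at x = 0. A minimal sketch, assuming the library's UniformQuantizationInfo with its offset member:

// Sketch: the quantized value that represents real 0.0f under an affine scheme.
inline uint8_t qasymm8_zero(const UniformQuantizationInfo &qinfo)
{
    return static_cast<uint8_t>(qinfo.offset);
}
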
diff --git a/src/core/Error.cpp b/src/core/Error.cpp
index 5c8d45c..679a93f 100644
--- a/src/core/Error.cpp
+++ b/src/core/Error.cpp
@@ -36,9 +36,10 @@
     return Status(error_code, msg);
 }
 
-Status arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg)
+Status
+arm_compute::create_error_msg(ErrorCode error_code, const char *func, const char *file, int line, const char *msg)
 {
-    std::array<char, 512> out{ 0 };
+    std::array<char, 512> out{0};
     snprintf(out.data(), out.size(), "in %s %s:%d: %s", func, file, line, msg);
     return Status(error_code, std::string(out.data()));
 }
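
create_error_msg, now with its return type on its own line, formats the message into a fixed 512-byte buffer as "in <func> <file>:<line>: <msg>". A usage sketch; the chosen error code and message are illustrative:

// Sketch: building a located error status via the helper above.
Status s = arm_compute::create_error_msg(ErrorCode::RUNTIME_ERROR, __func__, __FILE__, __LINE__,
                                         "Unsupported configuration");
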
diff --git a/src/core/GPUTarget.cpp b/src/core/GPUTarget.cpp
index 292acf8..2d1a13c 100644
--- a/src/core/GPUTarget.cpp
+++ b/src/core/GPUTarget.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/GPUTarget.h"
+
 #include "arm_compute/core/Log.h"
 
 #include <map>
@@ -31,47 +32,47 @@
 {
 arm_compute::GPUTarget get_valhall_target(const std::string &version)
 {
-    if(version.find("G77") != std::string::npos)
+    if (version.find("G77") != std::string::npos)
     {
         return arm_compute::GPUTarget::G77;
     }
-    else if(version.find("G57") != std::string::npos)
+    else if (version.find("G57") != std::string::npos)
     {
         return arm_compute::GPUTarget::G57;
     }
-    if(version.find("G68") != std::string::npos)
+    if (version.find("G68") != std::string::npos)
     {
         return arm_compute::GPUTarget::G68;
     }
-    if(version.find("G78AE") != std::string::npos)
+    if (version.find("G78AE") != std::string::npos)
     {
         return arm_compute::GPUTarget::G78AE;
     }
-    if(version.find("G78") != std::string::npos)
+    if (version.find("G78") != std::string::npos)
     {
         return arm_compute::GPUTarget::G78;
     }
-    else if(version.find("G710") != std::string::npos)
+    else if (version.find("G710") != std::string::npos)
     {
         return arm_compute::GPUTarget::G710;
     }
-    else if(version.find("G610") != std::string::npos)
+    else if (version.find("G610") != std::string::npos)
     {
         return arm_compute::GPUTarget::G610;
     }
-    else if(version.find("G510") != std::string::npos)
+    else if (version.find("G510") != std::string::npos)
     {
         return arm_compute::GPUTarget::G510;
     }
-    else if(version.find("G310") != std::string::npos)
+    else if (version.find("G310") != std::string::npos)
     {
         return arm_compute::GPUTarget::G310;
     }
-    else if(version.find("G715") != std::string::npos)
+    else if (version.find("G715") != std::string::npos)
     {
         return arm_compute::GPUTarget::G715;
     }
-    else if(version.find("G615") != std::string::npos)
+    else if (version.find("G615") != std::string::npos)
     {
         return arm_compute::GPUTarget::G615;
     }
@@ -83,39 +84,39 @@
 
 arm_compute::GPUTarget get_bifrost_target(const std::string &version)
 {
-    if(version.find("G71") != std::string::npos)
+    if (version.find("G71") != std::string::npos)
     {
         return arm_compute::GPUTarget::G71;
     }
-    else if(version.find("G72") != std::string::npos)
+    else if (version.find("G72") != std::string::npos)
     {
         return arm_compute::GPUTarget::G72;
     }
-    else if(version.find("G51BIG") != std::string::npos)
+    else if (version.find("G51BIG") != std::string::npos)
     {
         return arm_compute::GPUTarget::G51BIG;
     }
-    else if(version.find("G51LIT") != std::string::npos)
+    else if (version.find("G51LIT") != std::string::npos)
     {
         return arm_compute::GPUTarget::G51LIT;
     }
-    else if(version.find("G51") != std::string::npos)
+    else if (version.find("G51") != std::string::npos)
     {
         return arm_compute::GPUTarget::G51;
     }
-    else if(version.find("G52LIT") != std::string::npos)
+    else if (version.find("G52LIT") != std::string::npos)
     {
         return arm_compute::GPUTarget::G52LIT;
     }
-    else if(version.find("G52") != std::string::npos)
+    else if (version.find("G52") != std::string::npos)
     {
         return arm_compute::GPUTarget::G52;
     }
-    else if(version.find("G76") != std::string::npos)
+    else if (version.find("G76") != std::string::npos)
     {
         return arm_compute::GPUTarget::G76;
     }
-    else if(version.find("G31") != std::string::npos)
+    else if (version.find("G31") != std::string::npos)
     {
         return arm_compute::GPUTarget::G31;
     }
@@ -127,15 +128,15 @@
 
 arm_compute::GPUTarget get_midgard_target(const std::string &version)
 {
-    if(version.find("T600") != std::string::npos)
+    if (version.find("T600") != std::string::npos)
     {
         return arm_compute::GPUTarget::T600;
     }
-    else if(version.find("T700") != std::string::npos)
+    else if (version.find("T700") != std::string::npos)
     {
         return arm_compute::GPUTarget::T700;
     }
-    else if(version.find("T800") != std::string::npos)
+    else if (version.find("T800") != std::string::npos)
     {
         return arm_compute::GPUTarget::T800;
     }
@@ -150,34 +151,16 @@
 {
 const std::string &string_from_target(GPUTarget target)
 {
-    static std::map<GPUTarget, const std::string> gpu_target_map =
-    {
-        { GPUTarget::MIDGARD, "midgard" },
-        { GPUTarget::BIFROST, "bifrost" },
-        { GPUTarget::VALHALL, "valhall" },
-        { GPUTarget::T600, "t600" },
-        { GPUTarget::T700, "t700" },
-        { GPUTarget::T800, "t800" },
-        { GPUTarget::G71, "g71" },
-        { GPUTarget::G72, "g72" },
-        { GPUTarget::G51, "g51" },
-        { GPUTarget::G51BIG, "g51big" },
-        { GPUTarget::G51LIT, "g51lit" },
-        { GPUTarget::G31, "g31" },
-        { GPUTarget::G76, "g76" },
-        { GPUTarget::G52, "g52" },
-        { GPUTarget::G52LIT, "g52lit" },
-        { GPUTarget::G77, "g77" },
-        { GPUTarget::G57, "g57" },
-        { GPUTarget::G78, "g78" },
-        { GPUTarget::G68, "g68" },
-        { GPUTarget::G78AE, "g78ae" },
-        { GPUTarget::G710, "g710" },
-        { GPUTarget::G610, "g610" },
-        { GPUTarget::G510, "g510" },
-        { GPUTarget::G310, "g310" },
-        { GPUTarget::G715, "g715" },
-        { GPUTarget::G615, "g615" },
+    static std::map<GPUTarget, const std::string> gpu_target_map = {
+        {GPUTarget::MIDGARD, "midgard"}, {GPUTarget::BIFROST, "bifrost"}, {GPUTarget::VALHALL, "valhall"},
+        {GPUTarget::T600, "t600"},       {GPUTarget::T700, "t700"},       {GPUTarget::T800, "t800"},
+        {GPUTarget::G71, "g71"},         {GPUTarget::G72, "g72"},         {GPUTarget::G51, "g51"},
+        {GPUTarget::G51BIG, "g51big"},   {GPUTarget::G51LIT, "g51lit"},   {GPUTarget::G31, "g31"},
+        {GPUTarget::G76, "g76"},         {GPUTarget::G52, "g52"},         {GPUTarget::G52LIT, "g52lit"},
+        {GPUTarget::G77, "g77"},         {GPUTarget::G57, "g57"},         {GPUTarget::G78, "g78"},
+        {GPUTarget::G68, "g68"},         {GPUTarget::G78AE, "g78ae"},     {GPUTarget::G710, "g710"},
+        {GPUTarget::G610, "g610"},       {GPUTarget::G510, "g510"},       {GPUTarget::G310, "g310"},
+        {GPUTarget::G715, "g715"},       {GPUTarget::G615, "g615"},
     };
 
     return gpu_target_map[target];
@@ -189,7 +172,7 @@
     std::smatch name_parts;
     const bool  found_mali = std::regex_search(device_name, name_parts, mali_regex);
 
-    if(!found_mali)
+    if (!found_mali)
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Can't find valid Arm® Mali™ GPU. Target is set to default.");
         return GPUTarget::MIDGARD;
@@ -203,22 +186,22 @@
 
     // Work-out gpu target
     GPUTarget gpu_target;
-    if(target == 'G' || is_future_gpu)
+    if (target == 'G' || is_future_gpu)
     {
         // Check for Valhall or Bifrost
         gpu_target = get_valhall_target(version);
-        if(gpu_target == GPUTarget::UNKNOWN)
+        if (gpu_target == GPUTarget::UNKNOWN)
         {
             gpu_target = get_bifrost_target(version);
         }
 
         // Default GPUTarget
-        if(gpu_target == GPUTarget::UNKNOWN)
+        if (gpu_target == GPUTarget::UNKNOWN)
         {
             gpu_target = GPUTarget::VALHALL;
         }
     }
-    else if(target == 'T')
+    else if (target == 'T')
     {
         gpu_target = get_midgard_target(version);
     }
@@ -228,7 +211,7 @@
     }
 
     // Report in case of unknown target
-    if(gpu_target == GPUTarget::UNKNOWN)
+    if (gpu_target == GPUTarget::UNKNOWN)
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Arm® Mali™ Mali GPU unknown. Target is set to the default one. (BIFROST)");
         return GPUTarget::BIFROST;
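
string_from_target keeps its lookup behaviour; only the initializer above was packed into aligned columns. A usage sketch:

// Sketch: the map above resolves a target to its lower-case name.
const std::string &name = arm_compute::string_from_target(arm_compute::GPUTarget::G77); // "g77"
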
diff --git a/src/core/Helpers.cpp b/src/core/Helpers.cpp
index 28e7f4c..c801b09 100644
--- a/src/core/Helpers.cpp
+++ b/src/core/Helpers.cpp
@@ -25,8 +25,11 @@
 
 namespace arm_compute
 {
-ValidRegion calculate_valid_region_scale(const ITensorInfo &src_info, const TensorShape &dst_shape,
-                                         InterpolationPolicy interpolate_policy, SamplingPolicy sampling_policy, bool border_undefined)
+ValidRegion calculate_valid_region_scale(const ITensorInfo  &src_info,
+                                         const TensorShape  &dst_shape,
+                                         InterpolationPolicy interpolate_policy,
+                                         SamplingPolicy      sampling_policy,
+                                         bool                border_undefined)
 {
     const DataLayout data_layout = src_info.data_layout();
     const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -49,9 +52,9 @@
     auto valid_end_out_y   = std::min<int>(std::ceil(valid_end_in_y * scale_y), dst_shape[idx_height]);
 
     // Handle valid points in case of the bi-linear interpolation
-    if(border_undefined)
+    if (border_undefined)
     {
-        switch(interpolate_policy)
+        switch (interpolate_policy)
         {
             case InterpolationPolicy::NEAREST_NEIGHBOR:
             {
@@ -90,7 +93,7 @@
     }
 
     // Setup output valid region
-    ValidRegion valid_region{ Coordinates(), dst_shape, dst_shape.num_dimensions() };
+    ValidRegion valid_region{Coordinates(), dst_shape, dst_shape.num_dimensions()};
 
     valid_region.anchor.set(idx_width, std::max(0, valid_start_out_x));
     valid_region.anchor.set(idx_height, std::max(0, valid_start_out_y));
@@ -109,14 +112,12 @@
     constexpr DataLayoutDimension D = DataLayoutDimension::DEPTH;
     constexpr DataLayoutDimension N = DataLayoutDimension::BATCHES;
 
-    static const std::map<DataLayout, std::vector<DataLayoutDimension>> layout_map =
-    {
-        { DataLayout::NDHWC, { C, W, H, D, N } },
-        { DataLayout::NCDHW, { W, H, D, C, N } },
-        { DataLayout::NHWC, { C, W, H, N } },
-        { DataLayout::NCHW, { W, H, C, N } }
-    };
+    static const std::map<DataLayout, std::vector<DataLayoutDimension>> layout_map = {
+        {DataLayout::NDHWC, {C, W, H, D, N}},
+        {DataLayout::NCDHW, {W, H, D, C, N}},
+        {DataLayout::NHWC, {C, W, H, N}},
+        {DataLayout::NCHW, {W, H, C, N}}};
 
     return layout_map;
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
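
The reflowed layout_map is what get_data_layout_dimension_index resolves against: a dimension's index is its position in the per-layout vector, so WIDTH is index 1 in NHWC ({C, W, H, N}) but index 0 in NCHW ({W, H, C, N}). A usage sketch:

// Sketch: resolving dimension indices through the layout map above.
const int idx_width_nhwc = get_data_layout_dimension_index(DataLayout::NHWC, DataLayoutDimension::WIDTH); // 1
const int idx_width_nchw = get_data_layout_dimension_index(DataLayout::NCHW, DataLayoutDimension::WIDTH); // 0
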
diff --git a/src/core/IAccessWindow.cpp b/src/core/IAccessWindow.cpp
index 8328012..923c5f8 100644
--- a/src/core/IAccessWindow.cpp
+++ b/src/core/IAccessWindow.cpp
@@ -29,14 +29,18 @@
 
 using namespace arm_compute;
 
-ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, const ValidRegion &input_valid_region) const
+ValidRegion AccessWindowRectangle::compute_valid_region(const Window      &window,
+                                                        const ValidRegion &input_valid_region) const
 {
     return compute_valid_region(window, input_valid_region, false, BorderSize(0));
 }
 
-ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window, ValidRegion input_valid_region, bool border_undefined, BorderSize border_size) const
+ValidRegion AccessWindowRectangle::compute_valid_region(const Window &window,
+                                                        ValidRegion   input_valid_region,
+                                                        bool          border_undefined,
+                                                        BorderSize    border_size) const
 {
-    if(_info == nullptr)
+    if (_info == nullptr)
     {
         return input_valid_region;
     }
@@ -45,7 +49,7 @@
     Coordinates  old_anchor(anchor);
     TensorShape &shape = input_valid_region.shape;
 
-    if(!border_undefined)
+    if (!border_undefined)
     {
         border_size = BorderSize(0);
     }
@@ -56,7 +60,7 @@
     // Additionally the valid region is shifted by the offset that is used by
     // the kernel to write back output values.
     anchor.set(0, std::max<int>(window.x().start() * _scale_x, anchor[0] + border_size.left) + _x);
-    if(_info->num_dimensions() > 1)
+    if (_info->num_dimensions() > 1)
     {
         anchor.set(1, std::max<int>(window.y().start() * _scale_y, anchor[1] + border_size.top) + _y);
     }
@@ -69,15 +73,19 @@
     // old size is first converted into end points to compared against the
     // execution window. Afterwards the new end points are converted back into
     // a size of the region.
-    shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right, (window.x().end() - window.x().step()) * _scale_x + _width) - anchor[0]);
-    if(_info->num_dimensions() > 1)
+    shape.set(0, std::min<int>(old_anchor[0] + shape[0] - border_size.right,
+                               (window.x().end() - window.x().step()) * _scale_x + _width) -
+                     anchor[0]);
+    if (_info->num_dimensions() > 1)
     {
-        shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom, (window.y().end() - window.y().step()) * _scale_y + _height) - anchor[1]);
+        shape.set(1, std::min<int>(old_anchor[1] + shape[1] - border_size.bottom,
+                                   (window.y().end() - window.y().step()) * _scale_y + _height) -
+                         anchor[1]);
     }
 
     // For higher dimensions use the intersection of the window size and the
     // valid region of the input
-    for(size_t d = 2; d < _info->num_dimensions(); ++d)
+    for (size_t d = 2; d < _info->num_dimensions(); ++d)
     {
         anchor.set(d, std::max(window[d].start(), input_valid_region.anchor[d]));
         shape.set(d, std::min<int>(window[d].end(), input_valid_region.shape[d]) - anchor[d]);
@@ -86,9 +94,12 @@
     return input_valid_region;
 }
 
-void AccessWindowRectangle::set_valid_region(const Window &window, const ValidRegion &input_valid_region, bool border_undefined, const BorderSize &border_size)
+void AccessWindowRectangle::set_valid_region(const Window      &window,
+                                             const ValidRegion &input_valid_region,
+                                             bool               border_undefined,
+                                             const BorderSize  &border_size)
 {
-    if(_info != nullptr)
+    if (_info != nullptr)
     {
         _info->set_valid_region(compute_valid_region(window, input_valid_region, border_undefined, border_size));
     }
@@ -97,17 +108,16 @@
 bool AccessWindowRectangle::update_window_if_needed(Window &window) const
 {
     // Only update the window size if we can't use padding
-    if(_info == nullptr || _info->is_resizable())
+    if (_info == nullptr || _info->is_resizable())
     {
         return false;
     }
 
-    PaddingSize needed = get_needed_padding(window);
+    PaddingSize needed    = get_needed_padding(window);
     PaddingSize available = _info->padding();
 
-    if(needed.top <= available.top && needed.right <= available.right
-    && needed.bottom <= available.bottom
-    && needed.left <= available.left)
+    if (needed.top <= available.top && needed.right <= available.right && needed.bottom <= available.bottom &&
+        needed.left <= available.left)
     {
         return false;
     }
@@ -124,12 +134,12 @@
     const int max_y = (window.y().end() - window.y().step()) * _scale_y + _y + _height;
 
     // Adjust window start for Y dimension
-    if(min_y < 0)
+    if (min_y < 0)
     {
         // Calculate rows available above the tensor
         const int front_pad_y_available = -static_cast<int>(offset_first_element / strides[1]);
 
-        if(min_y < front_pad_y_available)
+        if (min_y < front_pad_y_available)
         {
             // Not enough padding available, need to shrink the window
             int start = adjust_up(min_y, front_pad_y_available, window.y().step() * _scale_y) - _y;
@@ -144,18 +154,19 @@
     }
 
     // Adjust window end for Y dimension
-    if(max_y > static_cast<int>(shape[1]))
+    if (max_y > static_cast<int>(shape[1]))
     {
         const int stride_z = _info->num_dimensions() > 2 ? strides[2] : _info->total_size();
 
         // Calculate rows available below the tensor
         const int tail_pad_y_available = (stride_z / strides[1]) - shape[1] - front_pad_y;
 
-        if(static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
+        if (static_cast<int>(shape[1]) + tail_pad_y_available < max_y)
         {
             // Not enough padding available, need to shrink the window
-            int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) + window.y().step() * _scale_y - _y - _height;
-            end     = std::max<int>(window.y().start(), end / _scale_y);
+            int end = adjust_down(max_y, shape[1] + tail_pad_y_available, window.y().step() * _scale_y) +
+                      window.y().step() * _scale_y - _y - _height;
+            end = std::max<int>(window.y().start(), end / _scale_y);
 
             window.set(1, Window::Dimension(window.y().start(), end, window.y().step()));
             window_modified = true;
@@ -170,11 +181,14 @@
     const int stride_y = _info->num_dimensions() > 1 ? strides[1] : _info->total_size();
 
     // Adjust window start for X dimension
-    if(min_x < 0)
+    if (min_x < 0)
     {
-        const int front_pad_x_available = -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1], stride_y - shape[0] * strides[0]) / static_cast<int>(strides[0]);
+        const int front_pad_x_available =
+            -std::min<int>(static_cast<int>(offset_first_element) - front_pad_y * strides[1],
+                           stride_y - shape[0] * strides[0]) /
+            static_cast<int>(strides[0]);
 
-        if(min_x < front_pad_x_available)
+        if (min_x < front_pad_x_available)
         {
             // Not enough padding available, need to shrink the window
             int start = adjust_up(min_x, front_pad_x_available, window.x().step() * _scale_x) - _x;
@@ -189,15 +203,16 @@
     }
 
     // Adjust window end for X dimension
-    if(max_x > static_cast<int>(shape[0]))
+    if (max_x > static_cast<int>(shape[0]))
     {
         const int tail_pad_x_available = (stride_y / strides[0]) - shape[0] - front_pad_x;
 
-        if(static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
+        if (static_cast<int>(shape[0]) + tail_pad_x_available < max_x)
         {
             // Not enough padding available, need to shrink the window
-            int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) + window.x().step() * _scale_x - _x - _width;
-            end     = std::max<int>(window.x().start(), end / _scale_x);
+            int end = adjust_down(max_x, shape[0] + tail_pad_x_available, window.x().step() * _scale_x) +
+                      window.x().step() * _scale_x - _x - _width;
+            end = std::max<int>(window.x().start(), end / _scale_x);
 
             window.set(0, Window::Dimension(window.x().start(), end, window.x().step()));
             window_modified = true;
@@ -212,15 +227,15 @@
 bool AccessWindowRectangle::update_padding_if_needed(const Window &window)
 {
     // Only update the padding if the tensor allows it
-    if(_info == nullptr || !_info->is_resizable())
+    if (_info == nullptr || !_info->is_resizable())
     {
         return false;
     }
     // Update strides in tensor info
-    return _info->extend_padding( get_needed_padding(window));
+    return _info->extend_padding(get_needed_padding(window));
 }
 
-PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window)const
+PaddingSize AccessWindowRectangle::get_needed_padding(const Window &window) const
 {
     ARM_COMPUTE_ERROR_ON(_scale_x == 0);
     ARM_COMPUTE_ERROR_ON(_scale_y == 0);
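
For reference, the per-dimension clamping in compute_valid_region above reduces to the following scalar sketch (a simplified illustration assuming unit scale and no kernel write offset; the names are not library code):

#include <algorithm>
#include <utility>

// 1-D sketch of the valid-region clamping performed above.
std::pair<int, int> clamp_region_1d(int anchor, int size, int win_start, int win_end, int border)
{
    const int old_anchor = anchor;
    // Shift the start to the window start, skipping the undefined border.
    anchor = std::max(win_start, old_anchor + border);
    // Convert the old size into an end point, clamp it against the window,
    // then convert it back into a size relative to the new anchor.
    const int end = std::min(old_anchor + size - border, win_end);
    return {anchor, end - anchor};
}
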
diff --git a/src/core/IKernel.cpp b/src/core/IKernel.cpp
index 31f1ec7..fb7e095 100644
--- a/src/core/IKernel.cpp
+++ b/src/core/IKernel.cpp
@@ -30,8 +30,7 @@
     return _window;
 }
 
-IKernel::IKernel()
-    : _window()
+IKernel::IKernel() : _window()
 {
     // Create an empty window to make sure the children classes set the window values themselves
     _window.set(Window::DimX, Window::Dimension(0, 0, 1));
diff --git a/src/core/ITensor.cpp b/src/core/ITensor.cpp
index 2f4354c..4dc8ea9 100644
--- a/src/core/ITensor.cpp
+++ b/src/core/ITensor.cpp
@@ -35,7 +35,7 @@
 {
 void ITensor::copy_from(const ITensor &src)
 {
-    if(&src == this)
+    if (&src == this)
     {
         return;
     }
@@ -47,7 +47,7 @@
     ARM_COMPUTE_ERROR_ON(src_info->num_channels() != dst_info->num_channels());
     ARM_COMPUTE_ERROR_ON(src_info->element_size() != dst_info->element_size());
 
-    for(size_t d = 0; d < src_info->num_dimensions(); d++)
+    for (size_t d = 0; d < src_info->num_dimensions(); d++)
     {
         ARM_COMPUTE_ERROR_ON(src_info->dimension(d) > dst_info->dimension(d));
     }
@@ -66,11 +66,7 @@
     const size_t line_size = src_info->element_size() * src_info->dimension(0);
 
     execute_window_loop(
-        win_src, [&](const Coordinates &)
-    {
-        memcpy(dst_it.ptr(), src_it.ptr(), line_size);
-    },
-    src_it, dst_it);
+        win_src, [&](const Coordinates &) { memcpy(dst_it.ptr(), src_it.ptr(), line_size); }, src_it, dst_it);
 }
 
 #ifdef ARM_COMPUTE_ASSERTS_ENABLED
@@ -87,10 +83,10 @@
     stream_status.copyfmt(s);
 
     // Set precision
-    if(is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default))
+    if (is_data_type_float(dt) && (io_fmt.precision_type != IOFormatInfo::PrecisionType::Default))
     {
         int precision = io_fmt.precision;
-        if(io_fmt.precision_type == IOFormatInfo::PrecisionType::Full)
+        if (io_fmt.precision_type == IOFormatInfo::PrecisionType::Full)
         {
             precision = std::numeric_limits<float>().max_digits10;
         }
@@ -101,7 +97,7 @@
     size_t print_width  = 0;
     size_t print_height = 0;
     int    start_offset = 0;
-    switch(io_fmt.print_region)
+    switch (io_fmt.print_region)
     {
         case IOFormatInfo::PrintRegion::NoPadding:
             print_width  = this->info()->dimension(0);
@@ -111,13 +107,14 @@
         case IOFormatInfo::PrintRegion::ValidRegion:
             print_width  = this->info()->valid_region().shape.x();
             print_height = this->info()->valid_region().shape.y();
-            start_offset = this->info()->offset_element_in_bytes(Coordinates(this->info()->valid_region().anchor.x(),
-                                                                             this->info()->valid_region().anchor.y()));
+            start_offset = this->info()->offset_element_in_bytes(
+                Coordinates(this->info()->valid_region().anchor.x(), this->info()->valid_region().anchor.y()));
             break;
         case IOFormatInfo::PrintRegion::Full:
             print_width  = padding.left + this->info()->dimension(0) + padding.right;
             print_height = padding.top + this->info()->dimension(1) + padding.bottom;
-            start_offset = static_cast<int>(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] - padding.left * strides[0];
+            start_offset = static_cast<int>(this->info()->offset_first_element_in_bytes()) - padding.top * strides[1] -
+                           padding.left * strides[0];
             break;
         default:
             break;
@@ -129,16 +126,17 @@
     const uint8_t *ptr = this->buffer() + start_offset;
 
     // Start printing
-    for(size_t i = 0; i < slices2D; ++i)
+    for (size_t i = 0; i < slices2D; ++i)
     {
         // Find max_width of elements in slice to align columns
         int max_element_width = 0;
-        if(io_fmt.align_columns)
+        if (io_fmt.align_columns)
         {
             size_t offset = i * strides[2];
-            for(size_t h = 0; h < print_height; ++h)
+            for (size_t h = 0; h < print_height; ++h)
             {
-                max_element_width = std::max<int>(max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width));
+                max_element_width = std::max<int>(
+                    max_element_width, max_consecutive_elements_display_width(s, dt, ptr + offset, print_width));
                 offset += strides[1];
             }
         }
@@ -146,7 +144,7 @@
         // Print slice
         {
             size_t offset = i * strides[2];
-            for(size_t h = 0; h < print_height; ++h)
+            for (size_t h = 0; h < print_height; ++h)
             {
                 print_consecutive_elements(s, dt, ptr + offset, print_width, max_element_width, io_fmt.element_delim);
                 offset += strides[1];
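
A small worked example of the Full-region start offset computed above (illustrative numbers, not taken from the library):

// A 4x3 float tensor with one element of padding on every side:
//   strides[0] = 4 bytes, strides[1] = (1 + 4 + 1) * 4 = 24 bytes,
//   offset_first_element_in_bytes = 1 * 24 + 1 * 4 = 28 bytes.
// The Full print region then starts at
//   28 - padding.top * 24 - padding.left * 4 = 28 - 24 - 4 = 0,
// i.e. at the first allocated byte, so the padding is printed as well.
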
diff --git a/src/core/ITensorPack.cpp b/src/core/ITensorPack.cpp
index 90f9a45..0f8b082 100644
--- a/src/core/ITensorPack.cpp
+++ b/src/core/ITensorPack.cpp
@@ -27,10 +27,9 @@
 
 namespace arm_compute
 {
-ITensorPack::ITensorPack(std::initializer_list<PackElement> l)
-    : _pack()
+ITensorPack::ITensorPack(std::initializer_list<PackElement> l) : _pack()
 {
-    for(auto &e : l)
+    for (auto &e : l)
     {
         _pack[e.id] = e;
     }
@@ -54,7 +53,7 @@
 const ITensor *ITensorPack::get_const_tensor(int id) const
 {
     auto it = _pack.find(id);
-    if(it != _pack.end())
+    if (it != _pack.end())
     {
         return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
     }
@@ -81,4 +80,4 @@
 {
     return _pack.empty();
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/NEON/NEAsymm.h b/src/core/NEON/NEAsymm.h
index e6d0e53..5f4d08d 100644
--- a/src/core/NEON/NEAsymm.h
+++ b/src/core/NEON/NEAsymm.h
@@ -26,6 +26,7 @@
 
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
@@ -90,7 +91,7 @@
 {
     const static int32x4_t zero_s32 = vdupq_n_s32(0);
 
-    if(result_shift < 0)
+    if (result_shift < 0)
     {
         in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
         in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
@@ -130,18 +131,13 @@
     in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
 
     // Convert S32 to S16
-    const int16x8x2_t in_s16 =
-    {
-        {
-            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
-            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
-        }
-    };
+    const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+                                 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
 
     // Convert S16 to U8
     uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
 
-    if(is_bounded_relu)
+    if (is_bounded_relu)
     {
         out_u8 = vmaxq_u8(out_u8, min_u8);
         out_u8 = vminq_u8(out_u8, max_u8);
@@ -170,7 +166,7 @@
                                        int8x16_t    max_s8,
                                        bool         is_bounded_relu)
 {
-    if(result_shift < 0)
+    if (result_shift < 0)
     {
         in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << (-result_shift)));
         in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << (-result_shift)));
@@ -204,18 +200,13 @@
     in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
 
     // Convert S32 to S16
-    const int16x8x2_t in_s16 =
-    {
-        {
-            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
-            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
-        }
-    };
+    const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+                                 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
 
     // Convert S16 to S8
     int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
 
-    if(is_bounded_relu)
+    if (is_bounded_relu)
     {
         out_s8 = vmaxq_s8(out_s8, min_s8);
         out_s8 = vminq_s8(out_s8, max_s8);
@@ -247,8 +238,7 @@
     const static int32x4_t one_s32 = vdupq_n_s32(1);
 
     // Fixed-point multiplication using the vector saturating rounding doubling multiply-high operation (vqrdmulhq)
-    int32x4x4_t res_shift_gt0 =
-    {
+    int32x4x4_t res_shift_gt0 = {
         vqrdmulhq_s32(in_s32.val[0], result_fixedpoint_multiplier.val[0]),
         vqrdmulhq_s32(in_s32.val[1], result_fixedpoint_multiplier.val[1]),
         vqrdmulhq_s32(in_s32.val[2], result_fixedpoint_multiplier.val[2]),
@@ -260,8 +250,7 @@
     res_shift_gt0.val[2] = rounding_divide_by_pow2(res_shift_gt0.val[2], result_shift.val[2]);
     res_shift_gt0.val[3] = rounding_divide_by_pow2(res_shift_gt0.val[3], result_shift.val[3]);
 
-    int32x4x4_t res_shift_lt0 =
-    {
+    int32x4x4_t res_shift_lt0 = {
         vmulq_s32(in_s32.val[0], vshlq_s32(one_s32, vnegq_s32(result_shift.val[0]))),
         vmulq_s32(in_s32.val[1], vshlq_s32(one_s32, vnegq_s32(result_shift.val[1]))),
         vmulq_s32(in_s32.val[2], vshlq_s32(one_s32, vnegq_s32(result_shift.val[2]))),
@@ -273,8 +262,7 @@
     res_shift_lt0.val[3] = vqrdmulhq_s32(res_shift_lt0.val[3], result_fixedpoint_multiplier.val[3]);
 
     // Select result depending on shift value
-    const uint32x4x4_t mask_lt0 =
-    {
+    const uint32x4x4_t mask_lt0 = {
 #ifdef __aarch64__
         vcltzq_s32(result_shift.val[0]),
         vcltzq_s32(result_shift.val[1]),
@@ -300,18 +288,13 @@
     in_s32.val[3] = vaddq_s32(in_s32.val[3], result_offset_after_shift_s32);
 
     // Convert S32 to S16
-    const int16x8x2_t in_s16 =
-    {
-        {
-            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
-            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
-        }
-    };
+    const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+                                 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
 
     // Convert S16 to S8
     int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
 
-    if(is_bounded_relu)
+    if (is_bounded_relu)
     {
         out_s8 = vmaxq_s8(out_s8, min_s8);
         out_s8 = vminq_s8(out_s8, max_s8);
@@ -332,15 +315,20 @@
  *
  * @return Quantized value
  */
-inline uint8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
-                                     int32_t result_shift, int32_t result_offset_after_shift_s32,
-                                     uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu)
+inline uint8_t finalize_quantization(int32_t in_value,
+                                     int     result_fixedpoint_multiplier,
+                                     int32_t result_shift,
+                                     int32_t result_offset_after_shift_s32,
+                                     uint8_t min_u8,
+                                     uint8_t max_u8,
+                                     bool    is_bounded_relu)
 {
     int32x4_t in_s32 = vdupq_n_s32(in_value);
 
-    if(result_shift < 0)
+    if (result_shift < 0)
     {
-        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
+        in_value = vgetq_lane_s32(
+            vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
     }
     else
     {
@@ -355,7 +343,7 @@
 
     // Bound the result
     uint8_t out_u8 = static_cast<uint8_t>(std::max<int32_t>(0, std::min<int32_t>(255, in_value)));
-    if(is_bounded_relu)
+    if (is_bounded_relu)
     {
         out_u8 = static_cast<uint8_t>(std::max(min_u8, std::min(max_u8, out_u8)));
     }
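
Per value, the uint8 path of finalize_quantization above is a fixed-point multiply, a rounding right shift, a zero-point add and a clamp. A scalar sketch for the result_shift >= 0 case (requantize_u8 is an illustrative name; the negative-value fixup done by rounding_divide_by_pow2 is omitted):

#include <algorithm>
#include <cstdint>

// Scalar sketch of the uint8 requantization above (result_shift >= 0).
inline uint8_t requantize_u8(int32_t in, int32_t mult, int32_t shift, int32_t offset)
{
    // Saturating rounding doubling multiply-high, as vqrdmulhq_n_s32 does.
    const int32_t high = (int32_t)(((int64_t)in * mult + (1LL << 30)) >> 31);
    // Rounding divide by 2^shift (negative-value fixup omitted).
    const int32_t rounded = shift > 0 ? (high + (1 << (shift - 1))) >> shift : high;
    // Add the zero-point offset and clamp to the uint8 range.
    return (uint8_t)std::max(0, std::min(255, rounded + offset));
}

When is_bounded_relu is set, the result is additionally clamped to [min_u8, max_u8] as in the code above.
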
@@ -375,15 +363,20 @@
  *
  * @return Quantized value
  */
-inline int8_t finalize_quantization(int32_t in_value, int result_fixedpoint_multiplier,
-                                    int32_t result_shift, int32_t result_offset_after_shift_s32,
-                                    int8_t min_s8, int8_t max_s8, bool is_bounded_relu)
+inline int8_t finalize_quantization(int32_t in_value,
+                                    int     result_fixedpoint_multiplier,
+                                    int32_t result_shift,
+                                    int32_t result_offset_after_shift_s32,
+                                    int8_t  min_s8,
+                                    int8_t  max_s8,
+                                    bool    is_bounded_relu)
 {
     int32x4_t in_s32 = vdupq_n_s32(in_value);
 
-    if(result_shift < 0)
+    if (result_shift < 0)
     {
-        in_value = vgetq_lane_s32(vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
+        in_value = vgetq_lane_s32(
+            vqrdmulhq_n_s32(vmulq_n_s32(in_s32, (1 << (-result_shift))), result_fixedpoint_multiplier), 0);
     }
     else
     {
@@ -399,7 +392,7 @@
 
     // Bound the result
     int8_t out_s8 = static_cast<int8_t>(std::max<int32_t>(-128, std::min<int32_t>(127, in_value)));
-    if(is_bounded_relu)
+    if (is_bounded_relu)
     {
         out_s8 = static_cast<int8_t>(std::max(min_s8, std::min(max_s8, out_s8)));
     }
@@ -416,17 +409,16 @@
  */
 inline float32x4x2_t vdequantize(const uint8x8_t &qv, const UniformQuantizationInfo &qi)
 {
-    const float         scale   = qi.scale;
-    const int           offset  = qi.offset;
-    const int32x4_t     voffset = vdupq_n_s32(offset);
-    const float32x4_t   vscale  = vdupq_n_f32(scale);
-    const float32x4x2_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)), vscale),
-        }
-    };
+    const float         scale              = qi.scale;
+    const int           offset             = qi.offset;
+    const int32x4_t     voffset            = vdupq_n_s32(offset);
+    const float32x4_t   vscale             = vdupq_n_f32(scale);
+    const float32x4x2_t vdequantized_input = {{
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(qv)))), voffset)),
+                  vscale),
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(qv)))), voffset)),
+                  vscale),
+    }};
     return vdequantized_input;
 }
 
@@ -439,17 +431,14 @@
  */
 inline float32x4x2_t vdequantize(const int8x8_t &qv, const UniformQuantizationInfo &qi)
 {
-    const float         scale   = qi.scale;
-    const int           offset  = qi.offset;
-    const int32x4_t     voffset = vdupq_n_s32(offset);
-    const float32x4_t   vscale  = vdupq_n_f32(scale);
-    const float32x4x2_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
-        }
-    };
+    const float         scale              = qi.scale;
+    const int           offset             = qi.offset;
+    const int32x4_t     voffset            = vdupq_n_s32(offset);
+    const float32x4_t   vscale             = vdupq_n_f32(scale);
+    const float32x4x2_t vdequantized_input = {{
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(qv))), voffset)), vscale),
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(qv))), voffset)), vscale),
+    }};
     return vdequantized_input;
 }
 
@@ -462,19 +451,24 @@
  */
 inline float32x4x4_t vdequantize(const uint8x16_t &qv, const UniformQuantizationInfo &qi)
 {
-    const float         scale   = qi.scale;
-    const int           offset  = qi.offset;
-    const int32x4_t     voffset = vdupq_n_s32(offset);
-    const float32x4_t   vscale  = vdupq_n_f32(scale);
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
-        }
-    };
+    const float         scale              = qi.scale;
+    const int           offset             = qi.offset;
+    const int32x4_t     voffset            = vdupq_n_s32(offset);
+    const float32x4_t   vscale             = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input = {{
+        vmulq_f32(vcvtq_f32_s32(
+                      vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+                  vscale),
+        vmulq_f32(vcvtq_f32_s32(
+                      vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+                  vscale),
+        vmulq_f32(vcvtq_f32_s32(
+                      vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+                  vscale),
+        vmulq_f32(vcvtq_f32_s32(
+                      vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+                  vscale),
+    }};
     return vdequantized_input;
 }
 
@@ -487,19 +481,16 @@
  */
 inline float32x4x4_t vdequantize(const int8x16_t &qv, const UniformQuantizationInfo &qi)
 {
-    const float         scale   = qi.scale;
-    const int           offset  = qi.offset;
-    const int32x4_t     voffset = vdupq_n_s32(offset);
-    const float32x4_t   vscale  = vdupq_n_f32(scale);
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
-        }
-    };
+    const float         scale              = qi.scale;
+    const int           offset             = qi.offset;
+    const int32x4_t     voffset            = vdupq_n_s32(offset);
+    const float32x4_t   vscale             = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input = {{
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+    }};
     return vdequantized_input;
 }
 
@@ -513,17 +504,22 @@
  */
 inline float32x4x4_t vdequantize(const uint8x16_t &qv, float scale, int32_t offset)
 {
-    const int32x4_t     voffset = vdupq_n_s32(offset);
-    const float32x4_t   vscale  = vdupq_n_f32(scale);
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)), vscale),
-        }
-    };
+    const int32x4_t     voffset            = vdupq_n_s32(offset);
+    const float32x4_t   vscale             = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input = {{
+        vmulq_f32(vcvtq_f32_s32(
+                      vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+                  vscale),
+        vmulq_f32(vcvtq_f32_s32(
+                      vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(qv))))), voffset)),
+                  vscale),
+        vmulq_f32(vcvtq_f32_s32(
+                      vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+                  vscale),
+        vmulq_f32(vcvtq_f32_s32(
+                      vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(qv))))), voffset)),
+                  vscale),
+    }};
     return vdequantized_input;
 }
 
@@ -537,17 +533,14 @@
  */
 inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale, int32_t offset)
 {
-    const int32x4_t     voffset = vdupq_n_s32(offset);
-    const float32x4_t   vscale  = vdupq_n_f32(scale);
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
-        }
-    };
+    const int32x4_t     voffset            = vdupq_n_s32(offset);
+    const float32x4_t   vscale             = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input = {{
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv)))), voffset)), vscale),
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv)))), voffset)), vscale),
+    }};
     return vdequantized_input;
 }
 
@@ -560,15 +553,12 @@
  */
 inline float32x4x4_t vdequantize(const int8x16_t &qv, const float32x4x4_t vscale)
 {
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
-        }
-    };
+    const float32x4x4_t vdequantized_input = {{
+        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[0]),
+        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale.val[1]),
+        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[2]),
+        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale.val[3]),
+    }};
     return vdequantized_input;
 }
 
@@ -581,16 +571,13 @@
  */
 inline float32x4x4_t vdequantize(const int8x16_t &qv, float scale)
 {
-    const float32x4_t   vscale = vdupq_n_f32(scale);
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
-        }
-    };
+    const float32x4_t   vscale             = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input = {{
+        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
+        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(qv))))), vscale),
+        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
+        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(qv))))), vscale),
+    }};
     return vdequantized_input;
 }
 
@@ -607,18 +594,15 @@
     const int         offset    = qi.offset;
     const float32x4_t voffset   = vdupq_n_f32(offset);
     const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
-    const int32x4x4_t rf =
-    {
-        {
+    const int32x4x4_t rf        = {{
 #ifdef __aarch64__
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+        vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+        vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
 #else  //__aarch64__
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+        vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+        vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
 #endif //__aarch64__
-        }
-    };
+    }};
     return vqmovun_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
 }
 
@@ -635,18 +619,15 @@
     const int         offset    = qi.offset;
     const float32x4_t voffset   = vdupq_n_f32(offset);
     const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
-    const int32x4x4_t rf =
-    {
-        {
+    const int32x4x4_t rf        = {{
 #ifdef __aarch64__
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+        vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+        vcvtnq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
 #else  //__aarch64__
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
-            vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
+        vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[0], vinvscale)),
+        vcvtq_s32_f32(vmlaq_f32(voffset, qv.val[1], vinvscale)),
 #endif //__aarch64__
-        }
-    };
+    }};
     return vqmovn_s16(vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])));
 }
 
@@ -654,22 +635,19 @@
 {
     const int32x4_t   voffset   = vdupq_n_s32(offset);
     const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
-    const int32x4x4_t rf =
-    {
-        {
+    const int32x4x4_t rf        = {{
 #ifdef __aarch64__
-            vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
-            vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
-            vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
-            vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
+        vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+        vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+        vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+        vaddq_s32(vcvtaq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
 #else  //__aarch64__
-            vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
-            vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
-            vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
-            vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
+        vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), voffset),
+        vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)), voffset),
+        vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)), voffset),
+        vaddq_s32(vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)), voffset),
 #endif //__aarch64__
-        }
-    };
+    }};
     return rf;
 }
 
@@ -715,7 +693,7 @@
     auto             rf = vquantize_internal(qv, qi.scale, qi.offset);
     const uint16x8_t pa = vcombine_u16(vqmovun_s32(rf.val[0]), vqmovun_s32(rf.val[1]));
     const uint16x8_t pb = vcombine_u16(vqmovun_s32(rf.val[2]), vqmovun_s32(rf.val[3]));
-    return { pa, pb };
+    return {pa, pb};
 }
 
 } // namespace arm_compute
diff --git a/src/core/NEON/NEAsymm.inl b/src/core/NEON/NEAsymm.inl
index ca2aea1..fd62fd4 100644
--- a/src/core/NEON/NEAsymm.inl
+++ b/src/core/NEON/NEAsymm.inl
@@ -51,14 +51,14 @@
     D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
     // Convert float32 vectors to uint32 vectors
 #if __aarch64__
-    if(round_policy == RoundingPolicy::TO_NEAREST_EVEN)
+    if (round_policy == RoundingPolicy::TO_NEAREST_EVEN)
     {
         A_u32x4 = vcvtnq_u32_f32(A_f32x4);
         B_u32x4 = vcvtnq_u32_f32(B_f32x4);
         C_u32x4 = vcvtnq_u32_f32(C_f32x4);
         D_u32x4 = vcvtnq_u32_f32(D_f32x4);
     }
-    else if(round_policy == RoundingPolicy::TO_NEAREST_UP)
+    else if (round_policy == RoundingPolicy::TO_NEAREST_UP)
     {
         A_u32x4 = vcvtaq_u32_f32(A_f32x4);
         B_u32x4 = vcvtaq_u32_f32(B_f32x4);
@@ -86,7 +86,7 @@
     return vcombine_u8(vqmovn_u16(vd_low_u16x8), vqmovn_u16(vd_high_u16x8));
 }
 
-template <RoundingPolicy   round_policy>
+template <RoundingPolicy round_policy>
 inline qasymm8x16_signed_t vmlaq_qasymm8_signed(qasymm8x16_signed_t vd, float32x4_t vs, float32x4_t vo)
 {
     // Convert uint8 vectors to int16 vectors
@@ -110,14 +110,14 @@
     C_f32x4 = vmlaq_f32(vo, C_f32x4, vs);
     D_f32x4 = vmlaq_f32(vo, D_f32x4, vs);
 #if __aarch64__
-    if(round_policy == RoundingPolicy::TO_NEAREST_EVEN)
+    if (round_policy == RoundingPolicy::TO_NEAREST_EVEN)
     {
         A_s32x4 = vcvtnq_s32_f32(A_f32x4);
         B_s32x4 = vcvtnq_s32_f32(B_f32x4);
         C_s32x4 = vcvtnq_s32_f32(C_f32x4);
         D_s32x4 = vcvtnq_s32_f32(D_f32x4);
     }
-    else if(round_policy == RoundingPolicy::TO_NEAREST_UP)
+    else if (round_policy == RoundingPolicy::TO_NEAREST_UP)
     {
         A_s32x4 = vcvtaq_s32_f32(A_f32x4);
         B_s32x4 = vcvtaq_s32_f32(B_f32x4);
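
One lane of vmlaq_qasymm8<round_policy> above behaves like the following scalar sketch (illustrative helper; assumes the default FE_TONEAREST floating-point environment):

#include <cmath>
#include <cstdint>

// Scalar sketch of one lane of vmlaq_qasymm8<round_policy> above.
inline uint8_t vmla_lane_u8(uint8_t vd_lane, float vs_lane, float vo_lane, bool to_nearest_even)
{
    const float acc = vo_lane + (float)vd_lane * vs_lane; // vmlaq_f32(vo, d, vs)
    // TO_NEAREST_EVEN maps to vcvtnq_u32_f32 (ties to even);
    // TO_NEAREST_UP maps to vcvtaq_u32_f32 (ties away from zero).
    const long r = to_nearest_even ? std::lrint(acc) : std::lround(acc);
    return (uint8_t)(r < 0 ? 0 : (r > 255 ? 255 : r)); // saturating narrow
}
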
diff --git a/src/core/NEON/NEFixedPoint.inl b/src/core/NEON/NEFixedPoint.inl
index 8bff9c4..fb403b6 100644
--- a/src/core/NEON/NEFixedPoint.inl
+++ b/src/core/NEON/NEFixedPoint.inl
@@ -30,13 +30,7 @@
 
 inline float32x4x2_t vmax2q_f32(float32x4x2_t a, float32x4x2_t b)
 {
-    float32x4x2_t res =
-    {
-        {
-            vmaxq_f32(a.val[0], b.val[0]),
-            vmaxq_f32(a.val[1], b.val[1])
-        }
-    };
+    float32x4x2_t res = {{vmaxq_f32(a.val[0], b.val[0]), vmaxq_f32(a.val[1], b.val[1])}};
     return res;
 }
 #endif /* DOXYGEN_SKIP_THIS */
diff --git a/src/core/NEON/NEMath.inl b/src/core/NEON/NEMath.inl
index 1cbe669..f875917 100644
--- a/src/core/NEON/NEMath.inl
+++ b/src/core/NEON/NEMath.inl
@@ -29,19 +29,16 @@
 namespace arm_compute
 {
 /** Logarithm polynomial coefficients */
-const std::array<float32x4_t, 8> log_tab =
-{
-    {
-        vdupq_n_f32(-2.29561495781f),
-        vdupq_n_f32(-2.47071170807f),
-        vdupq_n_f32(-5.68692588806f),
-        vdupq_n_f32(-0.165253549814f),
-        vdupq_n_f32(5.17591238022f),
-        vdupq_n_f32(0.844007015228f),
-        vdupq_n_f32(4.58445882797f),
-        vdupq_n_f32(0.0141278216615f),
-    }
-};
+const std::array<float32x4_t, 8> log_tab = {{
+    vdupq_n_f32(-2.29561495781f),
+    vdupq_n_f32(-2.47071170807f),
+    vdupq_n_f32(-5.68692588806f),
+    vdupq_n_f32(-0.165253549814f),
+    vdupq_n_f32(5.17591238022f),
+    vdupq_n_f32(0.844007015228f),
+    vdupq_n_f32(4.58445882797f),
+    vdupq_n_f32(0.0141278216615f),
+}};
 
 /** Sin polynomial coefficients */
 constexpr float te_sin_coeff2 = 0.166666666666f; // 1/(2*3)
@@ -54,7 +51,7 @@
 {
 #if __ARM_FEATURE_FMA
     return vfmaq_f32(a, b, c);
-#else // __ARM_FEATURE_FMA
+#else  // __ARM_FEATURE_FMA
     return vmlaq_f32(a, b, c);
 #endif // __ARM_FEATURE_FMA
 }
@@ -73,13 +70,14 @@
 {
 #ifdef __aarch64__
     return vrndnq_f32(val);
-#else // __aarch64__
+#else  // __aarch64__
     static const float32x4_t CONST_HALF_FLOAT = vdupq_n_f32(0.5f);
     static const float32x4_t CONST_1_FLOAT    = vdupq_n_f32(1.f);
     static const int32x4_t   CONST_1_INT      = vdupq_n_s32(1);
     const float32x4_t        floor_val        = vfloorq_f32(val);
     const float32x4_t        diff             = vsubq_f32(val, floor_val);
-    const float32x4_t        fp32_upper_limit = vreinterpretq_f32_u32(vdupq_n_u32(0x4B000000)); // 0x4B000000 = (23U + 127U) << 23U
+    const float32x4_t        fp32_upper_limit =
+        vreinterpretq_f32_u32(vdupq_n_u32(0x4B000000)); // 0x4B000000 = (23U + 127U) << 23U
 
     /*
     * 1. Select the floor value when (diff<0.5 || (diff==0.5 && floor_val%2==0)).
@@ -95,12 +93,13 @@
     *    Threshold upper limit with format |S|E(8bits)|   Fraction(23bits)     | = (23 + 127) << 23 (assuming positive sign). 127 is added because it is the exponent bias: a stored exponent of 127 encodes an unbiased exponent of 0.
     */
 
-    float32x4_t rounded_val = vbslq_f32(vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT),
-                                                  vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT),
-                                                            vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT),CONST_1_INT)))),
-                                        floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
+    float32x4_t rounded_val = vbslq_f32(
+        vorrq_u32(vcltq_f32(diff, CONST_HALF_FLOAT),
+                  vandq_u32(vceqq_f32(diff, CONST_HALF_FLOAT),
+                            vmvnq_u32(vtstq_s32(vandq_s32(vcvtq_s32_f32(floor_val), CONST_1_INT), CONST_1_INT)))),
+        floor_val, vaddq_f32(floor_val, CONST_1_FLOAT));
 
-    float32x4_t result      = vbslq_f32(vcgeq_f32(vabsq_f32(val), fp32_upper_limit), val, rounded_val);
+    float32x4_t result = vbslq_f32(vcgeq_f32(vabsq_f32(val), fp32_upper_limit), val, rounded_val);
 
     return result;
 #endif // __aarch64__
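
As a scalar sketch, the selection above implements round-to-nearest-even (rte is an illustrative name, not library code):

#include <cmath>

// Scalar sketch of the non-aarch64 round-to-nearest-even path above.
inline float rte(float val)
{
    const float fl   = std::floor(val);
    const float diff = val - fl;
    // Keep the floor when diff < 0.5, or when diff == 0.5 and floor is even.
    if (diff < 0.5f || (diff == 0.5f && ((long)fl % 2 == 0)))
    {
        return fl;
    }
    return fl + 1.0f;
}
// |val| >= 2^23 is already integral in float32, which is why the vector code
// returns such values unchanged via the fp32_upper_limit select.
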
@@ -118,8 +117,8 @@
 inline float32x4_t vinvsqrtq_f32(float32x4_t x)
 {
     float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
-    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-    sqrt_reciprocal             = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
 
     return sqrt_reciprocal;
 }
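
The two refinement lines above are Newton-Raphson steps on the vrsqrteq_f32 estimate; schematically:

// vrsqrtsq_f32(x * y, y) computes (3 - x * y * y) / 2, so each step is
//   y = y * (3 - x * y * y) / 2,
// roughly doubling the number of correct bits of the initial estimate.
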
@@ -152,8 +151,7 @@
     return res;
 }
 
-static const uint32_t exp_f32_coeff[] =
-{
+static const uint32_t exp_f32_coeff[] = {
     0x3f7ffff6, // x^1: 0x1.ffffecp-1f
     0x3efffedb, // x^2: 0x1.fffdb6p-2f
     0x3e2aaf33, // x^3: 0x1.555e66p-3f
@@ -169,10 +167,12 @@
     const auto c4 = vreinterpretq_f32_u32(vdupq_n_u32(exp_f32_coeff[3]));
     const auto c5 = vreinterpretq_f32_u32(vdupq_n_u32(exp_f32_coeff[4]));
 
-    const auto shift      = vreinterpretq_f32_u32(vdupq_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
-    const auto inv_ln2    = vreinterpretq_f32_u32(vdupq_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
-    const auto neg_ln2_hi = vreinterpretq_f32_u32(vdupq_n_u32(0xbf317200)); // -ln(2) from bits  -1 to -19: -0x1.62e400p-1f
-    const auto neg_ln2_lo = vreinterpretq_f32_u32(vdupq_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
+    const auto shift   = vreinterpretq_f32_u32(vdupq_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
+    const auto inv_ln2 = vreinterpretq_f32_u32(vdupq_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
+    const auto neg_ln2_hi =
+        vreinterpretq_f32_u32(vdupq_n_u32(0xbf317200)); // -ln(2) from bits  -1 to -19: -0x1.62e400p-1f
+    const auto neg_ln2_lo =
+        vreinterpretq_f32_u32(vdupq_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
 
     const auto inf       = vdupq_n_f32(std::numeric_limits<float>::infinity());
     const auto max_input = vdupq_n_f32(88.37f); // Approximately ln(2^127.5)
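
The constants above set up a Cody-Waite style range reduction for the vector exp; schematically:

//   k = round(x / ln(2))        -- computed with the 2^23 + 127 shift trick
//   r = x - k * ln(2)           -- ln(2) subtracted in two parts,
//                                  neg_ln2_hi then neg_ln2_lo, for accuracy
//   exp(x) = 2^k * p(r)         -- p(r) is the polynomial built from c1..c5
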
@@ -224,9 +224,9 @@
 #ifdef __aarch64__
 inline float32x4_t verfq_f32(float32x4_t x)
 {
-    static const float       erffdata[4] = { 0.278393f, 0.230389f, 0.000972f, 0.078108f };
+    static const float       erffdata[4] = {0.278393f, 0.230389f, 0.000972f, 0.078108f};
     static const float32x4_t coeffdata   = vld1q_f32(erffdata);
-    static const float32x4_t onev{ vdupq_n_f32(1.0f) };
+    static const float32x4_t onev{vdupq_n_f32(1.0f)};
 
     uint32x4_t selector = vcltzq_f32(x);
 
@@ -287,10 +287,12 @@
 
     float32x4_t x = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
     // x * (1 - x^2/3) if |x| < 5.e-3 or (exp2x - 1) / (exp2x + 1) otherwise
-    float32x4_t exp2x = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x));
-    float32x4_t num   = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x));
-    float32x4_t den   = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num));
-    float32x4_t tanh  = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den));
+    float32x4_t exp2x =
+        vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vexpq_f32(vmulq_f32(CONST_2, x)), vmulq_f32(x, x));
+    float32x4_t num =
+        vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vsubq_f32(exp2x, CONST_1), vmulq_f32(CONST_1_3, exp2x));
+    float32x4_t den = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vaddq_f32(exp2x, CONST_1), vsubq_f32(CONST_1, num));
+    float32x4_t tanh = vbslq_f32(vcgtq_f32(vabsq_f32(x), CONST_THR), vmulq_f32(num, vinvq_f32(den)), vmulq_f32(x, den));
     return tanh;
 }
 
@@ -456,30 +458,23 @@
 
 inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
 {
-    out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
-                                         vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
-    out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
-                                         vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
-    out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
-                                         vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
+    out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])), vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
+    out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])), vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
+    out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])), vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
 }
 
 inline void convert_float32x4x4_to_uint8x16(const float32x4x4_t &in, uint8x16_t &out)
 {
-    const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
-                                   vqmovn_u32(vcvtq_u32_f32(in.val[1])));
-    const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
-                                   vqmovn_u32(vcvtq_u32_f32(in.val[3])));
-    out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
+    const auto low  = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])), vqmovn_u32(vcvtq_u32_f32(in.val[1])));
+    const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])), vqmovn_u32(vcvtq_u32_f32(in.val[3])));
+    out             = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
 }
 
 inline void convert_float32x4x4_to_int8x16(const float32x4x4_t &in, int8x16_t &out)
 {
-    const auto low = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])),
-                                   vqmovn_s32(vcvtq_s32_f32(in.val[1])));
-    const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])),
-                                   vqmovn_s32(vcvtq_s32_f32(in.val[3])));
-    out = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
+    const auto low  = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[0])), vqmovn_s32(vcvtq_s32_f32(in.val[1])));
+    const auto high = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(in.val[2])), vqmovn_s32(vcvtq_s32_f32(in.val[3])));
+    out             = vcombine_s8(vqmovn_s16(low), vqmovn_s16(high));
 }
 
 template <>
@@ -552,8 +547,8 @@
 inline float16x8_t vinvsqrtq_f16(float16x8_t x)
 {
     float16x8_t sqrt_reciprocal = vrsqrteq_f16(x);
-    sqrt_reciprocal             = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
-    sqrt_reciprocal             = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal = vmulq_f16(vrsqrtsq_f16(vmulq_f16(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
     return sqrt_reciprocal;
 }
 
@@ -602,8 +597,8 @@
 inline float16x8_t vtanhq_f16(float16x8_t x)
 {
     // Split into high/low and use rational approximation on both parts exactly
-    const float16x8_t tanh = vcombine_f16(vtanh_rational_approx_f16(vget_low_f16(x)),
-                                          vtanh_rational_approx_f16(vget_high_f16(x)));
+    const float16x8_t tanh =
+        vcombine_f16(vtanh_rational_approx_f16(vget_low_f16(x)), vtanh_rational_approx_f16(vget_high_f16(x)));
 
     // tanh(x) == sign(x) to F16 precision for |x| >= 4.508, use sign after this
     const float16x8_t ONE      = vdupq_n_f16(1.0f);
diff --git a/src/core/NEON/NESymm.h b/src/core/NEON/NESymm.h
index e664457..ec246ef 100644
--- a/src/core/NEON/NESymm.h
+++ b/src/core/NEON/NESymm.h
@@ -25,7 +25,9 @@
 #define ARM_COMPUTE_NESYMM_H
 
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
 #include "src/core/NEON/NEMath.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
@@ -49,13 +51,10 @@
  * @return Quantized values
  */
 template <bool is_bounded_relu>
-int16x8_t finalize_quantization_int16(int32x4x2_t &in_s32,
-                                      int          result_fixedpoint_multiplier,
-                                      int32_t      result_shift,
-                                      int16x8_t    min_s16,
-                                      int16x8_t    max_s16)
+int16x8_t finalize_quantization_int16(
+    int32x4x2_t &in_s32, int result_fixedpoint_multiplier, int32_t result_shift, int16x8_t min_s16, int16x8_t max_s16)
 {
-    if(result_shift < 0)
+    if (result_shift < 0)
     {
         in_s32.val[0] = vmulq_n_s32(in_s32.val[0], (1 << -result_shift));
         in_s32.val[1] = vmulq_n_s32(in_s32.val[1], (1 << -result_shift));
@@ -76,7 +75,7 @@
     // Convert S32 to S16
     int16x8_t out_s16 = vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1]));
 
-    if(is_bounded_relu)
+    if (is_bounded_relu)
     {
         out_s16 = vmaxq_s16(out_s16, min_s16);
         out_s16 = vminq_s16(out_s16, max_s16);
@@ -98,13 +97,14 @@
  * @return Quantized values
  */
 template <bool is_bounded_relu>
-inline int16_t finalize_quantization_int16(int32_t in_value, int result_fixedpoint_multiplier,
-                                           int32_t result_shift, int16_t min_s16, int16_t max_s16)
+inline int16_t finalize_quantization_int16(
+    int32_t in_value, int result_fixedpoint_multiplier, int32_t result_shift, int16_t min_s16, int16_t max_s16)
 {
-    if(result_shift < 0)
+    if (result_shift < 0)
     {
-        const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) * static_cast<int64_t>(result_fixedpoint_multiplier);
-        in_value            = static_cast<int32_t>((in_64 + (1 << 30)) >> 31);
+        const int64_t in_64 = static_cast<int64_t>(in_value) * (1 << (-result_shift)) *
+                              static_cast<int64_t>(result_fixedpoint_multiplier);
+        in_value = static_cast<int32_t>((in_64 + (1 << 30)) >> 31);
     }
     else
     {
@@ -117,7 +117,7 @@
     // Bound the result
     int16_t out_s16 = static_cast<int16_t>(std::max<int32_t>(-32768, std::min<int32_t>(32767, in_value)));
 
-    if(is_bounded_relu)
+    if (is_bounded_relu)
     {
         out_s16 = static_cast<int16_t>(std::max(min_s16, std::min(max_s16, out_s16)));
     }
@@ -134,14 +134,9 @@
  */
 inline float32x4x2_t vdequantize_int16(const int16x8_t &qv, float scale)
 {
-    const float32x4_t   vscale = vdupq_n_f32(scale);
-    const float32x4x2_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)
-        }
-    };
+    const float32x4_t   vscale             = vdupq_n_f32(scale);
+    const float32x4x2_t vdequantized_input = {{vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv))), vscale),
+                                               vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv))), vscale)}};
     return vdequantized_input;
 }
 
@@ -156,18 +151,13 @@
 {
     const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
 
-    const int32x4x2_t rf =
-    {
-        {
+    const int32x4x2_t rf = {{
 #ifdef __aarch64__
-            vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
-            vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
+        vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
 #else  //__aarch64__
-            vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
-            vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
+        vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)), vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale))
 #endif //__aarch64__
-        }
-    };
+    }};
     return vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
 }
 
@@ -180,17 +170,14 @@
  */
 inline float32x4x4_t vdequantize(const int16x8x2_t &qv, const UniformQuantizationInfo &qi)
 {
-    const float         scale  = qi.scale;
-    const float32x4_t   vscale = vdupq_n_f32(scale);
-    const float32x4x4_t vdequantized_input =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
-            vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
-        }
-    };
+    const float         scale              = qi.scale;
+    const float32x4_t   vscale             = vdupq_n_f32(scale);
+    const float32x4x4_t vdequantized_input = {{
+        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[0]))), vscale),
+        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[0]))), vscale),
+        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(qv.val[1]))), vscale),
+        vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(qv.val[1]))), vscale),
+    }};
     return vdequantized_input;
 }
 
@@ -206,24 +193,20 @@
     const float scale = qi.scale;
     ARM_COMPUTE_ERROR_ON(scale == 0.f);
     const float32x4_t vinvscale = vdupq_n_f32(1.f / scale);
-    const int32x4x4_t rf =
-    {
-        {
+    const int32x4x4_t rf        = {{
 #ifdef __aarch64__
-            vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
-            vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
-            vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
-            vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+        vcvtnq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+        vcvtnq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+        vcvtnq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+        vcvtnq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
 #else  //__aarch64__
-            vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
-            vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
-            vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
-            vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
+        vcvtq_s32_f32(vmulq_f32(qv.val[0], vinvscale)),
+        vcvtq_s32_f32(vmulq_f32(qv.val[1], vinvscale)),
+        vcvtq_s32_f32(vmulq_f32(qv.val[2], vinvscale)),
+        vcvtq_s32_f32(vmulq_f32(qv.val[3], vinvscale)),
 #endif //__aarch64__
-        }
-    };
-    const qsymm16x8x2_t res =
-    {
+    }};
+    const qsymm16x8x2_t res = {
         vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1])),
         vcombine_s16(vqmovn_s32(rf.val[2]), vqmovn_s32(rf.val[3])),
     };
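
The symmetric 16-bit scheme above uses no zero-point offset; per value it is q = round(x / scale) and x ≈ q * scale. A scalar sketch (illustrative helpers; the rounding matches the aarch64 vcvtnq path):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Illustrative scalar equivalents of the symmetric 16-bit scheme above.
inline int16_t quantize_qsymm16_scalar(float x, float scale)
{
    const long q = std::lrint(x / scale); // ties-to-even, like vcvtnq_s32_f32
    return (int16_t)std::max(-32768L, std::min(32767L, q)); // saturate like vqmovn_s32
}

inline float dequantize_qsymm16_scalar(int16_t q, float scale)
{
    return (float)q * scale; // no zero-point offset in the symmetric scheme
}
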
diff --git a/src/core/NEON/SVEAsymm.h b/src/core/NEON/SVEAsymm.h
index eea2627..a448cde 100644
--- a/src/core/NEON/SVEAsymm.h
+++ b/src/core/NEON/SVEAsymm.h
@@ -26,6 +26,7 @@
 
 #if defined(ARM_COMPUTE_ENABLE_SVE2)
 #include "src/core/NEON/SVEMath.h"
+
 #include <arm_sve.h>
 
 namespace arm_compute
@@ -70,10 +71,18 @@
     const auto          voffset            = svdup_n_s32(offset);
     const auto          vscale             = svdup_n_f32(scale);
     const svfloat32x4_t vdequantized_input = svcreate4_f32(
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)), vscale),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)), vscale),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)), vscale),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)), vscale));
+        svmul_f32_z(pg,
+                    svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(qv))), voffset)),
+                    vscale),
+        svmul_f32_z(pg,
+                    svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(qv))), voffset)),
+                    vscale),
+        svmul_f32_z(pg,
+                    svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(qv))), voffset)),
+                    vscale),
+        svmul_f32_z(pg,
+                    svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(qv))), voffset)),
+                    vscale));
     return vdequantized_input;
 }
 
@@ -104,10 +113,10 @@
     const auto          voffset            = svdup_n_s32(offset);
     const auto          vscale             = svdup_n_f32(scale);
     const svfloat32x4_t vdequantized_input = svcreate4_f32(
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale));
+        svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(qv)), voffset)), vscale),
+        svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(qv)), voffset)), vscale),
+        svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(qv)), voffset)), vscale),
+        svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(qv)), voffset)), vscale));
 
     return vdequantized_input;
 }
@@ -135,11 +144,11 @@
  */
 inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, const svfloat32x4_t vscale)
 {
-    const svfloat32x4_t vdequantized_input = svcreate4_f32(
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3)));
+    const svfloat32x4_t vdequantized_input =
+        svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), svget4_f32(vscale, 0)),
+                      svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), svget4_f32(vscale, 1)),
+                      svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), svget4_f32(vscale, 2)),
+                      svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), svget4_f32(vscale, 3)));
 
     return vdequantized_input;
 }
@@ -153,12 +162,12 @@
  */
 inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint8_t &qv, float scale)
 {
-    const auto          vscale             = svdup_n_f32(scale);
-    const svfloat32x4_t vdequantized_input = svcreate4_f32(
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale));
+    const auto          vscale = svdup_n_f32(scale);
+    const svfloat32x4_t vdequantized_input =
+        svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlb_s16(qv))), vscale),
+                      svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlb_s16(qv))), vscale),
+                      svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svmovlt_s16(qv))), vscale),
+                      svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svmovlt_s16(qv))), vscale));
     return vdequantized_input;
 }
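
For reference, the svdequantize_z overloads above implement real = scale * (q - offset) per lane: svmovlb widens the even-indexed lanes and svmovlt the odd-indexed ones, which is why one 8-bit vector expands into the four 32-bit vectors of an svcreate4_f32. A scalar sketch of the per-lane arithmetic, with illustrative names that are not part of this patch:

    #include <cstdint>

    // Per-lane arithmetic behind the asymmetric dequantize helpers:
    // subtract the zero-point offset, then scale.
    inline float dequantize_qasymm8_scalar(uint8_t q, float scale, int32_t offset)
    {
        return scale * static_cast<float>(static_cast<int32_t>(q) - offset);
    }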
 
diff --git a/src/core/NEON/SVEMath.h b/src/core/NEON/SVEMath.h
index 5ada7ae..6d69b33 100644
--- a/src/core/NEON/SVEMath.h
+++ b/src/core/NEON/SVEMath.h
@@ -28,6 +28,7 @@
 #include "src/core/NEON/wrapper/intrinsics/svcvt.h"
 #include "src/core/NEON/wrapper/intrinsics/svdup_n.h"
 #include "src/core/NEON/wrapper/intrinsics/svreinterpret.h"
+
 #include <arm_sve.h>
 #include <array>
 
@@ -181,9 +182,12 @@
  * @return The converted integer vector
  */
 template <typename int_vec_type>
-int_vec_type convert_float_to_int(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3);
+int_vec_type convert_float_to_int(const svfloat32_t &in_0,
+                                  const svfloat32_t &in_1,
+                                  const svfloat32_t &in_2,
+                                  const svfloat32_t &in_3);
 
 } // namespace arm_compute
 #include "src/core/NEON/SVEMath.inl"
 #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
-#endif /* ARM_COMPUTE_SVEMATH_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_SVEMATH_H */
diff --git a/src/core/NEON/SVEMath.inl b/src/core/NEON/SVEMath.inl
index 8973d0b..b30125d 100644
--- a/src/core/NEON/SVEMath.inl
+++ b/src/core/NEON/SVEMath.inl
@@ -32,8 +32,16 @@
 
 namespace arm_compute
 {
-inline svfloat32_t svtaylor_poly_f32_z(svbool_t pg, svfloat32_t x, svfloat32_t coeff_1, svfloat32_t coeff_2, svfloat32_t coeff_3,
-                                       svfloat32_t coeff_4, svfloat32_t coeff_5, svfloat32_t coeff_6, svfloat32_t coeff_7, svfloat32_t coeff_8)
+inline svfloat32_t svtaylor_poly_f32_z(svbool_t    pg,
+                                       svfloat32_t x,
+                                       svfloat32_t coeff_1,
+                                       svfloat32_t coeff_2,
+                                       svfloat32_t coeff_3,
+                                       svfloat32_t coeff_4,
+                                       svfloat32_t coeff_5,
+                                       svfloat32_t coeff_6,
+                                       svfloat32_t coeff_7,
+                                       svfloat32_t coeff_8)
 {
     const auto A   = svmla_f32_z(pg, coeff_1, coeff_5, x);
     const auto B   = svmla_f32_z(pg, coeff_3, coeff_7, x);
@@ -45,8 +53,16 @@
     return res;
 }
 
-inline svfloat16_t svtaylor_poly_f16_z(svbool_t pg, svfloat16_t x, svfloat16_t coeff_1, svfloat16_t coeff_2, svfloat16_t coeff_3,
-                                       svfloat16_t coeff_4, svfloat16_t coeff_5, svfloat16_t coeff_6, svfloat16_t coeff_7, svfloat16_t coeff_8)
+inline svfloat16_t svtaylor_poly_f16_z(svbool_t    pg,
+                                       svfloat16_t x,
+                                       svfloat16_t coeff_1,
+                                       svfloat16_t coeff_2,
+                                       svfloat16_t coeff_3,
+                                       svfloat16_t coeff_4,
+                                       svfloat16_t coeff_5,
+                                       svfloat16_t coeff_6,
+                                       svfloat16_t coeff_7,
+                                       svfloat16_t coeff_8)
 {
     const auto A   = svmla_f16_z(pg, coeff_1, coeff_5, x);
     const auto B   = svmla_f16_z(pg, coeff_3, coeff_7, x);
@@ -90,15 +106,17 @@
     const auto c4 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[3]));
     const auto c5 = svreinterpret_f32_u32(svdup_n_u32(svexp_f32_coeff[4]));
 
-    const auto shift   = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f));  // 2^23 + 127 = 0x1.0000fep23f
-    const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b));  // 1 / ln(2) = 0x1.715476p+0f
-    const auto neg_ln2_hi  = svreinterpret_f32_u32(svdup_n_u32(0xbf317200));  // -ln(2) from bits  -1 to -19: -0x1.62e400p-1f
-    const auto neg_ln2_lo  = svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e));  // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
+    const auto shift   = svreinterpret_f32_u32(svdup_n_u32(0x4b00007f)); // 2^23 + 127 = 0x1.0000fep23f
+    const auto inv_ln2 = svreinterpret_f32_u32(svdup_n_u32(0x3fb8aa3b)); // 1 / ln(2) = 0x1.715476p+0f
+    const auto neg_ln2_hi =
+        svreinterpret_f32_u32(svdup_n_u32(0xbf317200)); // -ln(2) from bits  -1 to -19: -0x1.62e400p-1f
+    const auto neg_ln2_lo =
+        svreinterpret_f32_u32(svdup_n_u32(0xb5bfbe8e)); // -ln(2) from bits -20 to -42: -0x1.7f7d1cp-20f
 
     const auto inf       = svdup_n_f32(std::numeric_limits<float>::infinity());
-    const auto max_input = svdup_n_f32(88.37f);  // Approximately ln(2^127.5)
+    const auto max_input = svdup_n_f32(88.37f); // Approximately ln(2^127.5)
     const auto zero      = svdup_n_f32(0.f);
-    const auto min_input = svdup_n_f32(-86.64f);  // Approximately ln(2^-125)
+    const auto min_input = svdup_n_f32(-86.64f); // Approximately ln(2^-125)
 
     // Range reduction:
     //   e^x = 2^n * e^r
@@ -114,23 +132,23 @@
     //     (i.e. n) because the decimal part has been pushed out and lost.
     //   * The addition of 127 makes the FP32 fraction part of z ready to be used as the exponent
     //     in FP32 format. Left shifting z by 23 bits will result in 2^n.
-    const auto z = svmla_f32_z(pg, shift, x, inv_ln2);
-    const auto n = svsub_f32_z(pg, z, shift);
-    const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23));  // 2^n
+    const auto z     = svmla_f32_z(pg, shift, x, inv_ln2);
+    const auto n     = svsub_f32_z(pg, z, shift);
+    const auto scale = svreinterpret_f32_u32(svlsl_n_u32_z(pg, svreinterpret_u32_f32(z), 23)); // 2^n
 
     // The calculation of n * ln(2) is done using 2 steps to achieve accuracy beyond FP32.
     // This outperforms longer Taylor series (3-4 taps) both in terms of accuracy and performance.
     const auto r_hi = svmla_f32_z(pg, x, n, neg_ln2_hi);
-    const auto r = svmla_f32_z(pg, r_hi, n, neg_ln2_lo);
+    const auto r    = svmla_f32_z(pg, r_hi, n, neg_ln2_lo);
 
     // Compute the truncated Taylor series of e^r.
     //   poly = scale * (1 + c1 * r + c2 * r^2 + c3 * r^3 + c4 * r^4 + c5 * r^5)
     const auto r2 = svmul_f32_z(pg, r, r);
 
-    const auto p1 = svmul_f32_z(pg, c1, r);
-    const auto p23 = svmla_f32_z(pg, c2, c3, r);
-    const auto p45 = svmla_f32_z(pg, c4, c5, r);
-    const auto p2345 = svmla_f32_z(pg, p23, p45, r2);
+    const auto p1     = svmul_f32_z(pg, c1, r);
+    const auto p23    = svmla_f32_z(pg, c2, c3, r);
+    const auto p45    = svmla_f32_z(pg, c4, c5, r);
+    const auto p2345  = svmla_f32_z(pg, p23, p45, r2);
     const auto p12345 = svmla_f32_z(pg, p1, p2345, r2);
 
     auto poly = svmla_f32_z(pg, scale, p12345, scale);
@@ -213,7 +231,8 @@
     auto val = svreinterpret_f32_s32(svsub_s32_z(pg, svreinterpret_s32_f32(x), svlsl_n_s32_z(pg, m, 23)));
 
     // Polynomial Approximation
-    auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6, log_tab_7, log_tab_8);
+    auto poly = svtaylor_poly_f32_z(pg, val, log_tab_1, log_tab_2, log_tab_3, log_tab_4, log_tab_5, log_tab_6,
+                                    log_tab_7, log_tab_8);
 
     // Reconstruct
     poly = svmla_f32_z(pg, poly, svcvt_f32_s32_z(pg, m), CONST_LN2);
@@ -259,7 +278,8 @@
     // Find positive or negative
     const auto c_v    = svabs_z(pg, wrapper::svcvt_z<int32_t>(pg, svmul_z(pg, val, ipi_v)));
     const auto sign_v = svcmple(pg, val, wrapper::svdup_n(ScalarType(0)));
-    const auto odd_v  = svcmpne(pg, svand_z(pg, wrapper::svreinterpret<IntType>(c_v), wrapper::svdup_n(IntType(1))), wrapper::svdup_n(IntType(0)));
+    const auto odd_v  = svcmpne(pg, svand_z(pg, wrapper::svreinterpret<IntType>(c_v), wrapper::svdup_n(IntType(1))),
+                                wrapper::svdup_n(IntType(0)));
 
     auto neg_v = sveor_z(pg, odd_v, sign_v);
 
@@ -347,7 +367,10 @@
 
 #if defined(ARM_COMPUTE_ENABLE_SVE2)
 template <>
-inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3)
+inline svuint8_t convert_float_to_int<svuint8_t>(const svfloat32_t &in_0,
+                                                 const svfloat32_t &in_1,
+                                                 const svfloat32_t &in_2,
+                                                 const svfloat32_t &in_3)
 {
     svuint8_t  out;
     const auto all_true_pg = svptrue_b32();
@@ -381,7 +404,10 @@
 }
 
 template <>
-inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0, const svfloat32_t &in_1, const svfloat32_t &in_2, const svfloat32_t &in_3)
+inline svint8_t convert_float_to_int<svint8_t>(const svfloat32_t &in_0,
+                                               const svfloat32_t &in_1,
+                                               const svfloat32_t &in_2,
+                                               const svfloat32_t &in_3)
 {
     svint8_t   out;
     const auto all_true_pg = svptrue_b32();
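
The comments inside svexp_f32_z above describe the standard range reduction e^x = 2^n * e^r with n = round(x / ln(2)). A scalar sketch of those steps, reusing the kernel's constants, with std::exp(r) standing in for the degree-5 polynomial and the [min_input, max_input] boundary handling omitted (illustrative only, not part of this patch):

    #include <cmath>
    #include <cstdint>
    #include <cstring>

    inline float exp_range_reduced(float x)
    {
        const float shift      = 0x1.0000fep23f;   // 2^23 + 127: pushes out the fraction, pre-biases the exponent
        const float inv_ln2    = 0x1.715476p+0f;   // 1 / ln(2)
        const float neg_ln2_hi = -0x1.62e400p-1f;  // -ln(2), bits  -1 to -19
        const float neg_ln2_lo = -0x1.7f7d1cp-20f; // -ln(2), bits -20 to -42

        const float z = std::fma(x, inv_ln2, shift); // n + 127 now sits in the mantissa of z
        const float n = z - shift;

        uint32_t zbits;
        std::memcpy(&zbits, &z, sizeof(zbits));
        zbits <<= 23; // move the biased exponent into place: scale = 2^n
        float scale;
        std::memcpy(&scale, &zbits, sizeof(scale));

        // Two-step n * ln(2) keeps precision beyond a single FP32 multiply.
        const float r = std::fma(n, neg_ln2_lo, std::fma(n, neg_ln2_hi, x));

        return scale * std::exp(r); // the kernel replaces std::exp(r) with the Taylor polynomial
    }
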
diff --git a/src/core/NEON/SVESymm.h b/src/core/NEON/SVESymm.h
index 6808577..288d45d 100644
--- a/src/core/NEON/SVESymm.h
+++ b/src/core/NEON/SVESymm.h
@@ -28,6 +28,7 @@
 
 #if defined(ARM_COMPUTE_ENABLE_SVE2)
 #include "src/core/NEON/SVEMath.h"
+
 #include <arm_sve.h>
 
 namespace arm_compute
@@ -42,8 +43,10 @@
  */
 inline svfloat32x2_t svdequantize_qsymm16_z(svbool_t pg, const svint16_t &qv, float scale)
 {
-    const auto          vscale             = svdup_n_f32(scale);
-    const svfloat32x2_t vdequantized_input = svcreate2_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale), svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale));
+    const auto          vscale = svdup_n_f32(scale);
+    const svfloat32x2_t vdequantized_input =
+        svcreate2_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(qv)), vscale),
+                      svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(qv)), vscale));
     return vdequantized_input;
 }
 
@@ -76,13 +79,13 @@
  */
 inline svfloat32x4_t svdequantize_z(svbool_t pg, const svint16x2_t qv, const UniformQuantizationInfo &qi)
 {
-    const float         scale              = qi.scale;
-    const auto          vscale             = svdup_n_f32(scale);
-    const svfloat32x4_t vdequantized_input = svcreate4_f32(
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale),
-                                                 svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale));
+    const float         scale  = qi.scale;
+    const auto          vscale = svdup_n_f32(scale);
+    const svfloat32x4_t vdequantized_input =
+        svcreate4_f32(svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 0))), vscale),
+                      svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 0))), vscale),
+                      svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(svget2_s16(qv, 1))), vscale),
+                      svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(svget2_s16(qv, 1))), vscale));
     return vdequantized_input;
 }
 
@@ -112,4 +115,4 @@
 
 } // namespace arm_compute
 #endif /* defined(ARM_COMPUTE_ENABLE_SVE2) */
-#endif // ARM_COMPUTE_NESYMM_H
\ No newline at end of file
+#endif // ARM_COMPUTE_NESYMM_H
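
Unlike the asymmetric helpers in SVEAsymm.h, the QSYMM16 helpers above carry no zero-point, so dequantization is a single multiply, real = scale * q. Scalar equivalent (illustrative only):

    #include <cstdint>

    // Per-lane arithmetic behind svdequantize_qsymm16_z and the svint16x2_t overload.
    inline float dequantize_qsymm16_scalar(int16_t q, float scale)
    {
        return scale * static_cast<float>(q);
    }
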
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
index 108b199..deb8999 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp
@@ -28,17 +28,16 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
 #include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
 #include "src/core/NEON/kernels/batchnormalization/impl/list.h"
-#include "src/core/common/Registrars.h"
+#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <map>
 
@@ -52,8 +51,15 @@
     const CPUInfo &ci;
 };
 using BatchNormalizationSelectorPtr = std::add_pointer<bool(const BatchNormalizationSelectorData &data)>::type;
-using BatchNormalizationKernelPtr   = std::add_pointer<void(ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const ITensor *,
-                                                            float, ActivationLayerInfo &, const Window &)>::type;
+using BatchNormalizationKernelPtr   = std::add_pointer<void(ITensor *,
+                                                          ITensor *,
+                                                          const ITensor *,
+                                                          const ITensor *,
+                                                          const ITensor *,
+                                                          const ITensor *,
+                                                          float,
+                                                          ActivationLayerInfo &,
+                                                          const Window &)>::type;
 
 struct BatchNormalizationKernel
 {
@@ -62,41 +68,32 @@
     BatchNormalizationKernelPtr         ukernel;
 };
 
-static const BatchNormalizationKernel available_kernels[] =
-{
+static const BatchNormalizationKernel available_kernels[] = {
 #if defined(ARM_COMPUTE_ENABLE_SVE)
-    {
-        "sve_fp16_batch_normalization",
-        [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
-        REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)
-    },
-    {
-        "sve_fp32_batch_normalization",
-        [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
-        REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)
-    },
+    {"sve_fp16_batch_normalization",
+     [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16 && data.ci.has_sve(); },
+     REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)},
+    {"sve_fp32_batch_normalization",
+     [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32 && data.ci.has_sve(); },
+     REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)},
 #endif /* defined(ARM_COMPUTE_ENABLE_SVE) */
 #if defined(ARM_COMPUTE_ENABLE_NEON)
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    {
-        "neon_fp16_batch_normalization",
-        [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)
-    },
+    {"neon_fp16_batch_normalization",
+     [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)},
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-    {
-        "neon_fp32_batch_normalization",
-        [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)
-    },
+    {"neon_fp32_batch_normalization",
+     [](const BatchNormalizationSelectorData &data) { return data.dt == DataType::F32; },
+     REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)},
 #endif /* defined(ARM_COMPUTE_ENABLE_NEON) */
 };
 
 const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data)
 {
-    for(const auto &uk : available_kernels)
+    for (const auto &uk : available_kernels)
     {
-        if(uk.is_selected(data))
+        if (uk.is_selected(data))
         {
             return &uk;
         }
@@ -104,25 +101,31 @@
     return nullptr;
 }
 
-Status
-validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var,
-                   const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info)
+Status validate_arguments(const ITensorInfo  *input,
+                          const ITensorInfo  *output,
+                          const ITensorInfo  *mean,
+                          const ITensorInfo  *var,
+                          const ITensorInfo  *beta,
+                          const ITensorInfo  *gamma,
+                          float               epsilon,
+                          ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_UNUSED(epsilon);
 
-    const auto *uk = get_implementation(BatchNormalizationSelectorData{ input->data_type(), CPUInfo::get() });
+    const auto *uk = get_implementation(BatchNormalizationSelectorData{input->data_type(), CPUInfo::get()});
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
-    if(act_info.enabled())
+    if (act_info.enabled())
     {
         ActivationLayerInfo::ActivationFunction act = act_info.activation();
-        ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU
-                                    && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
-                                    && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+        ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::RELU &&
+                                    act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+                                    act !=
+                                        ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
         ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
     }
 
-    if(nullptr != output)
+    if (nullptr != output)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -131,17 +134,18 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
-    if(beta != nullptr)
+    if (beta != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
     }
-    if(gamma != nullptr)
+    if (gamma != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
     }
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(
+                                    input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));
 
     return Status{};
 }
@@ -169,10 +173,12 @@
     // Only compute denominator and constants once per feature map.
     int slice = -1;
 
-    const auto input_mean  = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_mean = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var  = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma =
+        (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta =
+        (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
 
     T mean        = static_cast<T>(0);
     T var         = static_cast<T>(0);
@@ -186,80 +192,83 @@
     auto       beta_vec        = wrapper::vdup_n(beta, ExactTagType{});
     auto       denominator_vec = wrapper::vdup_n(denominator, ExactTagType{});
     const auto epsilon_vec     = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
-    execute_window_loop(win_to_use, [&](const Coordinates & id)
-    {
-        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
-        if(slice != id.z())
+    execute_window_loop(
+        win_to_use,
+        [&](const Coordinates &id)
         {
-            mean     = input_mean[id.z()];
-            var      = input_var[id.z()];
-            mean_vec = wrapper::vdup_n(mean, ExactTagType{});
-            var_vec  = wrapper::vdup_n(var, ExactTagType{});
-            if(input_gamma != nullptr)
+            const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+
+            if (slice != id.z())
             {
-                gamma     = input_gamma[id.z()];
-                gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
-            }
-            if(input_beta != nullptr)
-            {
-                beta     = input_beta[id.z()];
-                beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+                mean     = input_mean[id.z()];
+                var      = input_var[id.z()];
+                mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+                var_vec  = wrapper::vdup_n(var, ExactTagType{});
+                if (input_gamma != nullptr)
+                {
+                    gamma     = input_gamma[id.z()];
+                    gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
+                }
+                if (input_beta != nullptr)
+                {
+                    beta     = input_beta[id.z()];
+                    beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+                }
+
+                // Calculate denominator
+                denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+                denominator     = wrapper::vgetlane(denominator_vec, 0);
+                slice           = id.z();
             }
 
-            // Calculate denominator
-            denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-            denominator     = wrapper::vgetlane(denominator_vec, 0);
-            slice           = id.z();
-        }
-
-        // Perform core calculations using vector operations
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            // Calculate x bar
-            const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
-            const auto x_bar     = wrapper::vmul(numerator, denominator_vec);
-            auto       res       = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
-            // Perform fused activation
-            if(fused_activation)
+            // Perform core calculations using vector operations
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                activation_functor(res);
+                // Calculate x bar
+                const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+                const auto x_bar     = wrapper::vmul(numerator, denominator_vec);
+                auto       res       = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+                // Perform fused activation
+                if (fused_activation)
+                {
+                    activation_functor(res);
+                }
+
+                // Store results
+                wrapper::vstore(output_ptr + x, res);
             }
 
-            // Store results
-            wrapper::vstore(output_ptr + x, res);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            const T numerator = input_ptr[x] - mean;
-            const T x_bar     = numerator * denominator;
-            T       res       = beta + x_bar * gamma;
-
-            // Perform fused activation
-            if(fused_activation)
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
             {
-                activation_functor(res);
-            }
+                const T numerator = input_ptr[x] - mean;
+                const T x_bar     = numerator * denominator;
+                T       res       = beta + x_bar * gamma;
 
-            // Store results
-            *(output_ptr + x) = res;
-        }
-    },
-    input, output);
+                // Perform fused activation
+                if (fused_activation)
+                {
+                    activation_functor(res);
+                }
+
+                // Store results
+                *(output_ptr + x) = res;
+            }
+        },
+        input, output);
 }
 
 void NEBatchNormalizationLayerKernel::configure_non_fused()
 {
-    switch(_input->info()->data_type())
+    switch (_input->info()->data_type())
     {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>;
+            _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false,
+                                                                               detail::dummy<float16_t, 8>>;
             break;
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F32:
@@ -274,23 +283,25 @@
 void NEBatchNormalizationLayerKernel::configure_fused()
 {
     // NCHW Fused Batched Normalization with activation functions : FP32
-    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
-    {
-        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>> },
-        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>> },
-        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>> }
-    };
+    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw = {
+        {ActivationLayerInfo::ActivationFunction::RELU,
+         &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>>},
+        {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+         &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>>},
+        {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+         &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>>}};
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
     // NCHW Fused Batched Normalization with activation functions : FP16
-    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw =
-    {
-        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>> },
-        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>> },
-        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>> }
-    };
+    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw = {
+        {ActivationLayerInfo::ActivationFunction::RELU,
+         &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>>},
+        {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+         &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>>},
+        {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+         &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>>}};
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
-    switch(_input->info()->data_type())
+    switch (_input->info()->data_type())
     {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
@@ -307,22 +318,32 @@
 }
 
 NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info()
+    : _func(nullptr),
+      _input(nullptr),
+      _output(nullptr),
+      _mean(nullptr),
+      _var(nullptr),
+      _gamma(nullptr),
+      _beta(nullptr),
+      _epsilon(),
+      _act_info()
 {
 }
 
-void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
-                                                const ITensor *mean, const ITensor *var,
-                                                const ITensor *beta, const ITensor *gamma,
-                                                float epsilon, ActivationLayerInfo act_info)
+void NEBatchNormalizationLayerKernel::configure(ITensor            *input,
+                                                ITensor            *output,
+                                                const ITensor      *mean,
+                                                const ITensor      *var,
+                                                const ITensor      *beta,
+                                                const ITensor      *gamma,
+                                                float               epsilon,
+                                                ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
-                                                  mean->info(), var->info(),
-                                                  (beta != nullptr) ? beta->info() : nullptr,
-                                                  (gamma != nullptr) ? gamma->info() : nullptr,
-                                                  epsilon, act_info));
+                                                  mean->info(), var->info(), (beta != nullptr) ? beta->info() : nullptr,
+                                                  (gamma != nullptr) ? gamma->info() : nullptr, epsilon, act_info));
 
     _input    = input;
     _output   = input;
@@ -334,16 +355,16 @@
     _act_info = act_info;
 
     const bool run_in_place = (output == nullptr) || (output == input);
-    if(!run_in_place)
+    if (!run_in_place)
     {
         _output = output;
     }
 
     // Configure activation function to run
     const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW;
-    if(is_nchw)
+    if (is_nchw)
     {
-        if(_act_info.enabled())
+        if (_act_info.enabled())
         {
             configure_fused();
         }
@@ -357,17 +378,21 @@
     Window win = calculate_max_window(*input->info(), Steps());
     INEKernel::configure(win);
 
-    if(output != nullptr)
+    if (output != nullptr)
     {
         // Output auto initialization if not yet initialized
         auto_init_if_empty(*output->info(), *input->info()->clone());
     }
 }
 
-Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                                 const ITensorInfo *mean, const ITensorInfo *var,
-                                                 const ITensorInfo *beta, const ITensorInfo *gamma,
-                                                 float epsilon, ActivationLayerInfo act_info)
+Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo  *input,
+                                                 const ITensorInfo  *output,
+                                                 const ITensorInfo  *mean,
+                                                 const ITensorInfo  *var,
+                                                 const ITensorInfo  *beta,
+                                                 const ITensorInfo  *gamma,
+                                                 float               epsilon,
+                                                 ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));
 
@@ -382,13 +407,14 @@
     ARM_COMPUTE_ERROR_ON(_func == nullptr && _input->info()->data_layout() == DataLayout::NCHW);
 
     const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW;
-    if(is_nchw)
+    if (is_nchw)
     {
         (this->*_func)(window);
     }
     else
     {
-        const auto *uk = get_implementation(BatchNormalizationSelectorData{ _input->info()->data_type(), CPUInfo::get() });
+        const auto *uk =
+            get_implementation(BatchNormalizationSelectorData{_input->info()->data_type(), CPUInfo::get()});
         uk->ukernel(_input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info, window);
     }
 }
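
Per element, the NCHW path reindented above computes the textbook batch normalization out = gamma * (x - mean) / sqrt(var + epsilon) + beta, optionally followed by a fused RELU-family activation. A scalar sketch mirroring the kernel's vinvsqrt/vmla sequence (illustrative only):

    #include <cmath>

    inline float batch_normalize_scalar(float x, float mean, float var,
                                        float gamma, float beta, float epsilon)
    {
        const float denominator = 1.f / std::sqrt(var + epsilon); // wrapper::vinvsqrt(var + eps)
        const float x_bar       = (x - mean) * denominator;
        return beta + x_bar * gamma; // wrapper::vmla(beta_vec, x_bar, gamma_vec)
    }
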
diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
index 0551ace..2e8ff0d 100644
--- a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
@@ -68,7 +69,13 @@
      * @param[in]      epsilon  (Optional) Small value to avoid division with zero. Default value is 0.001f.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
      */
-    void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta = nullptr, const ITensor *gamma = nullptr, float epsilon = 0.001f,
+    void configure(ITensor            *input,
+                   ITensor            *output,
+                   const ITensor      *mean,
+                   const ITensor      *var,
+                   const ITensor      *beta     = nullptr,
+                   const ITensor      *gamma    = nullptr,
+                   float               epsilon  = 0.001f,
                    ActivationLayerInfo act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref NEBatchNormalizationLayerKernel
      *
@@ -85,10 +92,14 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const ITensorInfo *mean, const ITensorInfo *var,
-                           const ITensorInfo *beta = nullptr, const ITensorInfo *gamma = nullptr,
-                           float epsilon = 0.001f, ActivationLayerInfo act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo  *input,
+                           const ITensorInfo  *output,
+                           const ITensorInfo  *mean,
+                           const ITensorInfo  *var,
+                           const ITensorInfo  *beta     = nullptr,
+                           const ITensorInfo  *gamma    = nullptr,
+                           float               epsilon  = 0.001f,
+                           ActivationLayerInfo act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
index 83fb5f6..f299bb9 100644
--- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.cpp
@@ -27,8 +27,9 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -46,7 +47,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -54,7 +55,11 @@
 
     return Status{};
 }
-Status validate_arguments_static(const ITensorInfo *input, int block_shape_x, int block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status validate_arguments_static(const ITensorInfo *input,
+                                 int                block_shape_x,
+                                 int                block_shape_y,
+                                 const ITensorInfo *output,
+                                 const CropInfo    &crop_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
@@ -65,13 +70,14 @@
     const int        idx_batch   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
-        const TensorShape expected_output_shape = compute_batch_to_space_shape(input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
-        const TensorInfo  expected_output       = output->clone()->set_tensor_shape(expected_output_shape);
+        const TensorShape expected_output_shape = compute_batch_to_space_shape(
+            input->data_layout(), input->tensor_shape(), block_shape_x, block_shape_y, crop_info);
+        const TensorInfo expected_output = output->clone()->set_tensor_shape(expected_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output);
     }
 
@@ -80,7 +86,13 @@
 } // namespace
 
 NEBatchToSpaceLayerKernel::NEBatchToSpaceLayerKernel()
-    : _input(nullptr), _block_shape(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _block_shape_x(), _block_shape_y(), _crop_info()
+    : _input(nullptr),
+      _block_shape(nullptr),
+      _output(nullptr),
+      _data_layout(DataLayout::UNKNOWN),
+      _block_shape_x(),
+      _block_shape_y(),
+      _crop_info()
 {
 }
 
@@ -99,15 +111,18 @@
     ICPPKernel::configure(win);
 }
 
-void NEBatchToSpaceLayerKernel::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
+void NEBatchToSpaceLayerKernel::configure(
+    const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    const TensorShape output_shape = compute_batch_to_space_shape(input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
+    const TensorShape output_shape = compute_batch_to_space_shape(
+        input->info()->data_layout(), input->info()->tensor_shape(), block_shape_x, block_shape_y);
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments_static(input->info(), block_shape_x, block_shape_y, output->info(), crop_info));
 
     _input         = input;
     _output        = output;
@@ -121,14 +136,19 @@
     ICPPKernel::configure(win);
 }
 
-Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_shape, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, output));
     return Status{};
 }
 
-Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status NEBatchToSpaceLayerKernel::validate(const ITensorInfo *input,
+                                           int32_t            block_shape_x,
+                                           int32_t            block_shape_y,
+                                           const ITensorInfo *output,
+                                           const CropInfo    &crop_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, output, crop_info));
@@ -141,7 +161,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
 
-    if(_block_shape != nullptr)
+    if (_block_shape != nullptr)
     {
         // Retrieve the block shapes dynamically
         _block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
@@ -155,31 +175,32 @@
 
     int batch_id = 0;
     // Main loop for NCHW and NHWC
-    if(_data_layout == DataLayout::NCHW)
+    if (_data_layout == DataLayout::NCHW)
     {
         do
         {
             Iterator out(_output, slice_out);
-            execute_window_loop(slice_out, [&](const Coordinates & id)
-            {
+            execute_window_loop(
+                slice_out,
+                [&](const Coordinates &id)
+                {
+                    const int x = id.x();
+                    const int y = id.y();
+                    const int z = id.z();
+                    // Translate x, y to uncropped version
+                    const int x_c = x + _crop_info.left;
+                    const int y_c = y + _crop_info.top;
 
-                const int x = id.x();
-                const int y = id.y();
-                const int z = id.z();
-                // Translate x, y to uncropped version
-                const int x_c = x + _crop_info.left;
-                const int y_c = y + _crop_info.top;
-
-                const int   in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
-                const int   in_x     = x_c / _block_shape_x;
-                const int   in_y     = y_c / _block_shape_y;
-                Coordinates input_coords{ in_x, in_y, z, in_batch };
-                memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
-            },
-            out);
+                    const int in_batch =
+                        batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
+                    const int   in_x = x_c / _block_shape_x;
+                    const int   in_y = y_c / _block_shape_y;
+                    Coordinates input_coords{in_x, in_y, z, in_batch};
+                    memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+                },
+                out);
             ++batch_id;
-        }
-        while(window.slide_window_slice_3D(slice_out));
+        } while (window.slide_window_slice_3D(slice_out));
     }
     else
     {
@@ -188,26 +209,28 @@
         do
         {
             Iterator out(_output, slice_out);
-            execute_window_loop(slice_out, [&](const Coordinates & id)
-            {
+            execute_window_loop(
+                slice_out,
+                [&](const Coordinates &id)
+                {
+                    const int x = id.y();
+                    const int y = id.z();
 
-                const int x = id.y();
-                const int y = id.z();
+                    // Translate x, y to uncropped version
+                    const int x_c = x + _crop_info.left;
+                    const int y_c = y + _crop_info.top;
 
-                // Translate x, y to uncropped version
-                const int x_c = x + _crop_info.left;
-                const int y_c = y + _crop_info.top;
-
-                const int   in_batch = batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
-                const int   in_x     = x_c / _block_shape_x;
-                const int   in_y     = y_c / _block_shape_y;
-                Coordinates input_coords{ 0, in_x, in_y, in_batch };
-                memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size * _input->info()->dimension(0));
-            },
-            out);
+                    const int in_batch =
+                        batch_id + ((x_c % _block_shape_x) + (y_c % _block_shape_y) * _block_shape_x) * batch_size;
+                    const int   in_x = x_c / _block_shape_x;
+                    const int   in_y = y_c / _block_shape_y;
+                    Coordinates input_coords{0, in_x, in_y, in_batch};
+                    memcpy(out.ptr(), _input->ptr_to_element(input_coords),
+                           element_size * _input->info()->dimension(0));
+                },
+                out);
             ++batch_id;
-        }
-        while(window.slide_window_slice_3D(slice_out));
+        } while (window.slide_window_slice_3D(slice_out));
     }
 }
 } // namespace arm_compute
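
The index arithmetic reflowed in run() above maps every output coordinate back to the input batch that holds its block phase; batch_size is computed earlier in run() (outside these hunks) and is taken here to be the number of output batches. A scalar sketch of the NCHW mapping, with illustrative names that are not part of this patch:

    // Source coordinates for output element (x, y, batch_id) in batch-to-space.
    struct SrcCoords
    {
        int x, y, batch;
    };

    inline SrcCoords batch_to_space_src(int x, int y, int batch_id, int block_x, int block_y,
                                        int crop_left, int crop_top, int batch_size)
    {
        const int x_c = x + crop_left; // translate to the uncropped coordinate
        const int y_c = y + crop_top;
        return {x_c / block_x, y_c / block_y,
                batch_id + ((x_c % block_x) + (y_c % block_y) * block_x) * batch_size};
    }
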
diff --git a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
index 5eceee0..d98ac62 100644
--- a/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
+++ b/src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NEBATCHTOSPACELAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
@@ -68,7 +69,11 @@
      * @param[out] output        Tensor output. Data types supported: same as @p input
      * @param[in]  crop_info     Specifies how the output shape is cropped after batch to space is performed
      */
-    void configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info = CropInfo{});
+    void configure(const ITensor  *input,
+                   int32_t         block_shape_x,
+                   int32_t         block_shape_y,
+                   ITensor        *output,
+                   const CropInfo &crop_info = CropInfo{});
     /** Static function to check if given info will lead to a valid configuration of @ref NEBatchToSpaceLayerKernel
      *
      * @param[in] input       Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -90,7 +95,11 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info = CropInfo{});
+    static Status validate(const ITensorInfo *input,
+                           int32_t            block_shape_x,
+                           int32_t            block_shape_y,
+                           const ITensorInfo *output,
+                           const CropInfo    &crop_info = CropInfo{});
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
index 677c5cd..a59bbd2 100644
--- a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp
@@ -27,9 +27,10 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 #include <cstdint>
@@ -55,8 +56,7 @@
 }
 } // namespace
 
-NEBitwiseAndKernel::NEBitwiseAndKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseAndKernel::NEBitwiseAndKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
 }
 
@@ -86,8 +86,7 @@
     Window                 win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
 
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+    update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
                               AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
                               output_access);
 
@@ -103,9 +102,7 @@
     Iterator input2(_input2, window);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr());
-    },
-    input1, input2, output);
+    execute_window_loop(
+        window, [&](const Coordinates &) { bitwise_and<uint8_t>(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+        input2, output);
 }
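
Each bitwise kernel wraps a 16-byte helper like bitwise_and<uint8_t> above. Its shape, spelled out with raw NEON intrinsics (AND shown; the OR, XOR and NOT kernels below differ only in the combining intrinsic), as a sketch rather than the kernel's exact body:

    #include <arm_neon.h>
    #include <cstdint>

    // Load 16 bytes from each input, AND them, store 16 bytes.
    inline void bitwise_and_16bytes(const uint8_t *in1, const uint8_t *in2, uint8_t *out)
    {
        const uint8x16_t val1 = vld1q_u8(in1);
        const uint8x16_t val2 = vld1q_u8(in2);
        vst1q_u8(out, vandq_u8(val1, val2));
    }
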
diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
index 19b1af6..ecd181a 100644
--- a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -50,8 +51,7 @@
 }
 } // namespace
 
-NEBitwiseNotKernel::NEBitwiseNotKernel()
-    : _input(nullptr), _output(nullptr)
+NEBitwiseNotKernel::NEBitwiseNotKernel() : _input(nullptr), _output(nullptr)
 {
 }
 
@@ -77,7 +77,8 @@
     // Configure kernel window
     Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration));
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
-    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access);
+    update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration),
+                              output_access);
 
     INEKernel::configure(win);
 }
@@ -90,9 +91,6 @@
     Iterator input(_input, window);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        bitwise_not_U8_U8(input.ptr(), output.ptr());
-    },
-    input, output);
+    execute_window_loop(
+        window, [&](const Coordinates &) { bitwise_not_U8_U8(input.ptr(), output.ptr()); }, input, output);
 }
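The constructor rewrites follow one rule: the initializer list joins the signature when the whole definition fits the column limit, and otherwise breaks one member per line (as NECropKernel's ten-member list does later in this patch). A plausible, assumed pairing of options:

    // Assumed options: ColumnLimit: 120, PackConstructorInitializers: NextLine.
    // Fits within 120 columns, so the initializers stay on the signature line:
    NEBitwiseNotKernel::NEBitwiseNotKernel() : _input(nullptr), _output(nullptr)
    {
    }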
diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
index 08094fb..4c90613 100644
--- a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -42,7 +43,8 @@
 
 namespace
 {
-inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+inline void
+bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
 {
     const uint8x16_t val1 = vld1q_u8(input1);
     const uint8x16_t val2 = vld1q_u8(input2);
@@ -51,8 +53,7 @@
 }
 } // namespace
 
-NEBitwiseOrKernel::NEBitwiseOrKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseOrKernel::NEBitwiseOrKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
 }
 
@@ -82,8 +83,7 @@
     Window                 win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
 
-    update_window_and_padding(win,
-                              AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+    update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
                               AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
                               output_access);
 
@@ -99,9 +99,7 @@
     Iterator input2(_input2, window);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
-    },
-    input1, input2, output);
+    execute_window_loop(
+        window, [&](const Coordinates &) { bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+        input2, output);
 }
diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
index fc5b38b..dbbed24 100644
--- a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
+++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -42,7 +43,8 @@
 
 namespace
 {
-inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
+inline void
+bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output)
 {
     const uint8x16_t val1 = vld1q_u8(input1);
     const uint8x16_t val2 = vld1q_u8(input2);
@@ -51,8 +53,7 @@
 }
 } // namespace
 
-NEBitwiseXorKernel::NEBitwiseXorKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr)
+NEBitwiseXorKernel::NEBitwiseXorKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr)
 {
 }
 
@@ -82,7 +83,8 @@
     AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
 
     update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
-                              AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access);
+                              AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+                              output_access);
 
     INEKernel::configure(win);
 }
@@ -96,9 +98,7 @@
     Iterator input2(_input2, window);
     Iterator output(_output, window);
 
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr());
-    },
-    input1, input2, output);
+    execute_window_loop(
+        window, [&](const Coordinates &) { bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); }, input1,
+        input2, output);
 }
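When even parameter wrapping cannot bring a declaration under the limit, the cheapest remaining break is after the return type, which is why bitwise_or_U8_U8_U8 and bitwise_xor_U8_U8_U8 now start with `inline void` on its own line. Illustrative shape only; the exact penalties (e.g. PenaltyReturnTypeOnItsOwnLine) are assumed to live in the undelivered configuration:

    // The one-line form exceeds 120 columns, so the break lands after the return type:
    inline void
    bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output);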
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
index 69bfd56..cb86983 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.cpp
@@ -27,8 +27,9 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/boundingboxtransform/list.h"
@@ -45,7 +46,11 @@
 };
 
 using BoundingBoxTransformSelctorPtr = std::add_pointer<bool(const BoundingBoxTransformSelectorData &data)>::type;
-using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)>::type;
+using BoundingBoxTransformUKernelPtr = std::add_pointer<void(const ITensor           *boxes,
+                                                             ITensor                 *pred_boxes,
+                                                             const ITensor           *deltas,
+                                                             BoundingBoxTransformInfo bbinfo,
+                                                             const Window            &window)>::type;
 
 struct BoundingBoxTransformKernel
 {
@@ -54,26 +59,19 @@
     BoundingBoxTransformUKernelPtr       ukernel;
 };
 
-static const BoundingBoxTransformKernel available_kernels[] =
-{
-    {
-        "fp32_neon_boundingboxtransform",
-        [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)
-    },
+static const BoundingBoxTransformKernel available_kernels[] = {
+    {"fp32_neon_boundingboxtransform",
+     [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F32; },
+     REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)},
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    {
-        "fp16_neon_boundingboxtransform",
-        [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::F16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)
-    },
+    {"fp16_neon_boundingboxtransform",
+     [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_boundingboxtransform)},
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #if defined(ARM_COMPUTE_ENABLE_NEON)
-    {
-        "qu16_neon_boundingboxtransform",
-        [](const BoundingBoxTransformSelectorData & data) { return data.dt == DataType::QASYMM16; },
-        REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)
-    },
+    {"qu16_neon_boundingboxtransform",
+     [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::QASYMM16; },
+     REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_boundingboxtransform)},
 #endif // defined(ARM_COMPUTE_ENABLE_NEON)
 };
 
@@ -85,9 +83,9 @@
  */
 const BoundingBoxTransformKernel *get_implementation(const BoundingBoxTransformSelectorData &data)
 {
-    for(const auto &uk : available_kernels)
+    for (const auto &uk : available_kernels)
     {
-        if(uk.is_selected(data))
+        if (uk.is_selected(data))
         {
             return &uk;
         }
@@ -95,7 +93,10 @@
     return nullptr;
 }
 
-Status validate_arguments(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status validate_arguments(const ITensorInfo              *boxes,
+                          const ITensorInfo              *pred_boxes,
+                          const ITensorInfo              *deltas,
+                          const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(boxes);
@@ -108,7 +109,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(boxes->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(info.scale() <= 0);
 
-    if(boxes->data_type() == DataType::QASYMM16)
+    if (boxes->data_type() == DataType::QASYMM16)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(deltas, 1, DataType::QASYMM8);
         const UniformQuantizationInfo deltas_qinfo = deltas->quantization_info().uniform();
@@ -120,12 +121,12 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes, deltas);
     }
 
-    if(pred_boxes->total_size() > 0)
+    if (pred_boxes->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(pred_boxes->tensor_shape(), deltas->tensor_shape());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(pred_boxes, deltas);
         ARM_COMPUTE_RETURN_ERROR_ON(pred_boxes->num_dimensions() > 2);
-        if(pred_boxes->data_type() == DataType::QASYMM16)
+        if (pred_boxes->data_type() == DataType::QASYMM16)
         {
             const UniformQuantizationInfo pred_qinfo = pred_boxes->quantization_info().uniform();
             ARM_COMPUTE_RETURN_ERROR_ON(pred_qinfo.scale != 0.125f);
@@ -142,13 +143,19 @@
 {
 }
 
-void NEBoundingBoxTransformKernel::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info)
+void NEBoundingBoxTransformKernel::configure(const ITensor                  *boxes,
+                                             ITensor                        *pred_boxes,
+                                             const ITensor                  *deltas,
+                                             const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(boxes, pred_boxes, deltas);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(boxes->info(), pred_boxes->info(), deltas->info(), info));
 
     // Configure kernel window
-    auto_init_if_empty(*pred_boxes->info(), deltas->info()->clone()->set_data_type(boxes->info()->data_type()).set_quantization_info(boxes->info()->quantization_info()));
+    auto_init_if_empty(*pred_boxes->info(), deltas->info()
+                                                ->clone()
+                                                ->set_data_type(boxes->info()->data_type())
+                                                .set_quantization_info(boxes->info()->quantization_info()));
 
     // Set instance variables
     _boxes      = boxes;
@@ -164,7 +171,10 @@
     INEKernel::configure(win);
 }
 
-Status NEBoundingBoxTransformKernel::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status NEBoundingBoxTransformKernel::validate(const ITensorInfo              *boxes,
+                                              const ITensorInfo              *pred_boxes,
+                                              const ITensorInfo              *deltas,
+                                              const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(boxes, pred_boxes, deltas, info));
     return Status{};
@@ -176,7 +186,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const auto *uk = get_implementation(BoundingBoxTransformSelectorData{ _boxes->info()->data_type() });
+    const auto *uk = get_implementation(BoundingBoxTransformSelectorData{_boxes->info()->data_type()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     uk->ukernel(_boxes, _pred_boxes, _deltas, _bbinfo, window);
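The available_kernels tables across these files are reflowed by the brace-list rules: no padding spaces inside braces, entries packed to the column limit, and one-statement selector lambdas kept inline. A condensed sketch under the assumed options:

    // Assumed options: Cpp11BracedListStyle: true, AllowShortLambdasOnASingleLine: All.
    static const BoundingBoxTransformKernel example_kernels[] = { // hypothetical table name
        {"fp32_neon_boundingboxtransform",
         [](const BoundingBoxTransformSelectorData &data) { return data.dt == DataType::F32; },
         REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_boundingboxtransform)},
    };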
diff --git a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
index def8278..3915994 100644
--- a/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
+++ b/src/core/NEON/kernels/NEBoundingBoxTransformKernel.h
@@ -63,7 +63,8 @@
      * @note Only single image prediction is supported. Height and Width (and scale) of the image will be contained in the BoundingBoxTransformInfo struct.
      *
      */
-    void configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
+    void
+    configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEBoundingBoxTransformKernel
      *
@@ -77,7 +78,10 @@
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info);
+    static Status validate(const ITensorInfo              *boxes,
+                           const ITensorInfo              *pred_boxes,
+                           const ITensorInfo              *deltas,
+                           const BoundingBoxTransformInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
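The column-aligned parameter lists introduced throughout this header (and its .cpp) suggest right-hugging pointers plus consecutive-declaration alignment, with wrapped parameters placed one per line. Sketch over a hypothetical signature:

    // Assumed options: PointerAlignment: Right, AlignConsecutiveDeclarations: true,
    // BinPackParameters: false. The names form one column; '*' and '&' hug the name.
    void example(const ITensor *input,  // 'example' is illustrative, not part of the API
                 ITensor       *output,
                 float          scale);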
diff --git a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
index 64da1f2..3b53b70 100644
--- a/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEChannelShuffleLayerKernel.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -44,15 +45,19 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NCHW, DataLayout::NHWC);
 
-    const unsigned int channels = input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
+    const unsigned int channels =
+        input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL));
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups < 2, "Channel shuffling with less than 2 groups would be inefficient");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups == channels, "Channel shuffling with same number of groups as number of channels would be inefficient");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        num_groups == channels,
+        "Channel shuffling with same number of groups as number of channels would be inefficient");
     ARM_COMPUTE_RETURN_ERROR_ON(num_groups > channels); // There cannot be more groups than channels
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0, "The number of channels must be a multiple of the number of groups");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((channels % num_groups) != 0,
+                                    "The number of channels must be a multiple of the number of groups");
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -72,20 +77,22 @@
 
     Iterator in(input, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Shuffle channel
-        const unsigned int curr_channel = id.x();
-        const unsigned int group_id     = curr_channel * rK;
-        const unsigned int r            = group_id * K;
-        const unsigned int channel_id   = curr_channel - r;
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
+            // Shuffle channel
+            const unsigned int curr_channel = id.x();
+            const unsigned int group_id     = curr_channel * rK;
+            const unsigned int r            = group_id * K;
+            const unsigned int channel_id   = curr_channel - r;
 
-        // Calculate output coordinates
-        Coordinates out_coords = id;
-        out_coords.set(Window::DimX, channel_id * num_groups + group_id);
-        std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords));
-    },
-    in);
+            // Calculate output coordinates
+            Coordinates out_coords = id;
+            out_coords.set(Window::DimX, channel_id * num_groups + group_id);
+            std::copy_n(in.ptr(), element_size, output->ptr_to_element(out_coords));
+        },
+        in);
 }
 void channel_shuffle_nchw(const ITensor *input, ITensor *output, unsigned int num_groups, const Window &window)
 {
@@ -107,34 +114,35 @@
 
     Iterator in(input, win);
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        // Shuffle channel
-        const unsigned int curr_channel = id.z();
-        const unsigned int group_id     = curr_channel * rK;
-        const unsigned int r            = group_id * K;
-        const unsigned int channel_id   = curr_channel - r;
-
-        // Calculate output coordinates
-        Coordinates out_coords = id;
-        out_coords.set(Window::DimZ, channel_id * num_groups + group_id);
-        const uint8_t *input_ptr  = in.ptr();
-        uint8_t       *output_ptr = output->ptr_to_element(out_coords);
-
-        // Copy plane
-        for(unsigned int y = 0; y < height; ++y)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            std::copy_n(input_ptr, row_size, output_ptr);
-            input_ptr += input_stride_y;
-            output_ptr += output_stride_y;
-        }
-    },
-    in);
+            // Shuffle channel
+            const unsigned int curr_channel = id.z();
+            const unsigned int group_id     = curr_channel * rK;
+            const unsigned int r            = group_id * K;
+            const unsigned int channel_id   = curr_channel - r;
+
+            // Calculate output coordinates
+            Coordinates out_coords = id;
+            out_coords.set(Window::DimZ, channel_id * num_groups + group_id);
+            const uint8_t *input_ptr  = in.ptr();
+            uint8_t       *output_ptr = output->ptr_to_element(out_coords);
+
+            // Copy plane
+            for (unsigned int y = 0; y < height; ++y)
+            {
+                std::copy_n(input_ptr, row_size, output_ptr);
+                input_ptr += input_stride_y;
+                output_ptr += output_stride_y;
+            }
+        },
+        in);
 }
 } // namespace
 
-NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel()
-    : _input(nullptr), _output(nullptr), _num_groups()
+NEChannelShuffleLayerKernel::NEChannelShuffleLayerKernel() : _input(nullptr), _output(nullptr), _num_groups()
 {
 }
 
@@ -158,7 +166,8 @@
     INEKernel::configure(win);
 }
 
-Status NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
+Status
+NEChannelShuffleLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int num_groups)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, num_groups));
     return Status{};
@@ -170,7 +179,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    switch(_input->info()->data_layout())
+    switch (_input->info()->data_layout())
     {
         case DataLayout::NHWC:
             channel_shuffle_nhwc(_input, _output, _num_groups, window);
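Lambdas too long to stay inline get the second pattern, visible in channel_shuffle_nhwc/nchw above: the call breaks after execute_window_loop(, and the lambda signature, braces, and body indent one step past the call. Condensed shape, body elided:

    execute_window_loop(
        win,
        [&](const Coordinates &id)
        {
            // per-element work goes here, one level past the lambda introducer
        },
        in);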
diff --git a/src/core/NEON/kernels/NECol2ImKernel.h b/src/core/NEON/kernels/NECol2ImKernel.h
index 1976302..bc6652f 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.h
+++ b/src/core/NEON/kernels/NECol2ImKernel.h
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_NECOL2IMKERNEL_H
 #define ARM_COMPUTE_NECOL2IMKERNEL_H
 
-#include "src/core/NEON/INEKernel.h"
-
 #include "arm_compute/core/Size2D.h"
 
+#include "src/core/NEON/INEKernel.h"
+
 namespace arm_compute
 {
 class ITensor;
diff --git a/src/core/NEON/kernels/NECropKernel.cpp b/src/core/NEON/kernels/NECropKernel.cpp
index 94c4553..60271fb 100644
--- a/src/core/NEON/kernels/NECropKernel.cpp
+++ b/src/core/NEON/kernels/NECropKernel.cpp
@@ -26,14 +26,15 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Window.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/core/utils/helpers/bit_ops.h"
 #include "src/cpu/kernels/crop/list.h"
 
@@ -47,7 +48,8 @@
 };
 
 using CropSelectorPtr = std::add_pointer<bool(const CropSelectorData &data)>::type;
-using CropUKernelPtr  = std::add_pointer<void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type;
+using CropUKernelPtr  = std::add_pointer<void(
+    const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool)>::type;
 
 struct CropUKernel
 {
@@ -56,48 +58,23 @@
     CropUKernelPtr        ukernel;
 };
 
-static const CropUKernel available_kernels[] =
-{
-    {
-        "fp16_neon_crop",
-        [](const CropSelectorData & data) { return data.dt == DataType::F16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)
-    },
-    {
-        "f32_neon_crop",
-        [](const CropSelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)
-    },
-    {
-        "u8_neon_crop",
-        [](const CropSelectorData & data) { return data.dt == DataType::U8; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)
-    },
-    {
-        "u16_neon_crop",
-        [](const CropSelectorData & data) { return data.dt == DataType::U16; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)
-    },
-    {
-        "u32_neon_crop",
-        [](const CropSelectorData & data) { return data.dt == DataType::U32; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)
-    },
-    {
-        "s8_neon_crop",
-        [](const CropSelectorData & data) { return data.dt == DataType::S8; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)
-    },
-    {
-        "s16_neon_crop",
-        [](const CropSelectorData & data) { return data.dt == DataType::S16; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)
-    },
-    {
-        "s32_neon_crop",
-        [](const CropSelectorData & data) { return data.dt == DataType::S32; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)
-    },
+static const CropUKernel available_kernels[] = {
+    {"fp16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::fp16_in_bounds_crop_window)},
+    {"f32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::F32; },
+     REGISTER_FP32_NEON(arm_compute::cpu::fp32_in_bounds_crop_window)},
+    {"u8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U8; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::u8_in_bounds_crop_window)},
+    {"u16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U16; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::u16_in_bounds_crop_window)},
+    {"u32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::U32; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::u32_in_bounds_crop_window)},
+    {"s8_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S8; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::s8_in_bounds_crop_window)},
+    {"s16_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S16; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::s16_in_bounds_crop_window)},
+    {"s32_neon_crop", [](const CropSelectorData &data) { return data.dt == DataType::S32; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::s32_in_bounds_crop_window)},
 };
 
 /** Micro-kernel selector
@@ -108,9 +85,9 @@
  */
 const CropUKernel *get_implementation(const CropSelectorData &data)
 {
-    for(const auto &uk : available_kernels)
+    for (const auto &uk : available_kernels)
     {
-        if(uk.is_selected(data))
+        if (uk.is_selected(data))
         {
             return &uk;
         }
@@ -119,26 +96,40 @@
     return nullptr;
 }
 
-inline void out_of_bounds_crop_window(const ITensor *output, float *output_ptr, float extrapolation_value,
-                                      int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit)
+inline void out_of_bounds_crop_window(const ITensor *output,
+                                      float         *output_ptr,
+                                      float          extrapolation_value,
+                                      int32_t        window_step_x,
+                                      int32_t        output_width_start,
+                                      int32_t        output_width_limit)
 {
-    auto    in               = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
-    int32_t x                = 0;
-    int32_t limit            = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
-    float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
-    for(; x <= limit - window_step_x; x += window_step_x)
+    auto    in    = wrapper::vdup_n(extrapolation_value, wrapper::traits::vector_128_tag());
+    int32_t x     = 0;
+    int32_t limit = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+    float  *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
+    for (; x <= limit - window_step_x; x += window_step_x)
     {
         wrapper::vstore(output_start_ptr + x, in);
     }
-    for(; x < limit; ++x)
+    for (; x < limit; ++x)
     {
         *(output_start_ptr + x) = extrapolation_value;
     }
 }
 
-inline void execute_window(const ITensor *input, const ITensor *output, Coordinates input_offset, float extrapolation_value,
-                           const std::array<uint32_t, 2> &rows_out_of_bounds, const std::array<uint32_t, 2> &cols_out_of_bounds, NECropKernel::InBoundsCropFunction *in_bounds_crop_function,
-                           bool is_height_flipped, bool has_cols_in_bounds, bool has_cols_out_of_bounds_before, bool has_cols_out_of_bounds_after, bool input_has_single_channel, bool is_width_flipped)
+inline void execute_window(const ITensor                      *input,
+                           const ITensor                      *output,
+                           Coordinates                         input_offset,
+                           float                               extrapolation_value,
+                           const std::array<uint32_t, 2>      &rows_out_of_bounds,
+                           const std::array<uint32_t, 2>      &cols_out_of_bounds,
+                           NECropKernel::InBoundsCropFunction *in_bounds_crop_function,
+                           bool                                is_height_flipped,
+                           bool                                has_cols_in_bounds,
+                           bool                                has_cols_out_of_bounds_before,
+                           bool                                has_cols_out_of_bounds_after,
+                           bool                                input_has_single_channel,
+                           bool                                is_width_flipped)
 {
     // Output is always float.
     const int window_step_x = 16 / sizeof(float);
@@ -159,45 +150,66 @@
     //  |------------------------------|
     // Fill all output rows that have no elements that are within the input bounds with the extrapolation value.
     // First for the rows before the in bounds rows.
-    out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[0] * output->info()->dimension(1));
+    out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0,
+                              rows_out_of_bounds[0] * output->info()->dimension(1));
     output_ptr += rows_out_of_bounds[0] * output->info()->dimension(1) * output->info()->dimension(0);
     // Iterate through each row that has any elements within the input bounds.
-    for(uint32_t row = rows_out_of_bounds[0]; static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
-        ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
+    for (uint32_t row = rows_out_of_bounds[0];
+         static_cast<int32_t>(row) < static_cast<int32_t>(output->info()->dimension(2) - rows_out_of_bounds[1]);
+         ++row, is_height_flipped ? --input_offset[2] : ++input_offset[2])
     {
         // Fill all elements in the row that are out of bounds with the extrapolation value.
         // First for the elements before the in bounds elements.
-        if(has_cols_out_of_bounds_before)
+        if (has_cols_out_of_bounds_before)
         {
             out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, cols_out_of_bounds[0]);
         }
         // Copy all elements within the input bounds from the input tensor.
-        if(has_cols_in_bounds)
+        if (has_cols_in_bounds)
         {
             (*in_bounds_crop_function)(input, output, output_ptr, input_offset, window_step_x, cols_out_of_bounds[0],
-                                       output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel, is_width_flipped);
+                                       output->info()->dimension(1) - cols_out_of_bounds[1], input_has_single_channel,
+                                       is_width_flipped);
         }
         // Fill all elements after the in bounds elements with the extrapolation value.
-        if(has_cols_out_of_bounds_after)
+        if (has_cols_out_of_bounds_after)
         {
-            out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, output->info()->dimension(1) - cols_out_of_bounds[1], output->info()->dimension(1));
+            out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x,
+                                      output->info()->dimension(1) - cols_out_of_bounds[1],
+                                      output->info()->dimension(1));
         }
         output_ptr += output->info()->dimension(1) * output->info()->dimension(0);
     }
     // Fill all rows after the in bounds elements with the extrapolation value.
-    out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0, rows_out_of_bounds[1] * output->info()->dimension(1));
+    out_of_bounds_crop_window(output, output_ptr, extrapolation_value, window_step_x, 0,
+                              rows_out_of_bounds[1] * output->info()->dimension(1));
 }
 } // namespace
 
 NECropKernel::NECropKernel()
-    : _input(nullptr), _crop_boxes(nullptr), _box_ind(nullptr), _output(nullptr), _start(), _end(), _crop_box_ind(0), _extrapolation_value(0), _rows_out_of_bounds(), _cols_out_of_bounds()
+    : _input(nullptr),
+      _crop_boxes(nullptr),
+      _box_ind(nullptr),
+      _output(nullptr),
+      _start(),
+      _end(),
+      _crop_box_ind(0),
+      _extrapolation_value(0),
+      _rows_out_of_bounds(),
+      _cols_out_of_bounds()
 {
 }
 
-void NECropKernel::configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind, float extrapolation_value)
+void NECropKernel::configure(const ITensor *input,
+                             const ITensor *crop_boxes,
+                             const ITensor *box_ind,
+                             ITensor       *output,
+                             uint32_t       crop_box_ind,
+                             float          extrapolation_value)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(), crop_box_ind, extrapolation_value));
+    ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), crop_boxes->info(), box_ind->info(), output->info(),
+                                        crop_box_ind, extrapolation_value));
 
     _input               = input;
     _crop_boxes          = crop_boxes;
@@ -207,21 +219,27 @@
     _extrapolation_value = extrapolation_value;
 }
 
-Status NECropKernel::validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind, float extrapolation_value)
+Status NECropKernel::validate(const ITensorInfo *input,
+                              const ITensorInfo *crop_boxes,
+                              const ITensorInfo *box_ind,
+                              const ITensorInfo *output,
+                              uint32_t           crop_box_ind,
+                              float              extrapolation_value)
 {
     ARM_COMPUTE_UNUSED(extrapolation_value);
-    const auto *uk = get_implementation(CropSelectorData{ input->data_type() });
+    const auto *uk = get_implementation(CropSelectorData{input->data_type()});
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16, DataType::F16, DataType::U32, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::U16, DataType::S16,
+                                                         DataType::F16, DataType::U32, DataType::S32, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().num_dimensions() > 4);
     ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[0] != 4);
     ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
     ARM_COMPUTE_RETURN_ERROR_ON(crop_boxes->tensor_shape()[1] <= crop_box_ind);
     ARM_COMPUTE_RETURN_ERROR_ON(box_ind->tensor_shape()[0] <= crop_box_ind);
-    if(output->total_size() > 0)
+    if (output->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -242,48 +260,53 @@
     // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers.
     _start = Coordinates(std::floor(x0 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
                          std::floor(y0 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
-    _end = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
-                       std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
-    const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1, abs(_end[1] - _start[1]) + 1);
+    _end   = Coordinates(std::floor(x1 * (_input->info()->tensor_shape()[1] - 1) + 0.5f),
+                         std::floor(y1 * (_input->info()->tensor_shape()[2] - 1) + 0.5f));
+    const TensorShape out_shape(_input->info()->tensor_shape()[0], abs(_end[0] - _start[0]) + 1,
+                                abs(_end[1] - _start[1]) + 1);
     _output->info()->set_tensor_shape(out_shape);
 
     bool is_width_flipped  = _end[0] < _start[0];
     bool is_height_flipped = _end[1] < _start[1];
-    if(is_height_flipped)
+    if (is_height_flipped)
     {
-        _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1),
-                                                                                                            static_cast<uint32_t>(_output->info()->dimension(2))) :
-                                 0;
+        _rows_out_of_bounds[0] = _start[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+                                     ? std::min(static_cast<uint32_t>(_start[1] - _input->info()->dimension(2) + 1),
+                                                static_cast<uint32_t>(_output->info()->dimension(2)))
+                                     : 0;
         _rows_out_of_bounds[1] = _end[1] < 0 ? std::min(static_cast<uint32_t>(-_end[1]),
-                                                        static_cast<uint32_t>(_output->info()->dimension(2))) :
-                                 0;
+                                                        static_cast<uint32_t>(_output->info()->dimension(2)))
+                                             : 0;
     }
     else
     {
         _rows_out_of_bounds[0] = _start[1] < 0 ? std::min(static_cast<uint32_t>(-_start[1]),
-                                                          static_cast<uint32_t>(_output->info()->dimension(2))) :
-                                 0;
-        _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1),
-                                                                                                          static_cast<uint32_t>(_output->info()->dimension(2))) :
-                                 0;
+                                                          static_cast<uint32_t>(_output->info()->dimension(2)))
+                                               : 0;
+        _rows_out_of_bounds[1] = _end[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+                                     ? std::min(static_cast<uint32_t>(_end[1] - _input->info()->dimension(2) + 1),
+                                                static_cast<uint32_t>(_output->info()->dimension(2)))
+                                     : 0;
     }
-    if(is_width_flipped)
+    if (is_width_flipped)
     {
-        _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1),
-                                                                                                            static_cast<uint32_t>(_output->info()->dimension(1))) :
-                                 0;
+        _cols_out_of_bounds[0] = _start[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+                                     ? std::min(static_cast<uint32_t>(_start[0] - _input->info()->dimension(1) + 1),
+                                                static_cast<uint32_t>(_output->info()->dimension(1)))
+                                     : 0;
         _cols_out_of_bounds[1] = _end[0] < 0 ? std::min(static_cast<uint32_t>(-_end[0]),
-                                                        static_cast<uint32_t>(_output->info()->dimension(1))) :
-                                 0;
+                                                        static_cast<uint32_t>(_output->info()->dimension(1)))
+                                             : 0;
     }
     else
     {
         _cols_out_of_bounds[0] = _start[0] < 0 ? std::min(static_cast<uint32_t>(-_start[0]),
-                                                          static_cast<uint32_t>(_output->info()->dimension(1))) :
-                                 0;
-        _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1),
-                                                                                                          static_cast<uint32_t>(_output->info()->dimension(1))) :
-                                 0;
+                                                          static_cast<uint32_t>(_output->info()->dimension(1)))
+                                               : 0;
+        _cols_out_of_bounds[1] = _end[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+                                     ? std::min(static_cast<uint32_t>(_end[0] - _input->info()->dimension(1) + 1),
+                                                static_cast<uint32_t>(_output->info()->dimension(1)))
+                                     : 0;
     }
 
     INEKernel::configure(calculate_max_window(*_output->info()));
@@ -298,13 +321,18 @@
     ARM_COMPUTE_ERROR_ON(_input->info()->has_padding());
     ARM_COMPUTE_ERROR_ON(_output->info()->has_padding());
 
-    const auto *uk = get_implementation(CropSelectorData{ _input->info()->data_type() });
+    const auto *uk = get_implementation(CropSelectorData{_input->info()->data_type()});
 
     uint32_t    batch_index = *(reinterpret_cast<int32_t *>(_box_ind->ptr_to_element(Coordinates(_crop_box_ind))));
-    Coordinates input_offset(0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
-                             _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
-    execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds, uk->ukernel, _end[1] < _start[1],
-                   _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1), _cols_out_of_bounds[0] > 0, _cols_out_of_bounds[1] > 0,
+    Coordinates input_offset(
+        0, _end[0] < _start[0] ? _start[0] - _cols_out_of_bounds[0] : _start[0] + _cols_out_of_bounds[0],
+        _end[1] < _start[1] ? _start[1] - _rows_out_of_bounds[0] : _start[1] + _rows_out_of_bounds[0], batch_index);
+    execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds,
+                   uk->ukernel,
+                   _end[1] < _start[1],
+                   _cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1),
+                   _cols_out_of_bounds[0] > 0,
+                   _cols_out_of_bounds[1] > 0,
                    _start[0] <= _end[0], _end[0] < _start[0]);
 }
 } // namespace arm_compute
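A caution worth recording for NECropKernel::run(): in long boolean argument lists like the execute_window call above, clang-format can pair a `<` with a later `>` as template angle brackets and strip the spaces around the comparisons, producing unreadable (though still compiling) output. Parenthesizing each comparison removes the ambiguity — a defensive sketch, not something the configuration mandates:

    // Parentheses keep clang-format from reading '<' ... '>' as template brackets:
    execute_window(_input, _output, input_offset, _extrapolation_value, _rows_out_of_bounds, _cols_out_of_bounds,
                   uk->ukernel, (_end[1] < _start[1]),
                   (_cols_out_of_bounds[0] + _cols_out_of_bounds[1] < _output->info()->dimension(1)),
                   (_cols_out_of_bounds[0] > 0), (_cols_out_of_bounds[1] > 0),
                   (_start[0] <= _end[0]), (_end[0] < _start[0]));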
diff --git a/src/core/NEON/kernels/NECropKernel.h b/src/core/NEON/kernels/NECropKernel.h
index 6c989c1..da4a1b2 100644
--- a/src/core/NEON/kernels/NECropKernel.h
+++ b/src/core/NEON/kernels/NECropKernel.h
@@ -25,7 +25,7 @@
 #define ARM_COMPUTE_NEON_CROP_KERNEL_H
 
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
@@ -67,7 +67,12 @@
      * @param[in]  crop_box_ind        Index of the crop box to be used from @p crop_boxes. Default is 0.
      * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
      */
-    void configure(const ITensor *input, const ITensor *crop_boxes, const ITensor *box_ind, ITensor *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
+    void configure(const ITensor *input,
+                   const ITensor *crop_boxes,
+                   const ITensor *box_ind,
+                   ITensor       *output,
+                   uint32_t       crop_box_ind        = 0,
+                   float          extrapolation_value = 0);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NECropKernel
      *
@@ -82,7 +87,12 @@
      * @param[in] crop_box_ind        Index of the crop box to be used from @p crop_boxes. Default is 0.
      * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0.
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *crop_boxes, const ITensorInfo *box_ind, const ITensorInfo *output, uint32_t crop_box_ind = 0, float extrapolation_value = 0);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *crop_boxes,
+                           const ITensorInfo *box_ind,
+                           const ITensorInfo *output,
+                           uint32_t           crop_box_ind        = 0,
+                           float              extrapolation_value = 0);
 
     /** Configure output tensor's shape as this can only be determined at runtime. */
     void configure_output_shape();
@@ -91,7 +101,8 @@
     void run(const Window &window, const ThreadInfo &info) override;
 
     /** Function to use for in bounds crop for the particular tensor types passed to configure() */
-    using InBoundsCropFunction = void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool);
+    using InBoundsCropFunction =
+        void(const ITensor *, const ITensor *, float *, Coordinates, int32_t, int32_t, int32_t, bool, bool);
 
 private:
     const ITensor *_input;
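Sorting also collapses the duplicated Types.h include this header carried: when an include block is sorted, exact duplicates are merged, so no manual edit is needed (assumed to be the sorter's doing here rather than a hand fix):

    // Before: the same header appeared twice back to back.
    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/Types.h"
    // After SortIncludes, one copy remains, separated from the src/ group by a blank line.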
diff --git a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
index 6dcc85e..de0079e 100644
--- a/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDepthToSpaceLayerKernel.cpp
@@ -26,11 +26,12 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 #include <cstdint>
@@ -52,12 +53,14 @@
     const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] % (block_shape * block_shape) != 0);
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
         const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] != (block_shape * input->tensor_shape()[idx_width]));
-        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] != (block_shape * input->tensor_shape()[idx_height]));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] !=
+                                    (block_shape * input->tensor_shape()[idx_width]));
+        ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_height] !=
+                                    (block_shape * input->tensor_shape()[idx_height]));
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 4);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -74,7 +77,8 @@
 void NEDepthToSpaceLayerKernel::configure(const ITensor *input, ITensor *output, int32_t block_shape)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    TensorShape output_shape = compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
+    TensorShape output_shape =
+        compute_depth_to_space_shape(input->info()->tensor_shape(), input->info()->data_layout(), block_shape);
     // Output auto-initialization if not yet initialized
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
@@ -117,26 +121,27 @@
     slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
     // Main loop for NCHW and NHWC
-    if(_data_layout == DataLayout::NCHW)
+    if (_data_layout == DataLayout::NCHW)
     {
         Window slice_in = window.first_slice_window_2D();
         do
         {
             Iterator in(_input, slice_in);
-            execute_window_loop(slice_in, [&](const Coordinates & id)
-            {
-                const int x = id.x();
-                const int y = id.y();
+            execute_window_loop(
+                slice_in,
+                [&](const Coordinates &id)
+                {
+                    const int x = id.x();
+                    const int y = id.y();
 
-                const int   z     = id.z() % r;
-                const int   out_x = x * _block_shape + (id.z() / r) % _block_shape;
-                const int   out_y = y * _block_shape + (id.z() / r) / _block_shape;
-                Coordinates output_coords{ out_x, out_y, z, id[3] };
-                memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
-            },
-            in);
-        }
-        while(window.slide_window_slice_2D(slice_in));
+                    const int   z     = id.z() % r;
+                    const int   out_x = x * _block_shape + (id.z() / r) % _block_shape;
+                    const int   out_y = y * _block_shape + (id.z() / r) / _block_shape;
+                    Coordinates output_coords{out_x, out_y, z, id[3]};
+                    memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+                },
+                in);
+        } while (window.slide_window_slice_2D(slice_in));
     }
     else
     {
@@ -144,20 +149,21 @@
         do
         {
             Iterator in(_input, slice_in);
-            execute_window_loop(slice_in, [&](const Coordinates & id)
-            {
-                const int x = id.y();
-                const int y = id.z();
+            execute_window_loop(
+                slice_in,
+                [&](const Coordinates &id)
+                {
+                    const int x = id.y();
+                    const int y = id.z();
 
-                const int   z     = id.x() % r;
-                const int   out_x = x * _block_shape + (id.x() / r) % _block_shape;
-                const int   out_y = y * _block_shape + (id.x() / r) / _block_shape;
-                Coordinates output_coords{ z, out_x, out_y, id[3] };
-                memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
-            },
-            in);
-        }
-        while(window.slide_window_slice_3D(slice_in));
+                    const int   z     = id.x() % r;
+                    const int   out_x = x * _block_shape + (id.x() / r) % _block_shape;
+                    const int   out_y = y * _block_shape + (id.x() / r) / _block_shape;
+                    Coordinates output_coords{z, out_x, out_y, id[3]};
+                    memcpy(_output->ptr_to_element(output_coords), in.ptr(), element_size);
+                },
+                in);
+        } while (window.slide_window_slice_3D(slice_in));
     }
 }
 } // namespace arm_compute
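The NCHW/NHWC slice loops above also show the do/while reflow applied across the patch: the closing brace and its condition now share a line. Minimal shape:

    do
    {
        // process one slice of the window (cf. slice_in above)
    } while (window.slide_window_slice_2D(slice_in));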
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
index 261437f..a5969cd 100644
--- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -37,16 +38,19 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status validate_arguments(const ITensorInfo               *input,
+                          const ITensorInfo               *output,
+                          const ITensorInfo               *idx,
+                          const FFTDigitReverseKernelInfo &config)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
-    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[config.axis] != idx->tensor_shape().x());
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 2);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -56,7 +60,10 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo                     *input,
+                                                        ITensorInfo                     *output,
+                                                        ITensorInfo                     *idx,
+                                                        const FFTDigitReverseKernelInfo &config)
 {
     ARM_COMPUTE_UNUSED(idx, config);
 
@@ -68,12 +75,14 @@
 }
 } // namespace
 
-NEFFTDigitReverseKernel::NEFFTDigitReverseKernel()
-    : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr)
+NEFFTDigitReverseKernel::NEFFTDigitReverseKernel() : _func(nullptr), _input(nullptr), _output(nullptr), _idx(nullptr)
 {
 }
 
-void NEFFTDigitReverseKernel::configure(const ITensor *input, ITensor *output, const ITensor *idx, const FFTDigitReverseKernelInfo &config)
+void NEFFTDigitReverseKernel::configure(const ITensor                   *input,
+                                        ITensor                         *output,
+                                        const ITensor                   *idx,
+                                        const FFTDigitReverseKernelInfo &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), config));
@@ -91,11 +100,11 @@
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 
-    if(axis == 0)
+    if (axis == 0)
     {
-        if(is_input_complex)
+        if (is_input_complex)
         {
-            if(is_conj)
+            if (is_conj)
             {
                 _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<true, true>;
             }
@@ -109,11 +118,11 @@
             _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_0<false, false>;
         }
     }
-    else if(axis == 1)
+    else if (axis == 1)
     {
-        if(is_input_complex)
+        if (is_input_complex)
         {
-            if(is_conj)
+            if (is_conj)
             {
                 _func = &NEFFTDigitReverseKernel::digit_reverse_kernel_axis_1<true, true>;
             }
@@ -133,10 +142,14 @@
     }
 }
 
-Status NEFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config)
+Status NEFFTDigitReverseKernel::validate(const ITensorInfo               *input,
+                                         const ITensorInfo               *output,
+                                         const ITensorInfo               *idx,
+                                         const FFTDigitReverseKernelInfo &config)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, config));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), config).first);
     return Status{};
 }
 
@@ -159,38 +172,40 @@
     std::vector<float> buffer_row_out(2 * N);
     std::vector<float> buffer_row_in(2 * N);
 
-    execute_window_loop(slice, [&](const Coordinates &)
-    {
-        if(is_input_complex)
+    execute_window_loop(
+        slice,
+        [&](const Coordinates &)
         {
-            // Load
-            memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float));
-
-            // Shuffle
-            for(size_t x = 0; x < 2 * N; x += 2)
+            if (is_input_complex)
             {
-                size_t idx            = buffer_idx[x / 2];
-                buffer_row_out[x]     = buffer_row_in[2 * idx];
-                buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]);
-            }
-        }
-        else
-        {
-            // Load
-            memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float));
+                // Load
+                memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), 2 * N * sizeof(float));
 
-            // Shuffle
-            for(size_t x = 0; x < N; ++x)
+                // Shuffle
+                for (size_t x = 0; x < 2 * N; x += 2)
+                {
+                    size_t idx            = buffer_idx[x / 2];
+                    buffer_row_out[x]     = buffer_row_in[2 * idx];
+                    buffer_row_out[x + 1] = (is_conj ? -buffer_row_in[2 * idx + 1] : buffer_row_in[2 * idx + 1]);
+                }
+            }
+            else
             {
-                size_t idx            = buffer_idx[x];
-                buffer_row_out[2 * x] = buffer_row_in[idx];
-            }
-        }
+                // Load
+                memcpy(buffer_row_in.data(), reinterpret_cast<float *>(in.ptr()), N * sizeof(float));
 
-        // Copy back
-        memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float));
-    },
-    in, out);
+                // Shuffle
+                for (size_t x = 0; x < N; ++x)
+                {
+                    size_t idx            = buffer_idx[x];
+                    buffer_row_out[2 * x] = buffer_row_in[idx];
+                }
+            }
+
+            // Copy back
+            memcpy(reinterpret_cast<float *>(out.ptr()), buffer_row_out.data(), 2 * N * sizeof(float));
+        },
+        in, out);
 }
 
 template <bool is_input_complex, bool is_conj>
@@ -215,39 +230,41 @@
     const size_t stride_z = _input->info()->strides_in_bytes()[2];
     const size_t stride_w = _input->info()->strides_in_bytes()[3];
 
-    execute_window_loop(slice, [&](const Coordinates & id)
-    {
-        auto        *out_ptr    = reinterpret_cast<float *>(out.ptr());
-        auto        *in_ptr     = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w);
-        const size_t y_shuffled = buffer_idx[id.y()];
-
-        if(is_input_complex)
+    execute_window_loop(
+        slice,
+        [&](const Coordinates &id)
         {
-            // Shuffle the entire row into the output
-            memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float));
+            auto        *out_ptr = reinterpret_cast<float *>(out.ptr());
+            auto        *in_ptr  = reinterpret_cast<float *>(_input->buffer() + id.z() * stride_z + id[3] * stride_w);
+            const size_t y_shuffled = buffer_idx[id.y()];
 
-            // Conjugate if necessary
-            if(is_conj)
+            if (is_input_complex)
             {
-                for(size_t x = 0; x < 2 * Nx; x += 2)
+                // Shuffle the entire row into the output
+                memcpy(out_ptr, in_ptr + 2 * Nx * y_shuffled, 2 * Nx * sizeof(float));
+
+                // Conjugate if necessary
+                if (is_conj)
                 {
-                    out_ptr[x + 1] = -out_ptr[x + 1];
+                    for (size_t x = 0; x < 2 * Nx; x += 2)
+                    {
+                        out_ptr[x + 1] = -out_ptr[x + 1];
+                    }
                 }
             }
-        }
-        else
-        {
-            // Shuffle the entire row into the buffer
-            memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float));
-
-            // Copy the buffer to the output, with a zero imaginary part
-            for(size_t x = 0; x < 2 * Nx; x += 2)
+            else
             {
-                out_ptr[x] = buffer_row[x / 2];
+                // Shuffle the entire row into the buffer
+                memcpy(buffer_row.data(), in_ptr + Nx * y_shuffled, Nx * sizeof(float));
+
+                // Copy the buffer to the output, with a zero imaginary part
+                for (size_t x = 0; x < 2 * Nx; x += 2)
+                {
+                    out_ptr[x] = buffer_row[x / 2];
+                }
             }
-        }
-    },
-    out);
+        },
+        out);
 }
 
 void NEFFTDigitReverseKernel::run(const Window &window, const ThreadInfo &info)
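
For reference, the axis-0 digit-reverse stage reformatted above permutes each
row of N complex values through a precomputed index buffer, optionally negating
the imaginary part. A minimal scalar sketch of the same shuffle (illustrative
only, assuming interleaved re/im floats and an index table like `buffer_idx`;
the names below are hypothetical, not the library's API):

    #include <cstddef>
    #include <vector>

    // Shuffle one row of N complex values (interleaved re,im) through the
    // digit-reversed index table; 'conj' negates the imaginary lane on the fly.
    void digit_reverse_row(const std::vector<float>    &in,
                           std::vector<float>          &out,
                           const std::vector<unsigned> &idx,
                           bool                         conj)
    {
        const std::size_t N = idx.size();
        for (std::size_t x = 0; x < N; ++x)
        {
            const std::size_t src = idx[x];
            out[2 * x]            = in[2 * src];
            out[2 * x + 1]        = conj ? -in[2 * src + 1] : in[2 * src + 1];
        }
    }
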
diff --git a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
index f436c36..ecf85eb 100644
--- a/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
+++ b/src/core/NEON/kernels/NEFFTDigitReverseKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NEFFTDIGITREVERSEKERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
@@ -70,7 +71,10 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, const FFTDigitReverseKernelInfo &config);
+    static Status validate(const ITensorInfo               *input,
+                           const ITensorInfo               *output,
+                           const ITensorInfo               *idx,
+                           const FFTDigitReverseKernelInfo &config);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
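
The signature reflow throughout this header follows the revised style: once a
declaration exceeds the column limit, parameters go one per line with the names
column-aligned after the longest type. The configuration file itself is not
part of this delivery, so the options named below are inferred from the diff,
not confirmed:

    // Inferred: ColumnLimit around 120, BinPackParameters: false,
    // PointerAlignment: Right, with parameter-name alignment.
    // Short signatures stay on one line:
    Status validate(const ITensorInfo *input);

    // Long signatures break to one parameter per line, names aligned:
    Status validate(const ITensorInfo               *input,
                    const ITensorInfo               *output,
                    const ITensorInfo               *idx,
                    const FFTDigitReverseKernelInfo &config);
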
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
index 44c841f..4b58a7b 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.cpp
@@ -28,10 +28,11 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/traits.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/traits.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "support/ToolchainSupport.h"
 
 #include <arm_neon.h>
@@ -70,7 +71,7 @@
 {
     using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
 
-    const float32x2_t mask = { -1.0, 1.0 };
+    const float32x2_t mask = {-1.0, 1.0};
     const float32x2_t tmp0 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
     const float32x2_t tmp1 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
 
@@ -88,7 +89,7 @@
     const float a_r = wrapper::vgetlane(a, 0);
     const float a_i = wrapper::vgetlane(a, 1);
 
-    const auto out = wrapper::vmul(float32x2_t{ -a_i, a_r }, float32x2_t{ img_constant, img_constant });
+    const auto out = wrapper::vmul(float32x2_t{-a_i, a_r}, float32x2_t{img_constant, img_constant});
     return out;
 }
 
@@ -100,7 +101,8 @@
     return wrapper::vadd(t2, e);
 }
 
-float32x2_t reduce_sum_7(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
+float32x2_t reduce_sum_7(
+    float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7)
 {
     const auto t0  = wrapper::vadd(x1, x2);
     const auto t1  = wrapper::vadd(x3, x4);
@@ -111,7 +113,14 @@
     return wrapper::vadd(t00, t01);
 }
 
-float32x2_t reduce_sum_8(float32x2_t x1, float32x2_t x2, float32x2_t x3, float32x2_t x4, float32x2_t x5, float32x2_t x6, float32x2_t x7, float32x2_t x8)
+float32x2_t reduce_sum_8(float32x2_t x1,
+                         float32x2_t x2,
+                         float32x2_t x3,
+                         float32x2_t x4,
+                         float32x2_t x5,
+                         float32x2_t x6,
+                         float32x2_t x7,
+                         float32x2_t x8)
 {
     const auto t0  = wrapper::vadd(x1, x2);
     const auto t1  = wrapper::vadd(x3, x4);
@@ -141,15 +150,21 @@
     x = wrapper::vadd(a, b);
     x = wrapper::vadd(x, c);
 
-    const auto v1 = wrapper::vmul(float32x2_t{ 0.5f, 0.5 }, wrapper::vadd(b, c));
-    const auto v2 = c_mul_neon(float32x2_t{ 0.f, -kSqrt3Div2 }, wrapper::vsub(b, c));
+    const auto v1 = wrapper::vmul(float32x2_t{0.5f, 0.5}, wrapper::vadd(b, c));
+    const auto v2 = c_mul_neon(float32x2_t{0.f, -kSqrt3Div2}, wrapper::vsub(b, c));
 
     y = z = wrapper::vsub(a, v1);
     y     = wrapper::vadd(y, v2);
     z     = wrapper::vsub(z, v2);
 }
 
-void fft_4(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3)
+void fft_4(float32x2_t       &x1,
+           float32x2_t       &x2,
+           float32x2_t       &x3,
+           float32x2_t       &x4,
+           const float32x2_t &w,
+           const float32x2_t &w2,
+           const float32x2_t &w3)
 {
     float32x2_t a = x1;
     float32x2_t b = c_mul_neon(w, x2);
@@ -173,7 +188,15 @@
     x4             = wrapper::vadd(x41, x42);
 }
 
-void fft_5(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3, const float32x2_t &w4)
+void fft_5(float32x2_t       &x1,
+           float32x2_t       &x2,
+           float32x2_t       &x3,
+           float32x2_t       &x4,
+           float32x2_t       &x5,
+           const float32x2_t &w,
+           const float32x2_t &w2,
+           const float32x2_t &w3,
+           const float32x2_t &w4)
 {
     const auto a = x1;
     const auto b = c_mul_neon(w, x2);
@@ -181,25 +204,25 @@
     const auto d = c_mul_neon(w3, x4);
     const auto e = c_mul_neon(w4, x5);
 
-    const auto b0 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, b);
-    const auto b1 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, b);
-    const auto b2 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, b);
-    const auto b3 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, b);
+    const auto b0 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, b);
+    const auto b1 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, b);
+    const auto b2 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, b);
+    const auto b3 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, b);
 
-    const auto c0 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, c);
-    const auto c1 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, c);
-    const auto c2 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, c);
-    const auto c3 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, c);
+    const auto c0 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, c);
+    const auto c1 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, c);
+    const auto c2 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, c);
+    const auto c3 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, c);
 
-    const auto d0 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, d);
-    const auto d1 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, d);
-    const auto d2 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, d);
-    const auto d3 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, d);
+    const auto d0 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, d);
+    const auto d1 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, d);
+    const auto d2 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, d);
+    const auto d3 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, d);
 
-    const auto e0 = c_mul_neon(float32x2_t{ kW5_0, kW5_1 }, e);
-    const auto e1 = c_mul_neon(float32x2_t{ -kW5_2, kW5_3 }, e);
-    const auto e2 = c_mul_neon(float32x2_t{ -kW5_2, -kW5_3 }, e);
-    const auto e3 = c_mul_neon(float32x2_t{ kW5_0, -kW5_1 }, e);
+    const auto e0 = c_mul_neon(float32x2_t{kW5_0, kW5_1}, e);
+    const auto e1 = c_mul_neon(float32x2_t{-kW5_2, kW5_3}, e);
+    const auto e2 = c_mul_neon(float32x2_t{-kW5_2, -kW5_3}, e);
+    const auto e3 = c_mul_neon(float32x2_t{kW5_0, -kW5_1}, e);
 
     x1 = reduce_sum_5(a, b, c, d, e);
     x2 = reduce_sum_5(a, b0, c0, d0, e0);
@@ -208,9 +231,19 @@
     x5 = reduce_sum_5(a, b3, c3, d3, e3);
 }
 
-void fft_7(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, const float32x2_t &w, const float32x2_t &w2, const float32x2_t &w3,
+void fft_7(float32x2_t       &x1,
+           float32x2_t       &x2,
+           float32x2_t       &x3,
+           float32x2_t       &x4,
+           float32x2_t       &x5,
+           float32x2_t       &x6,
+           float32x2_t       &x7,
+           const float32x2_t &w,
+           const float32x2_t &w2,
+           const float32x2_t &w3,
            const float32x2_t &w4,
-           const float32x2_t &w5, const float32x2_t &w6)
+           const float32x2_t &w5,
+           const float32x2_t &w6)
 {
     const auto a = x1;
     const auto b = c_mul_neon(w, x2);
@@ -220,47 +253,47 @@
     const auto f = c_mul_neon(w5, x6);
     const auto g = c_mul_neon(w6, x7);
 
-    const auto b0 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, b);
-    const auto b1 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, b);
-    const auto b2 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, b);
-    const auto b3 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, b);
-    const auto b4 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, b);
-    const auto b5 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, b);
+    const auto b0 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, b);
+    const auto b1 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, b);
+    const auto b2 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, b);
+    const auto b3 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, b);
+    const auto b4 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, b);
+    const auto b5 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, b);
 
-    const auto c0 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, c);
-    const auto c1 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, c);
-    const auto c2 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, c);
-    const auto c3 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, c);
-    const auto c4 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, c);
-    const auto c5 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, c);
+    const auto c0 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, c);
+    const auto c1 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, c);
+    const auto c2 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, c);
+    const auto c3 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, c);
+    const auto c4 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, c);
+    const auto c5 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, c);
 
-    const auto d0 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, d);
-    const auto d1 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, d);
-    const auto d2 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, d);
-    const auto d3 = c_mul_neon(float32x2_t{ -kW7_2, +kW7_3 }, d);
-    const auto d4 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, d);
-    const auto d5 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, d);
+    const auto d0 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, d);
+    const auto d1 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, d);
+    const auto d2 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, d);
+    const auto d3 = c_mul_neon(float32x2_t{-kW7_2, +kW7_3}, d);
+    const auto d4 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, d);
+    const auto d5 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, d);
 
-    const auto e0 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, e);
-    const auto e1 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, e);
-    const auto e2 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, e);
-    const auto e3 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, e);
-    const auto e4 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, e);
-    const auto e5 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, e);
+    const auto e0 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, e);
+    const auto e1 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, e);
+    const auto e2 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, e);
+    const auto e3 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, e);
+    const auto e4 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, e);
+    const auto e5 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, e);
 
-    const auto f0 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, f);
-    const auto f1 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, f);
-    const auto f2 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, f);
-    const auto f3 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, f);
-    const auto f4 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, f);
-    const auto f5 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, f);
+    const auto f0 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, f);
+    const auto f1 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, f);
+    const auto f2 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, f);
+    const auto f3 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, f);
+    const auto f4 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, f);
+    const auto f5 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, f);
 
-    const auto g0 = c_mul_neon(float32x2_t{ kW7_0, kW7_1 }, g);
-    const auto g1 = c_mul_neon(float32x2_t{ -kW7_2, kW7_3 }, g);
-    const auto g2 = c_mul_neon(float32x2_t{ -kW7_4, kW7_5 }, g);
-    const auto g3 = c_mul_neon(float32x2_t{ -kW7_4, -kW7_5 }, g);
-    const auto g4 = c_mul_neon(float32x2_t{ -kW7_2, -kW7_3 }, g);
-    const auto g5 = c_mul_neon(float32x2_t{ kW7_0, -kW7_1 }, g);
+    const auto g0 = c_mul_neon(float32x2_t{kW7_0, kW7_1}, g);
+    const auto g1 = c_mul_neon(float32x2_t{-kW7_2, kW7_3}, g);
+    const auto g2 = c_mul_neon(float32x2_t{-kW7_4, kW7_5}, g);
+    const auto g3 = c_mul_neon(float32x2_t{-kW7_4, -kW7_5}, g);
+    const auto g4 = c_mul_neon(float32x2_t{-kW7_2, -kW7_3}, g);
+    const auto g5 = c_mul_neon(float32x2_t{kW7_0, -kW7_1}, g);
 
     x1 = reduce_sum_7(a, b, c, d, e, f, g);
     x2 = reduce_sum_7(a, b0, c0, d0, e0, f0, g0);
@@ -271,9 +304,20 @@
     x7 = reduce_sum_7(a, b5, c5, d5, e5, f5, g5);
 }
 
-void fft_8(float32x2_t &x1, float32x2_t &x2, float32x2_t &x3, float32x2_t &x4, float32x2_t &x5, float32x2_t &x6, float32x2_t &x7, float32x2_t &x8, const float32x2_t &w, const float32x2_t &w2,
+void fft_8(float32x2_t       &x1,
+           float32x2_t       &x2,
+           float32x2_t       &x3,
+           float32x2_t       &x4,
+           float32x2_t       &x5,
+           float32x2_t       &x6,
+           float32x2_t       &x7,
+           float32x2_t       &x8,
+           const float32x2_t &w,
+           const float32x2_t &w2,
            const float32x2_t &w3,
-           const float32x2_t &w4, const float32x2_t &w5, const float32x2_t &w6,
+           const float32x2_t &w4,
+           const float32x2_t &w5,
+           const float32x2_t &w6,
            const float32x2_t &w7)
 {
     const auto a = x1;
@@ -285,61 +329,61 @@
     const auto g = c_mul_neon(w6, x7);
     const auto h = c_mul_neon(w7, x8);
 
-    const auto b0 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, b);
-    const auto b1 = c_mul_neon(float32x2_t{ 0, -1 }, b);
-    const auto b2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, b);
-    const auto b3 = c_mul_neon(float32x2_t{ -1, 0 }, b);
-    const auto b4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, b);
-    const auto b5 = c_mul_neon(float32x2_t{ 0, 1 }, b);
-    const auto b6 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, b);
+    const auto b0 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, b);
+    const auto b1 = c_mul_neon(float32x2_t{0, -1}, b);
+    const auto b2 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, b);
+    const auto b3 = c_mul_neon(float32x2_t{-1, 0}, b);
+    const auto b4 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, b);
+    const auto b5 = c_mul_neon(float32x2_t{0, 1}, b);
+    const auto b6 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, b);
 
-    const auto c0 = c_mul_neon(float32x2_t{ 0, -1 }, c);
-    const auto c1 = c_mul_neon(float32x2_t{ -1, 0 }, c);
-    const auto c2 = c_mul_neon(float32x2_t{ 0, 1 }, c);
-    const auto c3 = c_mul_neon(float32x2_t{ 1, 0 }, c);
-    const auto c4 = c_mul_neon(float32x2_t{ 0, -1 }, c);
-    const auto c5 = c_mul_neon(float32x2_t{ -1, 0 }, c);
-    const auto c6 = c_mul_neon(float32x2_t{ 0, 1 }, c);
+    const auto c0 = c_mul_neon(float32x2_t{0, -1}, c);
+    const auto c1 = c_mul_neon(float32x2_t{-1, 0}, c);
+    const auto c2 = c_mul_neon(float32x2_t{0, 1}, c);
+    const auto c3 = c_mul_neon(float32x2_t{1, 0}, c);
+    const auto c4 = c_mul_neon(float32x2_t{0, -1}, c);
+    const auto c5 = c_mul_neon(float32x2_t{-1, 0}, c);
+    const auto c6 = c_mul_neon(float32x2_t{0, 1}, c);
 
-    const auto d0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, d);
-    const auto d1 = c_mul_neon(float32x2_t{ 0, 1 }, d);
-    const auto d2 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, d);
-    const auto d3 = c_mul_neon(float32x2_t{ -1, 0 }, d);
-    const auto d4 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, d);
-    const auto d5 = c_mul_neon(float32x2_t{ 0, -1 }, d);
-    const auto d6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, d);
+    const auto d0 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, d);
+    const auto d1 = c_mul_neon(float32x2_t{0, 1}, d);
+    const auto d2 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, d);
+    const auto d3 = c_mul_neon(float32x2_t{-1, 0}, d);
+    const auto d4 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, d);
+    const auto d5 = c_mul_neon(float32x2_t{0, -1}, d);
+    const auto d6 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, d);
 
-    const auto e0 = c_mul_neon(float32x2_t{ -1, 0 }, e);
-    const auto e1 = c_mul_neon(float32x2_t{ 1, 0 }, e);
-    const auto e2 = c_mul_neon(float32x2_t{ -1, 0 }, e);
-    const auto e3 = c_mul_neon(float32x2_t{ 1, 0 }, e);
-    const auto e4 = c_mul_neon(float32x2_t{ -1, 0 }, e);
-    const auto e5 = c_mul_neon(float32x2_t{ 1, 0 }, e);
-    const auto e6 = c_mul_neon(float32x2_t{ -1, 0 }, e);
+    const auto e0 = c_mul_neon(float32x2_t{-1, 0}, e);
+    const auto e1 = c_mul_neon(float32x2_t{1, 0}, e);
+    const auto e2 = c_mul_neon(float32x2_t{-1, 0}, e);
+    const auto e3 = c_mul_neon(float32x2_t{1, 0}, e);
+    const auto e4 = c_mul_neon(float32x2_t{-1, 0}, e);
+    const auto e5 = c_mul_neon(float32x2_t{1, 0}, e);
+    const auto e6 = c_mul_neon(float32x2_t{-1, 0}, e);
 
-    const auto f0 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, f);
-    const auto f1 = c_mul_neon(float32x2_t{ 0, -1 }, f);
-    const auto f2 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, f);
-    const auto f3 = c_mul_neon(float32x2_t{ -1, 0 }, f);
-    const auto f4 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, f);
-    const auto f5 = c_mul_neon(float32x2_t{ 0, 1 }, f);
-    const auto f6 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, f);
+    const auto f0 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, f);
+    const auto f1 = c_mul_neon(float32x2_t{0, -1}, f);
+    const auto f2 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, f);
+    const auto f3 = c_mul_neon(float32x2_t{-1, 0}, f);
+    const auto f4 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, f);
+    const auto f5 = c_mul_neon(float32x2_t{0, 1}, f);
+    const auto f6 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, f);
 
-    const auto g0 = c_mul_neon(float32x2_t{ 0, 1 }, g);
-    const auto g1 = c_mul_neon(float32x2_t{ -1, 0 }, g);
-    const auto g2 = c_mul_neon(float32x2_t{ 0, -1 }, g);
-    const auto g3 = c_mul_neon(float32x2_t{ 1, 0 }, g);
-    const auto g4 = c_mul_neon(float32x2_t{ 0, 1 }, g);
-    const auto g5 = c_mul_neon(float32x2_t{ -1, 0 }, g);
-    const auto g6 = c_mul_neon(float32x2_t{ 0, -1 }, g);
+    const auto g0 = c_mul_neon(float32x2_t{0, 1}, g);
+    const auto g1 = c_mul_neon(float32x2_t{-1, 0}, g);
+    const auto g2 = c_mul_neon(float32x2_t{0, -1}, g);
+    const auto g3 = c_mul_neon(float32x2_t{1, 0}, g);
+    const auto g4 = c_mul_neon(float32x2_t{0, 1}, g);
+    const auto g5 = c_mul_neon(float32x2_t{-1, 0}, g);
+    const auto g6 = c_mul_neon(float32x2_t{0, -1}, g);
 
-    const auto h0 = c_mul_neon(float32x2_t{ kSqrt2Div2, kSqrt2Div2 }, h);
-    const auto h1 = c_mul_neon(float32x2_t{ 0, 1 }, h);
-    const auto h2 = c_mul_neon(float32x2_t{ -kSqrt2Div2, kSqrt2Div2 }, h);
-    const auto h3 = c_mul_neon(float32x2_t{ -1, 0 }, h);
-    const auto h4 = c_mul_neon(float32x2_t{ -kSqrt2Div2, -kSqrt2Div2 }, h);
-    const auto h5 = c_mul_neon(float32x2_t{ 0, -1 }, h);
-    const auto h6 = c_mul_neon(float32x2_t{ kSqrt2Div2, -kSqrt2Div2 }, h);
+    const auto h0 = c_mul_neon(float32x2_t{kSqrt2Div2, kSqrt2Div2}, h);
+    const auto h1 = c_mul_neon(float32x2_t{0, 1}, h);
+    const auto h2 = c_mul_neon(float32x2_t{-kSqrt2Div2, kSqrt2Div2}, h);
+    const auto h3 = c_mul_neon(float32x2_t{-1, 0}, h);
+    const auto h4 = c_mul_neon(float32x2_t{-kSqrt2Div2, -kSqrt2Div2}, h);
+    const auto h5 = c_mul_neon(float32x2_t{0, -1}, h);
+    const auto h6 = c_mul_neon(float32x2_t{kSqrt2Div2, -kSqrt2Div2}, h);
 
     x1 = reduce_sum_8(a, b, c, d, e, f, g, h);
     x2 = reduce_sum_8(a, b0, c0, d0, e0, f0, g0, h0);
@@ -352,18 +396,19 @@
 }
 
 template <bool first_stage>
-void fft_radix_2_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_2_axes_0(
+    float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
 {
-    float32x2_t w{ 1.0f, 0.0f };
-    for(unsigned int j = 0; j < Nx; j++)
+    float32x2_t w{1.0f, 0.0f};
+    for (unsigned int j = 0; j < Nx; j++)
     {
-        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
         {
-            auto a = float32x2_t{ 0, 0 };
-            auto b = float32x2_t{ 0, 0 };
+            auto a = float32x2_t{0, 0};
+            auto b = float32x2_t{0, 0};
 
             // Load inputs
-            if(first_stage)
+            if (first_stage)
             {
                 const auto ab = wrapper::vloadq(in + k);
                 a             = wrapper::vgetlow(ab);
@@ -379,7 +424,7 @@
             fft_2(a, b, w);
 
             // Write outputs
-            if(first_stage)
+            if (first_stage)
             {
                 wrapper::vstore(out + k, wrapper::vcombine(a, b));
             }
@@ -394,12 +439,20 @@
     }
 }
 
-void fft_radix_2_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_2_axes_1(float             *out,
+                        float             *in,
+                        unsigned int       Nx,
+                        unsigned int       NxRadix,
+                        const float32x2_t &w_m,
+                        unsigned int       N,
+                        unsigned int       M,
+                        unsigned int       in_pad_x,
+                        unsigned int       out_pad_x)
 {
-    float32x2_t w{ 1.0f, 0.0f };
-    for(unsigned int j = 0; j < Nx; j++)
+    float32x2_t w{1.0f, 0.0f};
+    for (unsigned int j = 0; j < Nx; j++)
     {
-        for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+        for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
         {
             // Load inputs
             float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -418,20 +471,21 @@
 }
 
 template <bool first_stage>
-void fft_radix_3_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_3_axes_0(
+    float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
 {
-    float32x2_t w{ 1.0f, 0.0f };
-    for(unsigned int j = 0; j < Nx; j++)
+    float32x2_t w{1.0f, 0.0f};
+    for (unsigned int j = 0; j < Nx; j++)
     {
         const auto w2 = c_mul_neon(w, w);
 
-        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
         {
             // Load inputs
-            float32x2_t a = { 0, 0 };
-            float32x2_t b = { 0, 0 };
-            float32x2_t c = { 0, 0 };
-            if(first_stage)
+            float32x2_t a = {0, 0};
+            float32x2_t b = {0, 0};
+            float32x2_t c = {0, 0};
+            if (first_stage)
             {
                 const auto ab = wrapper::vloadq(in + k);
                 a             = wrapper::vgetlow(ab);
@@ -447,7 +501,7 @@
             // Base-case prime transform
             fft_3(a, b, c, w, w2);
 
-            if(first_stage)
+            if (first_stage)
             {
                 wrapper::vstore(out + k, wrapper::vcombine(a, b));
             }
@@ -462,14 +516,22 @@
     }
 }
 
-void fft_radix_3_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_3_axes_1(float             *out,
+                        float             *in,
+                        unsigned int       Nx,
+                        unsigned int       NxRadix,
+                        const float32x2_t &w_m,
+                        unsigned int       N,
+                        unsigned int       M,
+                        unsigned int       in_pad_x,
+                        unsigned int       out_pad_x)
 {
-    float32x2_t w{ 1.0f, 0.0f };
-    for(unsigned int j = 0; j < Nx; j++)
+    float32x2_t w{1.0f, 0.0f};
+    for (unsigned int j = 0; j < Nx; j++)
     {
         const auto w2 = c_mul_neon(w, w);
 
-        for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+        for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
         {
             // Load inputs
             float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -489,21 +551,22 @@
 }
 
 template <bool first_stage>
-void fft_radix_4_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_4_axes_0(
+    float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
 {
-    float32x2_t w{ 1.0f, 0.0f };
-    for(unsigned int j = 0; j < Nx; j++)
+    float32x2_t w{1.0f, 0.0f};
+    for (unsigned int j = 0; j < Nx; j++)
     {
         const auto w2 = c_mul_neon(w, w);
         const auto w3 = c_mul_neon(w2, w);
 
-        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
         {
-            float32x2_t a = { 0, 0 };
-            float32x2_t b = { 0, 0 };
-            float32x2_t c = { 0, 0 };
-            float32x2_t d = { 0, 0 };
-            if(first_stage)
+            float32x2_t a = {0, 0};
+            float32x2_t b = {0, 0};
+            float32x2_t c = {0, 0};
+            float32x2_t d = {0, 0};
+            if (first_stage)
             {
                 const auto ab = wrapper::vloadq(in + k);
                 const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -524,7 +587,7 @@
             // Base-case prime transform
             fft_4(a, b, c, d, w, w2, w3);
 
-            if(first_stage)
+            if (first_stage)
             {
                 wrapper::vstore(out + k, wrapper::vcombine(a, b));
                 wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -542,15 +605,23 @@
     }
 }
 
-void fft_radix_4_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_4_axes_1(float             *out,
+                        float             *in,
+                        unsigned int       Nx,
+                        unsigned int       NxRadix,
+                        const float32x2_t &w_m,
+                        unsigned int       N,
+                        unsigned int       M,
+                        unsigned int       in_pad_x,
+                        unsigned int       out_pad_x)
 {
-    float32x2_t w{ 1.0f, 0.0f };
-    for(unsigned int j = 0; j < Nx; j++)
+    float32x2_t w{1.0f, 0.0f};
+    for (unsigned int j = 0; j < Nx; j++)
     {
         const auto w2 = c_mul_neon(w, w);
         const auto w3 = c_mul_neon(w2, w);
 
-        for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+        for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
         {
             // Load inputs
             float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -572,25 +643,26 @@
 }
 
 template <bool first_stage>
-void fft_radix_5_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_5_axes_0(
+    float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
 {
-    float32x2_t w{ 1.0f, 0.0f };
-    for(unsigned int j = 0; j < Nx; j++)
+    float32x2_t w{1.0f, 0.0f};
+    for (unsigned int j = 0; j < Nx; j++)
     {
         const float32x2_t w2 = c_mul_neon(w, w);
         const float32x2_t w3 = c_mul_neon(w2, w);
         const float32x2_t w4 = c_mul_neon(w3, w);
 
-        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
         {
-            float32x2_t a = { 0, 0 };
-            float32x2_t b = { 0, 0 };
-            float32x2_t c = { 0, 0 };
-            float32x2_t d = { 0, 0 };
-            float32x2_t e = { 0, 0 };
+            float32x2_t a = {0, 0};
+            float32x2_t b = {0, 0};
+            float32x2_t c = {0, 0};
+            float32x2_t d = {0, 0};
+            float32x2_t e = {0, 0};
 
             // Load inputs
-            if(first_stage)
+            if (first_stage)
             {
                 const auto ab = wrapper::vloadq(in + k);
                 const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -613,7 +685,7 @@
             fft_5(a, b, c, d, e, w, w2, w3, w4);
 
             // Store outputs
-            if(first_stage)
+            if (first_stage)
             {
                 wrapper::vstore(out + k, wrapper::vcombine(a, b));
                 wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -632,16 +704,24 @@
     }
 }
 
-void fft_radix_5_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_5_axes_1(float             *out,
+                        float             *in,
+                        unsigned int       Nx,
+                        unsigned int       NxRadix,
+                        const float32x2_t &w_m,
+                        unsigned int       N,
+                        unsigned int       M,
+                        unsigned int       in_pad_x,
+                        unsigned int       out_pad_x)
 {
-    float32x2_t w{ 1.0f, 0.0f };
-    for(unsigned int j = 0; j < Nx; j++)
+    float32x2_t w{1.0f, 0.0f};
+    for (unsigned int j = 0; j < Nx; j++)
     {
         const float32x2_t w2 = c_mul_neon(w, w);
         const float32x2_t w3 = c_mul_neon(w2, w);
         const float32x2_t w4 = c_mul_neon(w3, w);
 
-        for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+        for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
         {
             // Load inputs
             float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -666,10 +746,11 @@
 }
 
 template <bool first_stage>
-void fft_radix_7_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_7_axes_0(
+    float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
 {
-    float32x2_t w{ 1.0f, 0.0f };
-    for(unsigned int j = 0; j < Nx; j++)
+    float32x2_t w{1.0f, 0.0f};
+    for (unsigned int j = 0; j < Nx; j++)
     {
         const float32x2_t w2 = c_mul_neon(w, w);
         const float32x2_t w3 = c_mul_neon(w2, w);
@@ -677,18 +758,18 @@
         const float32x2_t w5 = c_mul_neon(w4, w);
         const float32x2_t w6 = c_mul_neon(w5, w);
 
-        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
         {
-            float32x2_t a = { 0, 0 };
-            float32x2_t b = { 0, 0 };
-            float32x2_t c = { 0, 0 };
-            float32x2_t d = { 0, 0 };
-            float32x2_t e = { 0, 0 };
-            float32x2_t f = { 0, 0 };
-            float32x2_t g = { 0, 0 };
+            float32x2_t a = {0, 0};
+            float32x2_t b = {0, 0};
+            float32x2_t c = {0, 0};
+            float32x2_t d = {0, 0};
+            float32x2_t e = {0, 0};
+            float32x2_t f = {0, 0};
+            float32x2_t g = {0, 0};
 
             // Load inputs
-            if(first_stage)
+            if (first_stage)
             {
                 const auto ab = wrapper::vloadq(in + k);
                 const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -715,7 +796,7 @@
             // Base-case prime transform
             fft_7(a, b, c, d, e, f, g, w, w2, w3, w4, w5, w6);
 
-            if(first_stage)
+            if (first_stage)
             {
                 wrapper::vstore(out + k, wrapper::vcombine(a, b));
                 wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -737,10 +818,18 @@
     }
 }
 
-void fft_radix_7_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_7_axes_1(float             *out,
+                        float             *in,
+                        unsigned int       Nx,
+                        unsigned int       NxRadix,
+                        const float32x2_t &w_m,
+                        unsigned int       N,
+                        unsigned int       M,
+                        unsigned int       in_pad_x,
+                        unsigned int       out_pad_x)
 {
-    float32x2_t w{ 1.0f, 0.0f };
-    for(unsigned int j = 0; j < Nx; j++)
+    float32x2_t w{1.0f, 0.0f};
+    for (unsigned int j = 0; j < Nx; j++)
     {
         const float32x2_t w2 = c_mul_neon(w, w);
         const float32x2_t w3 = c_mul_neon(w2, w);
@@ -748,7 +837,7 @@
         const float32x2_t w5 = c_mul_neon(w4, w);
         const float32x2_t w6 = c_mul_neon(w5, w);
 
-        for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+        for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
         {
             // Load inputs
             float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -777,10 +866,11 @@
 }
 
 template <bool first_stage>
-void fft_radix_8_axes_0(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
+void fft_radix_8_axes_0(
+    float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N)
 {
-    float32x2_t w{ 1.0f, 0.0f };
-    for(unsigned int j = 0; j < Nx; j++)
+    float32x2_t w{1.0f, 0.0f};
+    for (unsigned int j = 0; j < Nx; j++)
     {
         const float32x2_t w2 = c_mul_neon(w, w);
         const float32x2_t w3 = c_mul_neon(w2, w);
@@ -789,20 +879,20 @@
         const float32x2_t w6 = c_mul_neon(w5, w);
         const float32x2_t w7 = c_mul_neon(w6, w);
 
-        for(unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
+        for (unsigned int k = 2 * j; k < 2 * N; k += 2 * NxRadix)
         {
             // Load inputs
-            float32x2_t a = { 0, 0 };
-            float32x2_t b = { 0, 0 };
-            float32x2_t c = { 0, 0 };
-            float32x2_t d = { 0, 0 };
-            float32x2_t e = { 0, 0 };
-            float32x2_t f = { 0, 0 };
-            float32x2_t g = { 0, 0 };
-            float32x2_t h = { 0, 0 };
+            float32x2_t a = {0, 0};
+            float32x2_t b = {0, 0};
+            float32x2_t c = {0, 0};
+            float32x2_t d = {0, 0};
+            float32x2_t e = {0, 0};
+            float32x2_t f = {0, 0};
+            float32x2_t g = {0, 0};
+            float32x2_t h = {0, 0};
 
             // Base-case prime transform
-            if(first_stage)
+            if (first_stage)
             {
                 const auto ab = wrapper::vloadq(in + k);
                 const auto cd = wrapper::vloadq(in + k + 4 * Nx);
@@ -834,7 +924,7 @@
             fft_8(a, b, c, d, e, f, g, h, w, w2, w3, w4, w5, w6, w7);
 
             // Store outputs
-            if(first_stage)
+            if (first_stage)
             {
                 wrapper::vstore(out + k, wrapper::vcombine(a, b));
                 wrapper::vstore(out + k + 4 * Nx, wrapper::vcombine(c, d));
@@ -858,10 +948,18 @@
     }
 }
 
-void fft_radix_8_axes_1(float *out, float *in, unsigned int Nx, unsigned int NxRadix, const float32x2_t &w_m, unsigned int N, unsigned int M, unsigned int in_pad_x, unsigned int out_pad_x)
+void fft_radix_8_axes_1(float             *out,
+                        float             *in,
+                        unsigned int       Nx,
+                        unsigned int       NxRadix,
+                        const float32x2_t &w_m,
+                        unsigned int       N,
+                        unsigned int       M,
+                        unsigned int       in_pad_x,
+                        unsigned int       out_pad_x)
 {
-    float32x2_t w{ 1.0f, 0.0f };
-    for(unsigned int j = 0; j < Nx; j++)
+    float32x2_t w{1.0f, 0.0f};
+    for (unsigned int j = 0; j < Nx; j++)
     {
         const float32x2_t w2 = c_mul_neon(w, w);
         const float32x2_t w3 = c_mul_neon(w2, w);
@@ -870,7 +968,7 @@
         const float32x2_t w6 = c_mul_neon(w5, w);
         const float32x2_t w7 = c_mul_neon(w6, w);
 
-        for(unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
+        for (unsigned int k = 2 * j; k < 2 * M; k += 2 * NxRadix)
         {
             // Load inputs
             float32x2_t a = wrapper::vload(in + (N + in_pad_x) * k);
@@ -908,7 +1006,7 @@
     ARM_COMPUTE_UNUSED(config);
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -917,11 +1015,12 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelInfo &config)
 {
     ARM_COMPUTE_UNUSED(config);
 
-    if(output != nullptr)
+    if (output != nullptr)
     {
         auto_init_if_empty(*output, *input);
     }
@@ -942,7 +1041,7 @@
     // FFT table axis 0: [radix, first_stage]
     static std::map<unsigned int, std::map<bool, FFTFunctionPointerAxis0>> fft_table_axis0;
 
-    if(fft_table_axis0.empty())
+    if (fft_table_axis0.empty())
     {
         fft_table_axis0[2][false] = &fft_radix_2_axes_0<false>;
         fft_table_axis0[3][false] = &fft_radix_3_axes_0<false>;
@@ -967,7 +1066,7 @@
     // FFT table axis 1: [radix, first_stage]
     static std::map<unsigned int, FFTFunctionPointerAxis1> fft_table_axis1;
 
-    if(fft_table_axis1.empty())
+    if (fft_table_axis1.empty())
     {
         fft_table_axis1[2] = &fft_radix_2_axes_1;
         fft_table_axis1[3] = &fft_radix_3_axes_1;
@@ -985,12 +1084,13 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
     // Output auto initialization if not yet initialized
-    if(output != nullptr)
+    if (output != nullptr)
     {
         auto_init_if_empty(*output->info(), *input->info()->clone());
     }
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
 
     _input  = input;
     _output = (output == nullptr) ? input : output;
@@ -998,7 +1098,7 @@
     _axis   = config.axis;
     _radix  = config.radix;
 
-    switch(config.axis)
+    switch (config.axis)
     {
         case 0:
             set_radix_stage_axis0(config);
@@ -1012,26 +1112,28 @@
     }
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config);
+    auto win_config =
+        validate_and_configure_window(input->info(), (output != nullptr) ? output->info() : nullptr, config);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 }
 
-Status NEFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelInfo &config)
+Status NEFFTRadixStageKernel::validate(const ITensorInfo             *input,
+                                       const ITensorInfo             *output,
+                                       const FFTRadixStageKernelInfo &config)
 {
     const bool run_in_place = (output == nullptr) || (output == input);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
-                                                              (run_in_place) ? nullptr : output->clone().get(),
-                                                              config)
-                                .first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), (run_in_place) ? nullptr : output->clone().get(), config)
+            .first);
 
     return Status{};
 }
 
 std::set<unsigned int> NEFFTRadixStageKernel::supported_radix()
 {
-    return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+    return std::set<unsigned int>{2, 3, 4, 5, 7, 8};
 }
 
 void NEFFTRadixStageKernel::run(const Window &window, const ThreadInfo &info)
@@ -1049,28 +1151,32 @@
     // Precompute FFT constants
     const unsigned int NxRadix = _radix * _Nx;
     const float        alpha   = 2.0f * kPi / float(NxRadix);
-    const float32x2_t  w_m{ cosf(alpha), -sinf(alpha) };
+    const float32x2_t  w_m{cosf(alpha), -sinf(alpha)};
 
-    if(_axis == 0)
+    if (_axis == 0)
     {
         const unsigned int N = _input->info()->dimension(0);
-        execute_window_loop(input_window, [&](const Coordinates &)
-        {
-            _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N);
-        },
-        in, out);
+        execute_window_loop(
+            input_window,
+            [&](const Coordinates &) {
+                _func_0(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m,
+                        N);
+            },
+            in, out);
     }
     else
     {
         const unsigned int N = _input->info()->dimension(0);
         const unsigned int M = _input->info()->dimension(1);
-        execute_window_loop(input_window, [&](const Coordinates &)
-        {
-            _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N, M,
-                    _input->info()->padding().right + _input->info()->padding().left,
-                    _output->info()->padding().right + _output->info()->padding().left);
-        },
-        in, out);
+        execute_window_loop(
+            input_window,
+            [&](const Coordinates &)
+            {
+                _func_1(reinterpret_cast<float *>(out.ptr()), reinterpret_cast<float *>(in.ptr()), _Nx, NxRadix, w_m, N,
+                        M, _input->info()->padding().right + _input->info()->padding().left,
+                        _output->info()->padding().right + _output->info()->padding().left);
+            },
+            in, out);
     }
 
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
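
All of the radix kernels reformatted above share one pattern: for each column
j, twiddle powers w, w2, ... are built by repeated complex multiplication, the
base-case prime transform (fft_2 ... fft_8) runs over strided elements, and w
advances by w_m = exp(-2*pi*i / NxRadix). A scalar sketch of the radix-2 case
with std::complex (illustrative only; the kernel operates on float32x2_t lanes
via c_mul_neon and the wrapper:: intrinsics, and fft_2's body is not shown in
this hunk):

    #include <cmath>
    #include <complex>

    constexpr float kPi = 3.14159265358979f;

    // One radix-2 DIT stage over N complex points with sub-FFT length Nx.
    void fft_radix_2_stage(std::complex<float> *x, unsigned int N, unsigned int Nx)
    {
        const unsigned int        NxRadix = 2 * Nx;
        const float               alpha   = 2.0f * kPi / float(NxRadix);
        const std::complex<float> w_m{std::cos(alpha), -std::sin(alpha)};

        std::complex<float> w{1.0f, 0.0f};
        for (unsigned int j = 0; j < Nx; ++j)
        {
            for (unsigned int k = j; k < N; k += NxRadix)
            {
                const auto a = x[k];
                const auto b = w * x[k + Nx]; // c_mul_neon(w, x2)
                x[k]         = a + b;         // fft_2 butterfly
                x[k + Nx]    = a - b;
            }
            w *= w_m; // twiddle recurrence between columns
        }
    }
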
diff --git a/src/core/NEON/kernels/NEFFTRadixStageKernel.h b/src/core/NEON/kernels/NEFFTRadixStageKernel.h
index 2291a10..54f32ef 100644
--- a/src/core/NEON/kernels/NEFFTRadixStageKernel.h
+++ b/src/core/NEON/kernels/NEFFTRadixStageKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NEFFTRADIXSTAGEKERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 #include <arm_neon.h>
@@ -92,8 +93,17 @@
     void set_radix_stage_axis0(const FFTRadixStageKernelInfo &config);
     void set_radix_stage_axis1(const FFTRadixStageKernelInfo &config);
 
-    using FFTFunctionPointerAxis0 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>;
-    using FFTFunctionPointerAxis1 = std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int, unsigned int, unsigned int, unsigned int)>;
+    using FFTFunctionPointerAxis0 =
+        std::function<void(float *, float *, unsigned int, unsigned int, const float32x2_t &, unsigned int)>;
+    using FFTFunctionPointerAxis1 = std::function<void(float *,
+                                                       float *,
+                                                       unsigned int,
+                                                       unsigned int,
+                                                       const float32x2_t &,
+                                                       unsigned int,
+                                                       unsigned int,
+                                                       unsigned int,
+                                                       unsigned int)>;
 
     FFTFunctionPointerAxis0 _func_0;
     FFTFunctionPointerAxis1 _func_1;
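
The two std::function aliases reformatted above back the radix dispatch in the
.cpp: a static map from radix (and, for axis 0, a first_stage flag) to the
matching fft_radix_*_axes_* instantiation. A minimal sketch of that dispatch
pattern (stub signature shortened for illustration; not the kernel's real
types):

    #include <functional>
    #include <map>

    using StageFn = std::function<void(float *, float *, unsigned int)>;

    template <bool first_stage>
    void fft_radix_2_axes_0_stub(float *, float *, unsigned int)
    {
        // ... butterfly work; first_stage variants load adjacent pairs ...
    }

    StageFn select_axis0(unsigned int radix, bool first_stage)
    {
        // [radix][first_stage] -> function, mirroring fft_table_axis0.
        static std::map<unsigned int, std::map<bool, StageFn>> table = {
            {2, {{false, &fft_radix_2_axes_0_stub<false>},
                 {true, &fft_radix_2_axes_0_stub<true>}}},
            // ... radices 3, 4, 5, 7 and 8 filled in the same way ...
        };
        return table.at(radix).at(first_stage);
    }
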
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.cpp b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
index 5ec330b..9fe561f 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.cpp
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.cpp
@@ -28,9 +28,10 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 
@@ -41,8 +42,8 @@
 void scale_complex(float *c_in, float *c_out, bool is_conjugate, float scale)
 {
     const auto a = wrapper::vload(c_in);
-    auto       b = wrapper::vdiv(a, float32x2_t{ scale, scale });
-    if(is_conjugate)
+    auto       b = wrapper::vdiv(a, float32x2_t{scale, scale});
+    if (is_conjugate)
     {
         const float img_part = wrapper::vgetlane(b, 1);
         b                    = wrapper::vsetlane(-img_part, b, 1);
@@ -56,7 +57,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -71,7 +72,7 @@
     // Configure kernel window
     Window win = calculate_max_window(*input, Steps());
 
-    if(output != nullptr)
+    if (output != nullptr)
     {
         // Output auto initialization if not yet initialized
         auto_init_if_empty(*output, *input->clone());
@@ -126,10 +127,10 @@
     Iterator in(_input, input_window);
     Iterator out(_run_in_place ? _input : _output, input_window);
 
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale);
-    },
-    in, out);
+    execute_window_loop(
+        window,
+        [&](const Coordinates &)
+        { scale_complex(reinterpret_cast<float *>(in.ptr()), reinterpret_cast<float *>(out.ptr()), _is_conj, _scale); },
+        in, out);
 }
 } // namespace arm_compute
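
scale_complex above divides both lanes of a complex element by the scale factor
and optionally flips the sign of the imaginary lane (conjugation). In scalar
form the whole operation reduces to the following (a sketch, not the library's
API):

    #include <complex>

    // Scalar equivalent of scale_complex: b = c / scale, conjugated on request.
    std::complex<float> scale_one(std::complex<float> c, bool is_conjugate, float scale)
    {
        const std::complex<float> b = c / scale;
        return is_conjugate ? std::conj(b) : b;
    }
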
diff --git a/src/core/NEON/kernels/NEFFTScaleKernel.h b/src/core/NEON/kernels/NEFFTScaleKernel.h
index 24a19f9..608cf5e 100644
--- a/src/core/NEON/kernels/NEFFTScaleKernel.h
+++ b/src/core/NEON/kernels/NEFFTScaleKernel.h
@@ -24,10 +24,10 @@
 #ifndef ARM_COMPUTE_NEFFTSCALEKERNEL_H
 #define ARM_COMPUTE_NEFFTSCALEKERNEL_H
 
-#include "src/core/NEON/INEKernel.h"
-
 #include "arm_compute/core/KernelDescriptors.h"
 
+#include "src/core/NEON/INEKernel.h"
+
 namespace arm_compute
 {
 // Forward declarations
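
The include churn in these files (src/core/NEON/... now sorting after
src/core/helpers/...) is consistent with case-insensitive include sorting plus
a blank line after the arm_compute block. The configuration itself is not part
of this delivery, so this is an inference from the diff rather than a confirmed
setting:

    // Before: case-sensitive ordering put "NEON" before "helpers" ('N' < 'h').
    // After: case-insensitive ordering puts helpers/ first ("helpers" < "neon"),
    // separated from the arm_compute block by a blank line.
    #include "arm_compute/core/KernelDescriptors.h"

    #include "src/core/helpers/AutoConfiguration.h"
    #include "src/core/helpers/WindowHelpers.h"
    #include "src/core/NEON/wrapper/wrapper.h"
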
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index 1c7c1f9..00b0c0a 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -30,14 +30,19 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/NEON/kernels/NEFillBorderKernel.h"
+
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/NEFillBorderKernel.h"
 
 namespace arm_compute
 {
 namespace
 {
-inline void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
+inline void fill_constant_value_single_channel_special(ITensor          *tensor,
+                                                       const Window     &window,
+                                                       unsigned int      right,
+                                                       unsigned int      bottom,
+                                                       const PixelValue &constant_border_value)
 {
     float border_value;
     constant_border_value.get(border_value);
@@ -52,39 +57,43 @@
 
     Iterator vertical_it(tensor, vertical);
 
-    execute_window_loop(vertical, [&](const Coordinates &)
-    {
-        const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset());
+    execute_window_loop(
+        vertical,
+        [&](const Coordinates &)
+        {
+            const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset());
 
-        // Fill left and right borders
-        *(row_start - 1) = border_value;
-        std::fill_n(row_start + width, right, border_value);
-    },
-    vertical_it);
+            // Fill left and right borders
+            *(row_start - 1) = border_value;
+            std::fill_n(row_start + width, right, border_value);
+        },
+        vertical_it);
 
     // Top and bottom border
     Iterator plane_it(tensor, window);
 
     // Iterate over all XY planes
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        uint8_t *base_addr = start_valid_region + plane_it.offset();
-        // Top border
-        const auto row_start = reinterpret_cast<float *>(base_addr - stridey);
-        // Fill top rows including left/right borders
-        std::fill_n(row_start - 1, 1 + width + right, border_value);
-
-        // Bottom border
-        const unsigned low_border_size = height + bottom;
-        for(unsigned int i = height; i < low_border_size; ++i)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &)
         {
-            const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey);
-
-            // Fill bottom rows including left/right borders
+            uint8_t *base_addr = start_valid_region + plane_it.offset();
+            // Top border
+            const auto row_start = reinterpret_cast<float *>(base_addr - stridey);
+            // Fill top rows including left/right borders
             std::fill_n(row_start - 1, 1 + width + right, border_value);
-        }
-    },
-    plane_it);
+
+            // Bottom border
+            const unsigned low_border_size = height + bottom;
+            for (unsigned int i = height; i < low_border_size; ++i)
+            {
+                const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey);
+
+                // Fill bottom rows including left/right borders
+                std::fill_n(row_start - 1, 1 + width + right, border_value);
+            }
+        },
+        plane_it);
 }
 } // namespace
 
@@ -93,14 +102,20 @@
 {
 }
 
-void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorderKernel::configure(ITensor          *tensor,
+                                   BorderSize        border_size,
+                                   BorderMode        border_mode,
+                                   const PixelValue &constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
     _tensor = tensor;
     configure(tensor->info(), border_size, border_mode, constant_border_value);
 }
 
-void NEFillBorderKernel::configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorderKernel::configure(ITensorInfo      *tensor,
+                                   BorderSize        border_size,
+                                   BorderMode        border_mode,
+                                   const PixelValue &constant_border_value)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
     //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
@@ -124,7 +139,7 @@
     ARM_COMPUTE_UNUSED(info);
 
     // If there is no border: early exit
-    if(_border_size.empty())
+    if (_border_size.empty())
     {
         return;
     }
@@ -132,13 +147,14 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    switch(_mode)
+    switch (_mode)
     {
         case BorderMode::CONSTANT:
         {
-            if(_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32)
+            if (_border_size.left == 1 && _border_size.top == 1 && _tensor->info()->data_type() == DataType::F32)
             {
-                fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
+                fill_constant_value_single_channel_special(_tensor, window, _border_size.right, _border_size.bottom,
+                                                           _constant_border_value);
             }
             else
             {
@@ -176,46 +192,56 @@
 
     Iterator vertical_it(_tensor, vertical);
 
-    execute_window_loop(vertical, [&](const Coordinates &)
-    {
-        uint8_t *base_addr = start_valid_region + vertical_it.offset();
-        // Fill left and right borders
-        for(unsigned int i = 0; i < _border_size.left; ++i)
+    execute_window_loop(
+        vertical,
+        [&](const Coordinates &)
         {
-            std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(), element_size);
-        }
+            uint8_t *base_addr = start_valid_region + vertical_it.offset();
+            // Fill left and right borders
+            for (unsigned int i = 0; i < _border_size.left; ++i)
+            {
+                std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, vertical_it.ptr(),
+                            element_size);
+            }
 
-        for(unsigned int i = 0; i < _border_size.right; ++i)
-        {
-            std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size, element_size);
-        }
-    },
-    vertical_it);
+            for (unsigned int i = 0; i < _border_size.right; ++i)
+            {
+                std::memcpy(base_addr + (width + i) * element_size, vertical_it.ptr() + (width - 1) * element_size,
+                            element_size);
+            }
+        },
+        vertical_it);
 
     // Top and bottom border
     Iterator plane_it(_tensor, window);
 
     // Iterate over all XY planes
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        uint8_t *base_addr = start_valid_region + plane_it.offset();
-        // Top border
-        for(int i = -_border_size.top; i < 0; ++i)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &)
         {
-            // Copy top rows including left/right borders
-            std::memcpy(base_addr + i * static_cast<int>(_tensor->info()->strides_in_bytes()[1]) - _border_size.left * element_size,
-                        base_addr - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
-        }
+            uint8_t *base_addr = start_valid_region + plane_it.offset();
+            // Top border
+            for (int i = -_border_size.top; i < 0; ++i)
+            {
+                // Copy top rows including left/right borders
+                std::memcpy(base_addr + i * static_cast<int>(_tensor->info()->strides_in_bytes()[1]) -
+                                _border_size.left * element_size,
+                            base_addr - _border_size.left * element_size,
+                            (_border_size.left + width + _border_size.right) * element_size);
+            }
 
-        // Bottom border
-        for(unsigned int i = height; i < height + _border_size.bottom; ++i)
-        {
-            // Copy bottom rows including left/right borders
-            std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
-                        base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size, (_border_size.left + width + _border_size.right) * element_size);
-        }
-    },
-    plane_it);
+            // Bottom border
+            for (unsigned int i = height; i < height + _border_size.bottom; ++i)
+            {
+                // Copy bottom rows including left/right borders
+                std::memcpy(base_addr + i * _tensor->info()->strides_in_bytes()[1] - _border_size.left * element_size,
+                            base_addr + (height - 1) * _tensor->info()->strides_in_bytes()[1] -
+                                _border_size.left * element_size,
+                            (_border_size.left + width + _border_size.right) * element_size);
+            }
+        },
+        plane_it);
 }
 
 void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window)
@@ -232,50 +258,57 @@
 
     Iterator vertical_it(_tensor, vertical);
 
-    execute_window_loop(vertical, [&](const Coordinates &)
-    {
-        uint8_t *base_addr = start_valid_region + vertical_it.offset();
-        // Fill left and right borders
-        for(unsigned int i = 0; i < _border_size.left; ++i)
+    execute_window_loop(
+        vertical,
+        [&](const Coordinates &)
         {
-            std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value, element_size);
-        }
+            uint8_t *base_addr = start_valid_region + vertical_it.offset();
+            // Fill left and right borders
+            for (unsigned int i = 0; i < _border_size.left; ++i)
+            {
+                std::memcpy(base_addr + static_cast<int>(i - _border_size.left) * element_size, &_constant_border_value,
+                            element_size);
+            }
 
-        for(unsigned int i = 0; i < _border_size.right; ++i)
-        {
-            std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size);
-        }
-    },
-    vertical_it);
+            for (unsigned int i = 0; i < _border_size.right; ++i)
+            {
+                std::memcpy(base_addr + (width + i) * element_size, &_constant_border_value, element_size);
+            }
+        },
+        vertical_it);
 
     // Top and bottom border
     Iterator plane_it(_tensor, window);
 
     // Iterate over all XY planes
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        uint8_t *base_addr = start_valid_region + plane_it.offset();
-        // Top border
-        for(int i = -_border_size.top; i < 0; ++i)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &)
         {
-            // Fill top rows including left/right borders
-            for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+            uint8_t *base_addr = start_valid_region + plane_it.offset();
+            // Top border
+            for (int i = -_border_size.top; i < 0; ++i)
             {
-                std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+                // Fill top rows including left/right borders
+                for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+                {
+                    std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size,
+                                &_constant_border_value, element_size);
+                }
             }
-        }
 
-        // Bottom border
-        const unsigned low_border_size = height + _border_size.bottom;
-        for(unsigned int i = height; i < low_border_size; ++i)
-        {
-            // Fill bottom rows including left/right borders
-            for(unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+            // Bottom border
+            const unsigned low_border_size = height + _border_size.bottom;
+            for (unsigned int i = height; i < low_border_size; ++i)
             {
-                std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size, &_constant_border_value, element_size);
+                // Fill bottom rows including left/right borders
+                for (unsigned int j = 0; j < (_border_size.left + width + _border_size.right); ++j)
+                {
+                    std::memcpy(base_addr + i * stridey + static_cast<int>(j - _border_size.left) * element_size,
+                                &_constant_border_value, element_size);
+                }
             }
-        }
-    },
-    plane_it);
+        },
+        plane_it);
 }
 } // namespace arm_compute
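Note on the include changes above: headers are now sorted case-insensitively (which is why src/core/helpers/WindowHelpers.h sorts ahead of src/core/NEON/...) and a blank line now separates the arm_compute/ block from the src/ block. A minimal .clang-format fragment that could plausibly produce this ordering is sketched below; the actual revised configuration is not part of this delivery, so the exact option values are assumptions:

    # Hypothetical sketch, not the shipped config
    SortIncludes:  CaseInsensitive   # 'helpers' sorts before 'NEON' when case is ignored
    IncludeBlocks: Regroup           # merge and re-split includes into blank-line-separated groups
    IncludeCategories:               # illustrative grouping only
      - Regex:    '^"arm_compute/'
        Priority: 1
      - Regex:    '^"src/'
        Priority: 2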
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.h b/src/core/NEON/kernels/NEFillBorderKernel.h
index 2c85158..aaad108 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.h
+++ b/src/core/NEON/kernels/NEFillBorderKernel.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
@@ -64,7 +65,10 @@
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      *
      */
-    void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    void configure(ITensor          *tensor,
+                   BorderSize        border_size,
+                   BorderMode        border_mode,
+                   const PixelValue &constant_border_value = PixelValue());
     /** Initialise the function.
      *
      * @note This kernel fills the borders within the XY-planes.
@@ -75,7 +79,10 @@
      * @param[in]     constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT.
      *
      */
-    void configure(ITensorInfo *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue());
+    void configure(ITensorInfo      *tensor,
+                   BorderSize        border_size,
+                   BorderMode        border_mode,
+                   const PixelValue &constant_border_value = PixelValue());
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
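The configure() overloads above show the declaration style applied throughout the patch: once a signature exceeds the column limit, each parameter goes on its own line with names and default values aligned column-wise, and '*' / '&' bound to the parameter name. Options along the following lines would reproduce it, under the same caveat that the shipped config is not in this delivery:

    # Hypothetical sketch, not the shipped config
    ColumnLimit:                  120    # long lines in this patch wrap near 120
    BinPackParameters:            false  # one parameter per line when wrapping
    AlignConsecutiveDeclarations: true   # aligns parameter names column-wise
    AlignConsecutiveAssignments:  true   # plausibly behind the aligned '=' of default arguments
    PointerAlignment:             Right  # 'ITensor *tensor' rather than 'ITensor* tensor'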
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
index 51a6904..cbe5136 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.cpp
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
-#include "src/cpu/kernels/fuse_batch_normalization/list.h"
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
@@ -30,12 +29,14 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/common/cpuinfo/CpuIsaInfo.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/fuse_batch_normalization/list.h"
 
 #include <map>
 
@@ -52,8 +53,16 @@
 };
 
 using FBNSelectorPtr = std::add_pointer<bool(const FuseBatchNormalizeSelectorData &data)>::type;
-using FBNUKernelPtr  = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, ITensor *,
-                                             const ITensor *, const ITensor *, const ITensor *, const ITensor *, float, const Window &)>::type;
+using FBNUKernelPtr  = std::add_pointer<void(const ITensor *,
+                                            const ITensor *,
+                                            ITensor *,
+                                            ITensor *,
+                                            const ITensor *,
+                                            const ITensor *,
+                                            const ITensor *,
+                                            const ITensor *,
+                                            float,
+                                            const Window &)>::type;
 
 struct FBNUKernel
 {
@@ -62,73 +71,63 @@
     FBNUKernelPtr        ukernel;
 };
 
-static const FBNUKernel available_kernels[] =
-{
-    {
-        "fused_batch_normalization_conv_NHWC_F16",
-        [](const FuseBatchNormalizeSelectorData & data)
-        {
-            return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
-        },
-        REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)
-    },
-    {
-        "fused_batch_normalization_conv_NCHW_F16",
-        [](const FuseBatchNormalizeSelectorData & data)
-        {
-            return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
-        },
-        REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)
-    },
-    {
-        "fused_batch_normalization_dwc_NHWC_F16",
-        [](const FuseBatchNormalizeSelectorData & data)
-        {
-            return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
-        },
-        REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16)
-    },
-    {
-        "fused_batch_normalization_dwc_NCHW_F16",
-        [](const FuseBatchNormalizeSelectorData & data)
-        {
-            return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
-        },
-        REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16)
-    },
-    {
-        "fused_batch_normalization_conv_NHWC_F32",
-        [](const FuseBatchNormalizeSelectorData & data)
-        {
-            return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
-        },
-        REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)
-    },
-    {
-        "fused_batch_normalization_conv_NCHW_F32",
-        [](const FuseBatchNormalizeSelectorData & data)
-        {
-            return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
-        },
-        REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)
-    },
-    {
-        "fused_batch_normalization_dwc_NHWC_F32",
-        [](const FuseBatchNormalizeSelectorData & data)
-        {
-            return data.dt == DataType::F32 && data.dl == DataLayout::NHWC && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
-        },
-        REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32)
-    },
-    {
-        "fused_batch_normalization_dwc_NCHW_F32",
-        [](const FuseBatchNormalizeSelectorData & data)
-        {
-            return data.dt == DataType::F32 && data.dl == DataLayout::NCHW && data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
-        },
-        REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32)
-    }
-};
+static const FBNUKernel available_kernels[] = {
+    {"fused_batch_normalization_conv_NHWC_F16",
+     [](const FuseBatchNormalizeSelectorData &data)
+     {
+         return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 &&
+                data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+     },
+     REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)},
+    {"fused_batch_normalization_conv_NCHW_F16",
+     [](const FuseBatchNormalizeSelectorData &data)
+     {
+         return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 &&
+                data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+     },
+     REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_conv_f16)},
+    {"fused_batch_normalization_dwc_NHWC_F16",
+     [](const FuseBatchNormalizeSelectorData &data)
+     {
+         return data.dt == DataType::F16 && data.dl == DataLayout::NHWC && data.isa.fp16 &&
+                data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+     },
+     REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f16)},
+    {"fused_batch_normalization_dwc_NCHW_F16",
+     [](const FuseBatchNormalizeSelectorData &data)
+     {
+         return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16 &&
+                data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+     },
+     REGISTER_FP16_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f16)},
+    {"fused_batch_normalization_conv_NHWC_F32",
+     [](const FuseBatchNormalizeSelectorData &data)
+     {
+         return data.dt == DataType::F32 && data.dl == DataLayout::NHWC &&
+                data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+     },
+     REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)},
+    {"fused_batch_normalization_conv_NCHW_F32",
+     [](const FuseBatchNormalizeSelectorData &data)
+     {
+         return data.dt == DataType::F32 && data.dl == DataLayout::NCHW &&
+                data.fbn_type == FuseBatchNormalizationType::CONVOLUTION;
+     },
+     REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_conv_f32)},
+    {"fused_batch_normalization_dwc_NHWC_F32",
+     [](const FuseBatchNormalizeSelectorData &data)
+     {
+         return data.dt == DataType::F32 && data.dl == DataLayout::NHWC &&
+                data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+     },
+     REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nhwc_f32)},
+    {"fused_batch_normalization_dwc_NCHW_F32",
+     [](const FuseBatchNormalizeSelectorData &data)
+     {
+         return data.dt == DataType::F32 && data.dl == DataLayout::NCHW &&
+                data.fbn_type == FuseBatchNormalizationType::DEPTHWISECONVOLUTION;
+     },
+     REGISTER_FP32_NEON(arm_compute::cpu::fused_batch_normalization_dwc_nchw_f32)}};
 
 /** Micro-kernel selector
  *
@@ -140,9 +139,9 @@
  */
 const FBNUKernel *get_implementation(const FuseBatchNormalizeSelectorData &data)
 {
-    for(const auto &uk : available_kernels)
+    for (const auto &uk : available_kernels)
     {
-        if(uk.is_selected(data))
+        if (uk.is_selected(data))
         {
             return &uk;
         }
@@ -150,10 +149,16 @@
     return nullptr;
 }
 
-Status validate_arguments(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                          const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                          const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
-                          float epsilon, FuseBatchNormalizationType fbn_type)
+Status validate_arguments(const ITensorInfo         *input_weights,
+                          const ITensorInfo         *bn_mean,
+                          const ITensorInfo         *bn_var,
+                          const ITensorInfo         *fused_weights,
+                          const ITensorInfo         *fused_bias,
+                          const ITensorInfo         *input_bias,
+                          const ITensorInfo         *bn_beta,
+                          const ITensorInfo         *bn_gamma,
+                          float                      epsilon,
+                          FuseBatchNormalizationType fbn_type)
 {
     ARM_COMPUTE_UNUSED(epsilon);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
@@ -164,43 +169,44 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input_bias == nullptr && fused_bias == nullptr);
     ARM_COMPUTE_RETURN_ERROR_ON(bn_mean->num_dimensions() > 1);
 
-    if(fbn_type == FuseBatchNormalizationType::CONVOLUTION)
+    if (fbn_type == FuseBatchNormalizationType::CONVOLUTION)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(3) != bn_mean->dimension(0));
     }
     else
     {
-        const size_t channel_idx = get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t channel_idx =
+            get_data_layout_dimension_index(input_weights->data_layout(), DataLayoutDimension::CHANNEL);
         ARM_COMPUTE_RETURN_ERROR_ON(input_weights->dimension(channel_idx) != bn_mean->dimension(0));
     }
     // Validate bias
-    if(input_bias != nullptr)
+    if (input_bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, input_bias);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, input_bias);
     }
     // Validate beta
-    if(bn_beta != nullptr)
+    if (bn_beta != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_beta);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_beta);
     }
     // Validate gamma
-    if(bn_gamma != nullptr)
+    if (bn_gamma != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, bn_gamma);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, bn_gamma);
     }
 
     // Validate output weights
-    if(fused_weights != nullptr && fused_weights->total_size() != 0)
+    if (fused_weights != nullptr && fused_weights->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_weights, fused_weights);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input_weights, fused_weights);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_weights);
     }
     // Validate output bias
-    if(fused_bias != nullptr && fused_bias->total_size() != 0)
+    if (fused_bias != nullptr && fused_bias->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mean, fused_bias);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_weights, fused_bias);
@@ -212,15 +218,31 @@
 } // namespace
 
 NEFuseBatchNormalizationKernel::NEFuseBatchNormalizationKernel()
-    : _input_weights(nullptr), _input_bias(nullptr), _bn_mean(nullptr), _bn_var(nullptr), _bn_gamma(nullptr), _bn_beta(nullptr), _fused_weights(nullptr), _fused_bias(nullptr), _epsilon(),
-      _run_in_place_weights(false), _run_in_place_bias(false), _func(nullptr)
+    : _input_weights(nullptr),
+      _input_bias(nullptr),
+      _bn_mean(nullptr),
+      _bn_var(nullptr),
+      _bn_gamma(nullptr),
+      _bn_beta(nullptr),
+      _fused_weights(nullptr),
+      _fused_bias(nullptr),
+      _epsilon(),
+      _run_in_place_weights(false),
+      _run_in_place_bias(false),
+      _func(nullptr)
 {
 }
 
-void NEFuseBatchNormalizationKernel::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var,
-                                               ITensor *fused_weights, ITensor *fused_bias,
-                                               const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
-                                               float epsilon, FuseBatchNormalizationType fbn_type)
+void NEFuseBatchNormalizationKernel::configure(const ITensor             *input_weights,
+                                               const ITensor             *bn_mean,
+                                               const ITensor             *bn_var,
+                                               ITensor                   *fused_weights,
+                                               ITensor                   *fused_bias,
+                                               const ITensor             *input_bias,
+                                               const ITensor             *bn_beta,
+                                               const ITensor             *bn_gamma,
+                                               float                      epsilon,
+                                               FuseBatchNormalizationType fbn_type)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input_weights, bn_mean, bn_var);
 
@@ -238,27 +260,27 @@
     _run_in_place_bias    = (fused_bias == nullptr) || (input_bias != nullptr && fused_bias == input_bias);
 
     // Auto initialize outputs
-    if(_fused_weights != nullptr)
+    if (_fused_weights != nullptr)
     {
         // Output tensor auto initialization if not yet initialized
         auto_init_if_empty(*_fused_weights->info(), *_input_weights->info()->clone());
     }
-    if(_fused_bias != nullptr)
+    if (_fused_bias != nullptr)
     {
         // Output tensor auto initialization if not yet initialized
         auto_init_if_empty(*_fused_bias->info(), *_bn_mean->info()->clone());
     }
 
     // Validate arguments
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_weights->info(), bn_mean->info(), bn_var->info(),
-                                                  (fused_weights != nullptr) ? fused_weights->info() : nullptr,
-                                                  (fused_bias != nullptr) ? fused_bias->info() : nullptr,
-                                                  (input_bias != nullptr) ? input_bias->info() : nullptr,
-                                                  (bn_beta != nullptr) ? bn_beta->info() : nullptr,
-                                                  (bn_gamma != nullptr) ? bn_gamma->info() : nullptr,
-                                                  epsilon, fbn_type));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+        input_weights->info(), bn_mean->info(), bn_var->info(),
+        (fused_weights != nullptr) ? fused_weights->info() : nullptr,
+        (fused_bias != nullptr) ? fused_bias->info() : nullptr, (input_bias != nullptr) ? input_bias->info() : nullptr,
+        (bn_beta != nullptr) ? bn_beta->info() : nullptr, (bn_gamma != nullptr) ? bn_gamma->info() : nullptr, epsilon,
+        fbn_type));
 
-    const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{ input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa() });
+    const auto *uk = get_implementation(FuseBatchNormalizeSelectorData{
+        input_weights->info()->data_type(), input_weights->info()->data_layout(), fbn_type, CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
     _func = uk->ukernel;
@@ -268,12 +290,19 @@
     INEKernel::configure(win);
 }
 
-Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                                                const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                                                const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
-                                                float epsilon, FuseBatchNormalizationType fbn_type)
+Status NEFuseBatchNormalizationKernel::validate(const ITensorInfo         *input_weights,
+                                                const ITensorInfo         *bn_mean,
+                                                const ITensorInfo         *bn_var,
+                                                const ITensorInfo         *fused_weights,
+                                                const ITensorInfo         *fused_bias,
+                                                const ITensorInfo         *input_bias,
+                                                const ITensorInfo         *bn_beta,
+                                                const ITensorInfo         *bn_gamma,
+                                                float                      epsilon,
+                                                FuseBatchNormalizationType fbn_type)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+                                                   input_bias, bn_beta, bn_gamma, epsilon, fbn_type));
     return Status{};
 }
 
@@ -284,6 +313,7 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
-    (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon, window);
+    (*_func)(_input_weights, _input_bias, _fused_weights, _fused_bias, _bn_mean, _bn_var, _bn_beta, _bn_gamma, _epsilon,
+             window);
 }
 } // namespace arm_compute
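Two further rules are visible in this file: the NEFuseBatchNormalizationKernel default constructor now lists one member initializer per line, and re-wrapped call sites such as the validate_arguments(...) invocation pack as many arguments per line as fit within the limit. The former matches the clang-format 14 option sketched below; the latter is the tool's default argument packing and needs no dedicated setting:

    # Hypothetical sketch, not the shipped config
    PackConstructorInitializers: Never   # every initializer on its own line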
diff --git a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
index ee767b0..f23280d 100644
--- a/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h
@@ -66,9 +66,16 @@
      * @param[in]  epsilon       (Optional) Batch normalization layer epsilon parameter. Defaults to 0.001f.
      * @param[in]  fbn_type      (Optional) Fused batch normalization type. Defaults to CONVOLUTION.
      */
-    void configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var, ITensor *fused_weights, ITensor *fused_bias,
-                   const ITensor *input_bias = nullptr, const ITensor *bn_beta = nullptr, const ITensor *bn_gamma = nullptr,
-                   float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+    void configure(const ITensor             *input_weights,
+                   const ITensor             *bn_mean,
+                   const ITensor             *bn_var,
+                   ITensor                   *fused_weights,
+                   ITensor                   *fused_bias,
+                   const ITensor             *input_bias = nullptr,
+                   const ITensor             *bn_beta    = nullptr,
+                   const ITensor             *bn_gamma   = nullptr,
+                   float                      epsilon    = 0.001f,
+                   FuseBatchNormalizationType fbn_type   = FuseBatchNormalizationType::CONVOLUTION);
     /** Static function to check if given info will lead to a valid configuration of @ref NEFuseBatchNormalizationKernel
      *
      * @param[in] input_weights Input weights tensor info for convolution or depthwise convolution layer. Data type supported: F16/F32. Data layout supported: NCHW, NHWC
@@ -86,10 +93,16 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                           const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                           const ITensorInfo *input_bias = nullptr, const ITensorInfo *bn_beta = nullptr, const ITensorInfo *bn_gamma = nullptr,
-                           float epsilon = 0.001f, FuseBatchNormalizationType fbn_type = FuseBatchNormalizationType::CONVOLUTION);
+    static Status validate(const ITensorInfo         *input_weights,
+                           const ITensorInfo         *bn_mean,
+                           const ITensorInfo         *bn_var,
+                           const ITensorInfo         *fused_weights,
+                           const ITensorInfo         *fused_bias,
+                           const ITensorInfo         *input_bias = nullptr,
+                           const ITensorInfo         *bn_beta    = nullptr,
+                           const ITensorInfo         *bn_gamma   = nullptr,
+                           float                      epsilon    = 0.001f,
+                           FuseBatchNormalizationType fbn_type   = FuseBatchNormalizationType::CONVOLUTION);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -107,8 +120,16 @@
     bool           _run_in_place_weights;
     bool           _run_in_place_bias;
 
-    using FuseBatchNormFunction = void(const ITensor *input_weights, const ITensor *input_bias, ITensor *fused_weights, ITensor *fused_bias,
-                                       const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window);
+    using FuseBatchNormFunction = void(const ITensor *input_weights,
+                                       const ITensor *input_bias,
+                                       ITensor       *fused_weights,
+                                       ITensor       *fused_bias,
+                                       const ITensor *bn_mean,
+                                       const ITensor *bn_var,
+                                       const ITensor *bn_beta,
+                                       const ITensor *bn_gamma,
+                                       float          epsilon,
+                                       const Window  &window);
 
     FuseBatchNormFunction *_func;
 };
diff --git a/src/core/NEON/kernels/NEGatherKernel.cpp b/src/core/NEON/kernels/NEGatherKernel.cpp
index 11332ff..f1d457d 100644
--- a/src/core/NEON/kernels/NEGatherKernel.cpp
+++ b/src/core/NEON/kernels/NEGatherKernel.cpp
@@ -27,9 +27,10 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -42,20 +43,22 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, indices, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
 
-    if(axis < 0)
+    if (axis < 0)
     {
         axis += input->num_dimensions();
     }
 
     ARM_COMPUTE_RETURN_ERROR_ON(0 > axis || axis >= static_cast<int32_t>(input->num_dimensions()));
-    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 > Coordinates::num_max_dimensions);
+    ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() + indices->num_dimensions() - 1 >
+                                Coordinates::num_max_dimensions);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
-        TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->tensor_shape(), indices->tensor_shape(), axis);
+        TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+            input->tensor_shape(), indices->tensor_shape(), axis);
         ARM_COMPUTE_RETURN_ERROR_ON(output_shape.total_size() != output->tensor_shape().total_size());
     }
 
@@ -81,23 +84,23 @@
     const auto idx_info = _indices->info();
     const auto dst_info = _output->info();
 
-    const auto num_dims = dst_info->num_dimensions();
+    const auto num_dims     = dst_info->num_dimensions();
     const auto chunk_stride = src_info->strides_in_bytes()[_axis];
 
     const auto window_start_x = window.x().start();
-    const auto window_end_x = window.x().end();
-    auto window_size_x = src_info->element_size();
+    const auto window_end_x   = window.x().end();
+    auto       window_size_x  = src_info->element_size();
 
     const auto idx_limit = static_cast<TIndex>(src_info->tensor_shape()[_axis]);
 
-    if(_axis != 0)
+    if (_axis != 0)
     {
         dst_win.set(0, Window::Dimension(window_start_x, window_start_x + 1, 1));
         window_size_x *= window_end_x - window_start_x;
     }
 
     // Compute source and index tensors window based on the output window.
-    auto src_win = dst_win;
+    auto   src_win = dst_win;
     Window idx_win;
 
     for (size_t i = 0; i < idx_info->num_dimensions(); ++i)
@@ -109,22 +112,27 @@
     // Use the custom strides to access all three tensors using the same loop.
     Iterator src_it(num_dims, _src_it_strides, _input->buffer(), src_info->offset_first_element_in_bytes(), src_win);
     Iterator idx_it(num_dims, _idx_it_strides, _indices->buffer(), idx_info->offset_first_element_in_bytes(), idx_win);
-    Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(), dst_info->offset_first_element_in_bytes(), dst_win);
+    Iterator dst_it(num_dims, dst_info->strides_in_bytes(), _output->buffer(),
+                    dst_info->offset_first_element_in_bytes(), dst_win);
 
-    execute_window_loop(dst_win, [&](const Coordinates &) {
-        const auto idx = *reinterpret_cast<const TIndex *>(idx_it.ptr());
-
-        if(idx >= 0 && idx < idx_limit)
+    execute_window_loop(
+        dst_win,
+        [&](const Coordinates &)
         {
-            const auto src_ptr = src_it.ptr() + idx * chunk_stride;
+            const auto idx = *reinterpret_cast<const TIndex *>(idx_it.ptr());
 
-            std::copy_n(src_ptr, window_size_x, dst_it.ptr());
-        }
-        else
-        {
-            std::fill_n(dst_it.ptr(), window_size_x, 0);
-        }
-    }, src_it, idx_it, dst_it);
+            if (idx >= 0 && idx < idx_limit)
+            {
+                const auto src_ptr = src_it.ptr() + idx * chunk_stride;
+
+                std::copy_n(src_ptr, window_size_x, dst_it.ptr());
+            }
+            else
+            {
+                std::fill_n(dst_it.ptr(), window_size_x, 0);
+            }
+        },
+        src_it, idx_it, dst_it);
 }
 
 void NEGatherKernel::configure(const ITensor *input, const ITensor *indices, ITensor *output, int axis)
@@ -137,13 +145,13 @@
     _output  = output;
     _axis    = axis;
 
-    if(_axis < 0)
+    if (_axis < 0)
     {
         _axis += input->info()->num_dimensions();
     }
     ARM_COMPUTE_ERROR_ON(0 > _axis || _axis >= static_cast<int32_t>(input->info()->num_dimensions()));
 
-    switch(_indices->info()->data_type())
+    switch (_indices->info()->data_type())
     {
         case DataType::U32:
             _func = &NEGatherKernel::gather_common<uint32_t>;
@@ -157,7 +165,8 @@
     }
 
     // Output auto initialization if not yet initialized
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_gather_shape(
+        input->info()->tensor_shape(), indices->info()->tensor_shape(), _axis);
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     // Create window
@@ -169,30 +178,31 @@
     // These will be used to iterate lock-step through all tensors (input, indices and output).
     size_t dim_no = 0;
 
-    const auto input_info = input->info();
+    const auto  input_info    = input->info();
     const auto &input_strides = input_info->strides_in_bytes();
 
-    const auto indices_info = indices->info();
-    const auto &indices_strides = indices_info->strides_in_bytes();
-    const auto indices_num_dims = indices_info->num_dimensions();
+    const auto  indices_info     = indices->info();
+    const auto &indices_strides  = indices_info->strides_in_bytes();
+    const auto  indices_num_dims = indices_info->num_dimensions();
 
-    for(; dim_no < static_cast<size_t>(_axis); ++dim_no)
+    for (; dim_no < static_cast<size_t>(_axis); ++dim_no)
     {
         _src_it_strides[dim_no] = input_strides[dim_no];
     }
 
-    for(; dim_no < static_cast<size_t>(_axis) + indices_num_dims; ++dim_no)
+    for (; dim_no < static_cast<size_t>(_axis) + indices_num_dims; ++dim_no)
     {
         _idx_it_strides[dim_no] = indices_strides[dim_no - _axis];
     }
 
-    for(; dim_no < Coordinates::num_max_dimensions; ++dim_no)
+    for (; dim_no < Coordinates::num_max_dimensions; ++dim_no)
     {
         _src_it_strides[dim_no] = input_strides[dim_no - indices_num_dims + 1];
     }
 }
 
-Status NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
+Status
+NEGatherKernel::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, int axis)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, indices, output, axis));
     return Status{};
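The execute_window_loop rewrites in this file show how clang-format 14 handles a multi-line lambda argument: it breaks before the first argument and indents the lambda body from the call's continuation, while the trailing iterator arguments (src_it, idx_it, dst_it) stay packed on one line. The Status / NEGatherKernel::validate split at the end of the hunk is the usual break-after-return-type fallback for declarations that would otherwise exceed the limit; a low PenaltyReturnTypeOnItsOwnLine favours it, though the configured value here is unknown. Lambda-related options consistent with the output:

    # Hypothetical sketch, not the shipped config
    AllowShortLambdasOnASingleLine: All        # one-line selector lambdas stay inline
    LambdaBodyIndentation:          Signature  # body indents from the lambda signature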
diff --git a/src/core/NEON/kernels/NEGatherKernel.h b/src/core/NEON/kernels/NEGatherKernel.h
index ce69dae..b8c069f 100644
--- a/src/core/NEON/kernels/NEGatherKernel.h
+++ b/src/core/NEON/kernels/NEGatherKernel.h
@@ -26,6 +26,7 @@
 #define ARM_COMPUTE_NEGATHERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
@@ -92,8 +93,8 @@
     ITensor       *_output;
     kernel_ptr     _func;
 
-    Strides        _src_it_strides;
-    Strides        _idx_it_strides;
+    Strides _src_it_strides;
+    Strides _idx_it_strides;
 };
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEGATHERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
index 7bba136..549319e 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp
@@ -27,11 +27,13 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/genproposals/list.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
@@ -44,7 +46,8 @@
 };
 
 using ComputeAllAnchorsSelectorPtr = std::add_pointer<bool(const ComputeAllAnchorsData &data)>::type;
-using ComputeAllAnchorsUKernelPtr  = std::add_pointer<void(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type;
+using ComputeAllAnchorsUKernelPtr  = std::add_pointer<void(
+    const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)>::type;
 
 struct ComputeAllAnchorsKernel
 {
@@ -53,27 +56,17 @@
     ComputeAllAnchorsUKernelPtr        ukernel;
 };
 
-static const ComputeAllAnchorsKernel available_kernels[] =
-{
+static const ComputeAllAnchorsKernel available_kernels[] = {
 #if defined(ARM_COMPUTE_ENABLE_NEON)
-    {
-        "neon_qu16_computeallanchors",
-        [](const ComputeAllAnchorsData & data) { return data.dt == DataType::QSYMM16; },
-        REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)
-    },
+    {"neon_qu16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::QSYMM16; },
+     REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qu16_computeallanchors)},
 #endif //defined(ARM_COMPUTE_ENABLE_NEON)
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    {
-        "neon_fp16_computeallanchors",
-        [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)
-    },
+    {"neon_fp16_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_computeallanchors)},
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    {
-        "neon_fp32_computeallanchors",
-        [](const ComputeAllAnchorsData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)
-    },
+    {"neon_fp32_computeallanchors", [](const ComputeAllAnchorsData &data) { return data.dt == DataType::F32; },
+     REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_computeallanchors)},
 };
 
 /** Micro-kernel selector
@@ -84,9 +77,9 @@
  */
 const ComputeAllAnchorsKernel *get_implementation(const ComputeAllAnchorsData &data)
 {
-    for(const auto &uk : available_kernels)
+    for (const auto &uk : available_kernels)
     {
-        if(uk.is_selected(data))
+        if (uk.is_selected(data))
         {
             return &uk;
         }
@@ -101,7 +94,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(anchors->dimension(0) != info.values_per_roi());
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(anchors, DataType::QSYMM16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(anchors->num_dimensions() > 2);
-    if(all_anchors->total_size() > 0)
+    if (all_anchors->total_size() > 0)
     {
         const size_t feature_height = info.feat_height();
         const size_t feature_width  = info.feat_width();
@@ -111,7 +104,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(0) != info.values_per_roi());
         ARM_COMPUTE_RETURN_ERROR_ON(all_anchors->dimension(1) != feature_height * feature_width * num_anchors);
 
-        if(is_data_type_quantized(anchors->data_type()))
+        if (is_data_type_quantized(anchors->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(anchors, all_anchors);
         }
@@ -139,7 +132,8 @@
 
     // Initialize the output if empty
     const TensorShape output_shape(info.values_per_roi(), width * height * num_anchors);
-    auto_init_if_empty(*all_anchors->info(), TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
+    auto_init_if_empty(*all_anchors->info(),
+                       TensorInfo(output_shape, 1, data_type, anchors->info()->quantization_info()));
 
     // Set instance variables
     _anchors      = anchors;
@@ -151,7 +145,9 @@
     INEKernel::configure(win);
 }
 
-Status NEComputeAllAnchorsKernel::validate(const ITensorInfo *anchors, const ITensorInfo *all_anchors, const ComputeAnchorsInfo &info)
+Status NEComputeAllAnchorsKernel::validate(const ITensorInfo        *anchors,
+                                           const ITensorInfo        *all_anchors,
+                                           const ComputeAnchorsInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(anchors, all_anchors, info));
     return Status{};
@@ -163,7 +159,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const auto *uk = get_implementation(ComputeAllAnchorsData{ _anchors->info()->data_type() });
+    const auto *uk = get_implementation(ComputeAllAnchorsData{_anchors->info()->data_type()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     uk->ukernel(_anchors, _all_anchors, _anchors_info, window);
diff --git a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
index 297d6d4..30699ee 100644
--- a/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
+++ b/src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h
@@ -78,5 +78,5 @@
     ITensor           *_all_anchors;
     ComputeAnchorsInfo _anchors_info;
 };
-} // arm_compute
+} // namespace arm_compute
 #endif // ARM_COMPUTE_NEGENERATEPROPOSALSLAYERKERNEL_H
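Besides compacting the kernel selector tables above into braced initializer lists without inner padding (e.g. ComputeAllAnchorsData{_anchors->info()->data_type()}), the formatter also normalised the namespace-closing comment: '} // arm_compute' became '} // namespace arm_compute'. Likely options, with the usual caveat that the real config is not shipped here:

    # Hypothetical sketch, not the shipped config
    Cpp11BracedListStyle: true   # braced lists formatted like call parentheses, no inner padding
    FixNamespaceComments: true   # adds or fixes '} // namespace ...' end comments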
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
index 7164140..0a1780f 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp
@@ -31,12 +31,13 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/kernels/instancenorm/list.h"
 
 #include <arm_neon.h>
@@ -51,7 +52,13 @@
 };
 
 using InstanceNormSelctorPtr = std::add_pointer<bool(const InstanceNormSelectorData &data)>::type;
-using InstanceNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)>::type;
+using InstanceNormUKernelPtr = std::add_pointer<void(ITensor      *input,
+                                                     ITensor      *output,
+                                                     float         gamma,
+                                                     float         beta,
+                                                     float         epsilon,
+                                                     bool          use_mixed_precision,
+                                                     const Window &window)>::type;
 
 struct InstanceNormKernel
 {
@@ -60,19 +67,12 @@
     InstanceNormUKernelPtr       ukernel;
 };
 
-static const InstanceNormKernel available_kernels[] =
-{
-    {
-        "fp32_neon_instancenorm",
-        [](const InstanceNormSelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)
-    },
+static const InstanceNormKernel available_kernels[] = {
+    {"fp32_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F32; },
+     REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_instancenorm)},
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    {
-        "fp16_neon_instancenorm",
-        [](const InstanceNormSelectorData & data) { return data.dt == DataType::F16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)
-    },
+    {"fp16_neon_instancenorm", [](const InstanceNormSelectorData &data) { return data.dt == DataType::F16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_instancenorm)},
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 };
 
@@ -84,9 +84,9 @@
  */
 const InstanceNormKernel *get_implementation(const InstanceNormSelectorData &data)
 {
-    for(const auto &uk : available_kernels)
+    for (const auto &uk : available_kernels)
     {
-        if(uk.is_selected(data))
+        if (uk.is_selected(data))
         {
             return &uk;
         }
@@ -102,14 +102,16 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(epsilon == 0.f, "Epsilon must be different than 0");
 
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC, "NHWC data layout is not supported by the kernel directly");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() == DataLayout::NHWC,
+                                    "NHWC data layout is not supported by the kernel directly");
 
-    if(output != nullptr && output->total_size() != 0)
+    if (output != nullptr && output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(), "Input and output have different number of channels");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_channels() != output->num_channels(),
+                                        "Input and output have different number of channels");
     }
     return Status{};
 }
@@ -132,7 +134,9 @@
 {
 }
 
-void NEInstanceNormalizationLayerKernel::configure(ITensor *input, ITensor *output, const InstanceNormalizationLayerKernelInfo &info)
+void NEInstanceNormalizationLayerKernel::configure(ITensor                                    *input,
+                                                   ITensor                                    *output,
+                                                   const InstanceNormalizationLayerKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
@@ -152,10 +156,13 @@
     INEKernel::configure(std::get<1>(win_config));
 }
 
-Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info)
+Status NEInstanceNormalizationLayerKernel::validate(const ITensorInfo                          *input,
+                                                    const ITensorInfo                          *output,
+                                                    const InstanceNormalizationLayerKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, info.gamma, info.beta, info.epsilon));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
+    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(
+        input->clone().get(), (output == nullptr ? input->clone().get() : output->clone().get()))));
     return Status{};
 }
 
@@ -165,7 +172,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    const auto *uk = get_implementation(InstanceNormSelectorData{ _input->info()->data_type() });
+    const auto *uk = get_implementation(InstanceNormSelectorData{_input->info()->data_type()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     uk->ukernel(_input, _output, _gamma, _beta, _epsilon, _use_mixed_precision, window);
diff --git a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
index f166ce2..024ccd9 100644
--- a/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h
@@ -68,7 +68,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const InstanceNormalizationLayerKernelInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -82,14 +83,15 @@
      * @param[in]      beta    The offset scalar value applied to the normalized tensor. Defaults to 0.0
      * @param[in]      epsilon Lower bound value for the normalization. Defaults to 1e-12
      */
-    using NormalizationFunction = void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+    using NormalizationFunction =
+        void(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
 
     ITensor *_input;
     ITensor *_output;
     float    _gamma;
     float    _beta;
     float    _epsilon;
-    bool     _use_mixed_precision{ true };
+    bool     _use_mixed_precision{true};
 };
 } // namespace arm_compute
 #endif /*ARM_COMPUTE_NEINSTANCENORMALIZATIONLAYERKERNEL_H */
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
index 8ab0288..eea57a1 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp
@@ -30,11 +30,12 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/common/cpuinfo/CpuIsaInfo.h"
-#include "src/core/NEON/NEMath.h"
 #include "src/core/common/Registrars.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
 #include "src/cpu/kernels/l2normlayer/list.h"
 
 #include <arm_neon.h>
@@ -55,7 +56,8 @@
 
 using L2NormalizeLayerKernelSelctorPtr = std::add_pointer<bool(const L2NormalizeLayerSelectorData &data)>::type;
 
-using L2NormalizeLayerPtr = std::add_pointer<void(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)>::type;
+using L2NormalizeLayerPtr = std::add_pointer<void(
+    const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)>::type;
 
 struct L2NormalizeLayerKernel
 {
@@ -64,26 +66,25 @@
     L2NormalizeLayerPtr                    ukernel;
 };
 
-static const L2NormalizeLayerKernel available_kernels[] =
-{
-    {
-        "fp32_neon_l2normalize_x",
-        [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x)
-    },
-    {
-        "fp32_neon_l2normalize_yz",
-        [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz)
-    },
+static const L2NormalizeLayerKernel available_kernels[] = {
+    {"fp32_neon_l2normalize_x",
+     [](const L2NormalizeLayerSelectorData &data)
+     { return data.dt == DataType::F32 && data.actual_axis == Window::DimX; },
+     REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_x)},
+    {"fp32_neon_l2normalize_yz",
+     [](const L2NormalizeLayerSelectorData &data)
+     { return data.dt == DataType::F32 && data.actual_axis != Window::DimX; },
+     REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_l2_normalize_yz)},
     {
         "fp16_neon_l2normalize_x",
-        [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; },
+        [](const L2NormalizeLayerSelectorData &data)
+        { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis == Window::DimX; },
         REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_x),
     },
     {
         "fp16_neon_l2normalize_yz",
-        [](const L2NormalizeLayerSelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; },
+        [](const L2NormalizeLayerSelectorData &data)
+        { return data.dt == DataType::F16 && data.isa.fp16 && data.actual_axis != Window::DimX; },
         REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_l2_normalize_yz),
     },
 };
@@ -96,9 +97,9 @@
  */
 const L2NormalizeLayerKernel *get_implementation(const L2NormalizeLayerSelectorData &data)
 {
-    for(const auto &uk : available_kernels)
+    for (const auto &uk : available_kernels)
     {
-        if(uk.is_selected(data))
+        if (uk.is_selected(data))
         {
             return &uk;
         }
@@ -106,7 +107,8 @@
     return nullptr;
 }
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
 {
     ARM_COMPUTE_UNUSED(epsilon);
 
@@ -115,14 +117,15 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, sum);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis > 2, "Actual axis greater than 2 is not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions, "Actual normalization axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(actual_axis >= TensorShape::num_max_dimensions,
+                                    "Actual normalization axis greater than max number of dimensions");
 
     // Reduce shape on axis
     TensorShape sum_shape = input->tensor_shape();
     sum_shape.set(actual_axis, 1);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(sum->tensor_shape(), sum_shape);
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -151,7 +154,8 @@
 {
 }
 
-void NEL2NormalizeLayerKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon)
+void NEL2NormalizeLayerKernel::configure(
+    const ITensor *input, const ITensor *sum, ITensor *output, int axis, float epsilon)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, sum, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), sum->info(), output->info(), axis, epsilon));
@@ -169,10 +173,12 @@
     INEKernel::configure(std::get<1>(win_config));
 }
 
-Status NEL2NormalizeLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
+Status NEL2NormalizeLayerKernel::validate(
+    const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, sum, output, axis, epsilon));
-    ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        std::get<0>(validate_and_configure_window(input->clone().get(), output->clone().get())));
 
     return Status{};
 }
@@ -183,12 +189,13 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    if(_actual_axis > 2)
+    if (_actual_axis > 2)
     {
         ARM_COMPUTE_ERROR("Unsupported normalization axis");
     }
 
-    const auto *uk = get_implementation(L2NormalizeLayerSelectorData{ _output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa() });
+    const auto *uk = get_implementation(
+        L2NormalizeLayerSelectorData{_output->info()->data_type(), _actual_axis, CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr);
     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
 
diff --git a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
index af3ad34..3524e66 100644
--- a/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
+++ b/src/core/NEON/kernels/NEL2NormalizeLayerKernel.h
@@ -74,7 +74,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *sum, const ITensorInfo *output, int axis, float epsilon);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NELogicalKernel.cpp b/src/core/NEON/kernels/NELogicalKernel.cpp
index 6939e08..6be6284 100644
--- a/src/core/NEON/kernels/NELogicalKernel.cpp
+++ b/src/core/NEON/kernels/NELogicalKernel.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/common/utils/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -50,7 +51,7 @@
     ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1);
     ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
 
-    for(; len >= step; len -= step)
+    for (; len >= step; len -= step)
     {
         vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
         src0 += step;
@@ -58,7 +59,7 @@
         dst += step;
     }
 
-    for(; len >= half_step; len -= half_step)
+    for (; len >= half_step; len -= half_step)
     {
         vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
         src0 += half_step;
@@ -66,7 +67,7 @@
         dst += half_step;
     }
 
-    for(; len > 0; --len)
+    for (; len > 0; --len)
     {
         *dst = (*src0) && (*src1);
         ++src0;
@@ -84,21 +85,21 @@
     const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
     const auto broadcast_val_clamped_x8  = vdup_n_u8(broadcast_val_clamped_s);
 
-    for(; len >= step; len -= step)
+    for (; len >= step; len -= step)
     {
         vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
         src += step;
         dst += step;
     }
 
-    for(; len >= half_step; len -= half_step)
+    for (; len >= half_step; len -= half_step)
     {
         vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
         src += half_step;
         dst += half_step;
     }
 
-    for(; len > 0; --len)
+    for (; len > 0; --len)
     {
         *dst = (*src) && broadcast_val_clamped_s;
         ++src;
@@ -112,7 +113,7 @@
     ARM_COMPUTE_ASSERT_NOT_NULLPTR(src1);
     ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
 
-    for(; len >= step; len -= step)
+    for (; len >= step; len -= step)
     {
         vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
         src0 += step;
@@ -120,7 +121,7 @@
         dst += step;
     }
 
-    for(; len >= half_step; len -= half_step)
+    for (; len >= half_step; len -= half_step)
     {
         vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
         src0 += half_step;
@@ -128,7 +129,7 @@
         dst += half_step;
     }
 
-    for(; len > 0; --len)
+    for (; len > 0; --len)
     {
         *dst = (*src0) || (*src1);
         ++src0;
@@ -146,21 +147,21 @@
     const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
     const auto broadcast_val_clamped_x8  = vdup_n_u8(broadcast_val_clamped_s);
 
-    for(; len >= step; len -= step)
+    for (; len >= step; len -= step)
     {
         vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
         src += step;
         dst += step;
     }
 
-    for(; len >= half_step; len -= half_step)
+    for (; len >= half_step; len -= half_step)
     {
         vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
         src += half_step;
         dst += half_step;
     }
 
-    for(; len > 0; --len)
+    for (; len > 0; --len)
     {
         *dst = (*src) || broadcast_val_clamped_s;
         ++src;
@@ -173,21 +174,21 @@
     ARM_COMPUTE_ASSERT_NOT_NULLPTR(src);
     ARM_COMPUTE_ASSERT_NOT_NULLPTR(dst);
 
-    for(; len >= step; len -= step)
+    for (; len >= step; len -= step)
     {
         vst1q_u8(dst, vbslq_u8(vceqq_u8(vld1q_u8(src), c0_x16), c1_x16, c0_x16));
         src += step;
         dst += step;
     }
 
-    for(; len >= half_step; len -= half_step)
+    for (; len >= half_step; len -= half_step)
     {
         vst1_u8(dst, vbsl_u8(vceq_u8(vld1_u8(src), c0_x8), c1_x8, c0_x8));
         src += half_step;
         dst += half_step;
     }
 
-    for(; len > 0; --len)
+    for (; len > 0; --len)
     {
         *dst = !(*src);
         ++src;
@@ -197,18 +198,15 @@
 
 void run_unary(const Window &window, const ITensor *src, ITensor *dst)
 {
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
     const auto len = window.x().end() - window.x().start();
 
     Iterator in(src, win);
     Iterator out(dst, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        neon_logical_not(in.ptr(), out.ptr(), len);
-    },
-    in, out);
+    execute_window_loop(
+        win, [&](const Coordinates &) { neon_logical_not(in.ptr(), out.ptr(), len); }, in, out);
 }
 
 void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, ITensor *dst, LogicalOperation op)
@@ -216,16 +214,17 @@
     Window src0_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
     Window src1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
     const auto len                   = window.x().end() - window.x().start();
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
-        using LogicalBroadcastUKernelPtr        = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, uint32_t)>::type;
-        LogicalBroadcastUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast;
+        using LogicalBroadcastUKernelPtr = std::add_pointer<void(const uint8_t *, uint8_t, uint8_t *, uint32_t)>::type;
+        LogicalBroadcastUKernelPtr logical_func =
+            op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast;
 
         const bool     is_broadcast_input_1 = src1_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_1 ? src1_win : src0_win;
@@ -238,17 +237,18 @@
         Iterator non_broadcast_in(non_broadcast_tensor, non_broadcast_win);
         Iterator out(dst, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const uint8_t broadcast_value = *broadcast_in.ptr();
-            logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len);
-
-        },
-        broadcast_in, non_broadcast_in, out);
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
+            {
+                const uint8_t broadcast_value = *broadcast_in.ptr();
+                logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len);
+            },
+            broadcast_in, non_broadcast_in, out);
     }
     else
     {
-        using LogicalUKernelPtr        = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, uint32_t)>::type;
+        using LogicalUKernelPtr = std::add_pointer<void(const uint8_t *, const uint8_t *, uint8_t *, uint32_t)>::type;
         LogicalUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or : &neon_logical_and;
 
         src0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -257,11 +257,8 @@
         Iterator in0(src0, src0_win);
         Iterator in1(src1, src1_win);
         Iterator out(dst, win);
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            logical_func(in0.ptr(), in1.ptr(), out.ptr(), len);
-        },
-        in0, in1, out);
+        execute_window_loop(
+            win, [&](const Coordinates &) { logical_func(in0.ptr(), in1.ptr(), out.ptr(), len); }, in0, in1, out);
     }
 }
 } // namespace
@@ -270,7 +267,10 @@
     return "NELogicalKernel";
 }
 
-void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op)
+void NELogicalKernel::configure(const ITensorInfo *input1,
+                                const ITensorInfo *input2,
+                                ITensorInfo       *output,
+                                LogicalOperation   op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate(input1, input2, output, op));
@@ -279,7 +279,7 @@
 
     Window      win       = calculate_max_window(*input1, Steps());
     TensorShape out_shape = input1->tensor_shape();
-    if(op != LogicalOperation::Not)
+    if (op != LogicalOperation::Not)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(input2);
         out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
@@ -292,13 +292,16 @@
     set_data_type_if_unknown(*output, input1->data_type());
 }
 
-Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op)
+Status NELogicalKernel::validate(const ITensorInfo *input1,
+                                 const ITensorInfo *input2,
+                                 const ITensorInfo *output,
+                                 LogicalOperation   op)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8);
     ARM_COMPUTE_RETURN_ERROR_ON(op == LogicalOperation::Unknown);
 
     TensorShape out_shape = input1->tensor_shape();
-    if(op != LogicalOperation::Not)
+    if (op != LogicalOperation::Not)
     {
         out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
@@ -306,7 +309,7 @@
     }
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON(detail::have_different_dimensions(out_shape, output->tensor_shape(), 0));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
@@ -326,7 +329,7 @@
     const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
     ITensor       *dst  = tensors.get_tensor(TensorType::ACL_DST);
 
-    if(_op == LogicalOperation::Not)
+    if (_op == LogicalOperation::Not)
     {
         run_unary(window, src0, dst);
     }
diff --git a/src/core/NEON/kernels/NELogicalKernel.h b/src/core/NEON/kernels/NELogicalKernel.h
index caf69cf..477a59d 100644
--- a/src/core/NEON/kernels/NELogicalKernel.h
+++ b/src/core/NEON/kernels/NELogicalKernel.h
@@ -58,10 +58,11 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op);
+    static Status
+    validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
diff --git a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
index 37e88a8..451031d 100644
--- a/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.cpp
@@ -28,12 +28,13 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/kernels/meanstddevnorm/list.h"
 
 namespace arm_compute
@@ -46,7 +47,8 @@
 };
 
 using MeanStdDevNormSelctorPtr = std::add_pointer<bool(const MeanStdDevNormSelectorData &data)>::type;
-using MeanStdDevNormUKernelPtr = std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type;
+using MeanStdDevNormUKernelPtr =
+    std::add_pointer<void(ITensor *input, ITensor *output, float epsilon, const Window &window)>::type;
 
 struct MeanStdDevNormKernel
 {
@@ -55,25 +57,15 @@
     MeanStdDevNormUKernelPtr       ukernel;
 };
 
-static const std::vector<MeanStdDevNormKernel> available_kernels =
-{
-    {
-        "fp32_neon_meanstddevnorm",
-        [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)
-    },
+static const std::vector<MeanStdDevNormKernel> available_kernels = {
+    {"fp32_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F32; },
+     REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_meanstddevnorm)},
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    {
-        "fp16_neon_meanstddevnorm",
-        [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::F16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)
-    },
+    {"fp16_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::F16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_meanstddevnorm)},
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    {
-        "qasymm8_neon_meanstddevnorm",
-        [](const MeanStdDevNormSelectorData & data) { return data.dt == DataType::QASYMM8; },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)
-    },
+    {"qasymm8_neon_meanstddevnorm", [](const MeanStdDevNormSelectorData &data) { return data.dt == DataType::QASYMM8; },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_meanstddevnorm)},
 };
 
 /** Micro-kernel selector
@@ -84,9 +76,9 @@
  */
 const MeanStdDevNormKernel *get_implementation(const MeanStdDevNormSelectorData &data)
 {
-    for(const auto &uk : available_kernels)
+    for (const auto &uk : available_kernels)
     {
-        if(uk.is_selected(data))
+        if (uk.is_selected(data))
         {
             return &uk;
         }
@@ -103,7 +95,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8);
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -113,7 +105,7 @@
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output)
 {
-    if(output != nullptr)
+    if (output != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
         // Output auto initialization if not yet initialized
@@ -128,8 +120,7 @@
 }
 } // namespace
 
-NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel()
-    : _input(nullptr), _output(nullptr), _epsilon(1e-8f)
+NEMeanStdDevNormalizationKernel::NEMeanStdDevNormalizationKernel() : _input(nullptr), _output(nullptr), _epsilon(1e-8f)
 {
 }
 
@@ -137,7 +128,8 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
-    ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
+    ARM_COMPUTE_ERROR_THROW_ON(NEMeanStdDevNormalizationKernel::validate(
+        input->info(), (output != nullptr) ? output->info() : nullptr, epsilon));
 
     _input   = input;
     _output  = (output == nullptr) ? input : output;
@@ -152,7 +144,9 @@
 Status NEMeanStdDevNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, float epsilon)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, epsilon));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), (output != nullptr) ? output->clone().get() : nullptr)
+            .first);
     return Status{};
 }
 
@@ -162,7 +156,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    const auto *uk = get_implementation(MeanStdDevNormSelectorData{ _output->info()->data_type() });
+    const auto *uk = get_implementation(MeanStdDevNormSelectorData{_output->info()->data_type()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     uk->ukernel(_input, _output, _epsilon, window);
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
index 49a0453..2c61bda 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp
@@ -29,19 +29,23 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
-#include "src/core/NEON/NEMath.h"
-#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/NormalizationHelpers.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEFixedPoint.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status validate_arguments(const ITensorInfo            *input,
+                          const ITensorInfo            *input_squared,
+                          const ITensorInfo            *output,
+                          const NormalizationLayerInfo &norm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_squared, output);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
@@ -52,7 +56,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd");
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -69,7 +73,10 @@
 {
 }
 
-void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info)
+void NENormalizationLayerKernel::configure(const ITensor         *input,
+                                           const ITensor         *input_squared,
+                                           ITensor               *output,
+                                           NormalizationLayerInfo norm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_squared, output);
     // Output tensor auto initialization if not yet initialized
@@ -85,15 +92,15 @@
     _output        = output;
     _norm_info     = norm_info;
 
-    switch(_input->info()->data_type())
+    switch (_input->info()->data_type())
     {
         case DataType::F32:
         {
-            switch(norm_idx)
+            switch (norm_idx)
             {
                 case 0:
                 {
-                    if(norm_info.type() == NormType::IN_MAP_2D)
+                    if (norm_info.type() == NormType::IN_MAP_2D)
                     {
                         _func = &NENormalizationLayerKernel::normalize_float<float, 4, 0, true>;
                     }
@@ -104,7 +111,7 @@
                     break;
                 }
                 case 1:
-                    if(norm_info.type() == NormType::IN_MAP_2D)
+                    if (norm_info.type() == NormType::IN_MAP_2D)
                     {
                         _func = &NENormalizationLayerKernel::normalize_float<float, 4, 1, true>;
                     }
@@ -124,11 +131,11 @@
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
         {
-            switch(norm_idx)
+            switch (norm_idx)
             {
                 case 0:
                 {
-                    if(norm_info.type() == NormType::IN_MAP_2D)
+                    if (norm_info.type() == NormType::IN_MAP_2D)
                     {
                         _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 0, true>;
                     }
@@ -139,7 +146,7 @@
                     break;
                 }
                 case 1:
-                    if(norm_info.type() == NormType::IN_MAP_2D)
+                    if (norm_info.type() == NormType::IN_MAP_2D)
                     {
                         _func = &NENormalizationLayerKernel::normalize_float<float16_t, 8, 1, true>;
                     }
@@ -196,8 +203,9 @@
     const auto beta_vec  = wrapper::vdup_n(static_cast<T>(_norm_info.beta()), ExactTagType{});
     const auto kappa_vec = wrapper::vdup_n(static_cast<T>(_norm_info.kappa()), ExactTagType{});
 
-    auto sequential_normalization = [&](const int x, const Coordinates & id, const int current_row, const int first_row, const int last_row, const T * input_ptr, const uint8_t *input_squared_start_ptr,
-                                        T * output_ptr)
+    auto sequential_normalization = [&](const int x, const Coordinates &id, const int current_row, const int first_row,
+                                        const int last_row, const T *input_ptr, const uint8_t *input_squared_start_ptr,
+                                        T *output_ptr)
     {
         const int current_slice = dim == 0 ? x : id[dim];
         const int first_slice   = std::max(current_slice - radius, 0);
@@ -206,75 +214,87 @@
         const uint8_t *const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x;
         // Accumulate 2D In-Map values
         auto accu = static_cast<T>(0.f);
-        for(int j = first_row; j <= last_row; ++j)
+        for (int j = first_row; j <= last_row; ++j)
         {
             // Compute row displacement
             const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
-            for(int i = first_slice; i <= last_slice; ++i)
+            for (int i = first_slice; i <= last_slice; ++i)
             {
-                accu += *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice);
+                accu +=
+                    *reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice);
             }
         }
 
         // Normalize
-        const auto normalized       = std::pow(accu * static_cast<T>(_norm_info.scale_coeff()) + static_cast<T>(_norm_info.kappa()), _norm_info.beta());
+        const auto normalized = std::pow(
+            accu * static_cast<T>(_norm_info.scale_coeff()) + static_cast<T>(_norm_info.kappa()), _norm_info.beta());
         const auto normalized_pixel = (*(input_ptr + x)) / normalized;
         *(output_ptr + x)           = normalized_pixel;
     };
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
-        auto       output_ptr = reinterpret_cast<T *>(output.ptr());
-
-        // Get range to normalize
-        const int current_row = do_2D_norm ? id[dim_y] : 0;
-        const int first_row   = do_2D_norm ? std::max(current_row - radius, 0) : 0;
-        const int last_row    = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
-
-        int x = window_start_x;
-        // Compute serially starting elements for the case x dimension is width
-        for(; x < radius && x < window_end_x && dim == 0; ++x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr);
-        }
+            const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+            auto       output_ptr = reinterpret_cast<T *>(output.ptr());
 
-        // Compute vectorized
-        for(; x <= window_end_x - window_step_x - radius; x += window_step_x)
-        {
-            const int current_slice = dim == 0 ? x : id[dim];
-            const int first_slice   = std::max(current_slice - radius, 0);
-            const int last_slice    = std::min(current_slice + radius, max_right);
+            // Get range to normalize
+            const int current_row = do_2D_norm ? id[dim_y] : 0;
+            const int first_row   = do_2D_norm ? std::max(current_row - radius, 0) : 0;
+            const int last_row    = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
 
-            const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x;
-            // Accumulate 2D In-Map values
-            auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-            for(int j = first_row; j <= last_row; ++j)
+            int x = window_start_x;
+            // Compute serially starting elements for the case x dimension is width
+            for (; x < radius && x < window_end_x && dim == 0; ++x)
             {
-                // Compute row displacement
-                const uint8_t *const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
-                for(int i = first_slice; i <= last_slice; ++i)
-                {
-                    accu = wrapper::vadd(accu, wrapper::vloadq(reinterpret_cast<const T *>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice)));
-                }
+                sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(),
+                                         output_ptr);
             }
 
-            // Normalize
-            const auto normalized       = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
-            const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized));
-            wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel);
-        }
+            // Compute vectorized
+            for (; x <= window_end_x - window_step_x - radius; x += window_step_x)
+            {
+                const int current_slice = dim == 0 ? x : id[dim];
+                const int first_slice   = std::max(current_slice - radius, 0);
+                const int last_slice    = std::min(current_slice + radius, max_right);
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(), output_ptr);
-        }
-    },
-    input, input_squared, output);
+                const uint8_t *const input_squared_x_ptr = input_squared.ptr() + x * input_squared_stride_x;
+                // Accumulate 2D In-Map values
+                auto accu = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+                for (int j = first_row; j <= last_row; ++j)
+                {
+                    // Compute row displacement
+                    const uint8_t *const input_squared_ptr =
+                        input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
+                    for (int i = first_slice; i <= last_slice; ++i)
+                    {
+                        accu = wrapper::vadd(
+                            accu, wrapper::vloadq(reinterpret_cast<const T *>(
+                                      input_squared_ptr + (i - current_slice) * input_squared_stride_slice)));
+                    }
+                }
+
+                // Normalize
+                const auto normalized       = wrapper::vpow(wrapper::vmla(kappa_vec, coeff_vec, accu), beta_vec);
+                const auto normalized_pixel = wrapper::vmul(wrapper::vloadq(input_ptr + x), wrapper::vinv(normalized));
+                wrapper::vstore(reinterpret_cast<T *>(output_ptr + x), normalized_pixel);
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                sequential_normalization(x, id, current_row, first_row, last_row, input_ptr, input_squared.ptr(),
+                                         output_ptr);
+            }
+        },
+        input, input_squared, output);
 }
 
-Status NENormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, const NormalizationLayerInfo norm_info)
+Status NENormalizationLayerKernel::validate(const ITensorInfo           *input,
+                                            const ITensorInfo           *input_squared,
+                                            const ITensorInfo           *output,
+                                            const NormalizationLayerInfo norm_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, input_squared, output, norm_info));
 
diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.h b/src/core/NEON/kernels/NENormalizationLayerKernel.h
index 53a06b9..2d8d9f3 100644
--- a/src/core/NEON/kernels/NENormalizationLayerKernel.h
+++ b/src/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -60,7 +60,8 @@
      * @param[out] output        Destination tensor. Output will have the same number of dimensions as input. Data type and layout supported: same as @p input.
      * @param[in]  norm_info     Normalization layer information like the normalization type, normalization size and other parameters.
      */
-    void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
+    void
+    configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
     /** Static function to check if given info will lead to a valid configuration of @ref NENormalizationLayerKernel
      *
      * @param[in] input         Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
@@ -72,7 +73,10 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *input_squared, const ITensorInfo *output, NormalizationLayerInfo norm_info);
+    static Status validate(const ITensorInfo     *input,
+                           const ITensorInfo     *input_squared,
+                           const ITensorInfo     *output,
+                           NormalizationLayerInfo norm_info);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.cpp b/src/core/NEON/kernels/NEPadLayerKernel.cpp
index 734510b..c9bcbc9 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPadLayerKernel.cpp
@@ -28,26 +28,31 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &paddings, const PaddingMode mode)
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *output,
+                          const PaddingList &paddings,
+                          const PaddingMode  mode)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(mode != PaddingMode::CONSTANT, "Only constant padding mode is supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(paddings.size() > 4, "Padding list bigger than 4 dimensions");
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings);
-        const TensorInfo  expected_output_info  = input->clone()->set_tensor_shape(expected_output_shape);
+        const TensorShape expected_output_shape =
+            arm_compute::misc::shape_calculator::compute_padded_shape(input->tensor_shape(), paddings);
+        const TensorInfo expected_output_info = input->clone()->set_tensor_shape(expected_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &expected_output_info);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -58,30 +63,34 @@
 template <typename T>
 void NEPadLayerKernel::run_pad_constant(const Window &window)
 {
-    Window output_window{ window };
+    Window output_window{window};
     output_window.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     const size_t element_size = _input->info()->element_size();
     Iterator     output_it(_output, output_window);
-    execute_window_loop(output_window, [&](const Coordinates & id)
-    {
-        Coordinates idin{ id };
-        for(size_t dim = _padding.size() - 1; dim > 0; --dim)
+    execute_window_loop(
+        output_window,
+        [&](const Coordinates &id)
         {
-            idin[dim] -= _padding[dim].first;
-            if(idin[dim] < 0 || static_cast<int>(_input->info()->dimension(dim)) - 1 < idin[dim])
+            Coordinates idin{id};
+            for (size_t dim = _padding.size() - 1; dim > 0; --dim)
             {
-                std::fill_n(reinterpret_cast<T *>(output_it.ptr()), _output->info()->dimension(0), _constant_value.get<T>());
-                return;
+                idin[dim] -= _padding[dim].first;
+                if (idin[dim] < 0 || static_cast<int>(_input->info()->dimension(dim)) - 1 < idin[dim])
+                {
+                    std::fill_n(reinterpret_cast<T *>(output_it.ptr()), _output->info()->dimension(0),
+                                _constant_value.get<T>());
+                    return;
+                }
             }
-        }
-        T *input_it_ptr  = reinterpret_cast<T *>(_input->ptr_to_element(idin));
-        T *output_it_ptr = reinterpret_cast<T *>(output_it.ptr());
-        std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get<T>());
-        memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size);
-        std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second, _constant_value.get<T>());
-    },
-    output_it);
+            T *input_it_ptr  = reinterpret_cast<T *>(_input->ptr_to_element(idin));
+            T *output_it_ptr = reinterpret_cast<T *>(output_it.ptr());
+            std::fill_n(output_it_ptr, _padding[0].first, _constant_value.get<T>());
+            memcpy(output_it_ptr + _padding[0].first, input_it_ptr, _input->info()->dimension(0) * element_size);
+            std::fill_n(output_it_ptr + _padding[0].first + _input->info()->dimension(0), _padding[0].second,
+                        _constant_value.get<T>());
+        },
+        output_it);
 }
 
 void NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad(const Window &window)
@@ -92,7 +101,7 @@
     const size_t end_plane   = window.z().end();
 
     size_t start_plane_input = start_plane;
-    if(_padding.size() > 2)
+    if (_padding.size() > 2)
     {
         start_plane_input = (start_plane < _padding[2].first) ? 0 : start_plane - _padding[2].first;
     }
@@ -105,18 +114,20 @@
     const size_t jump_to_next_row_input  = _input->info()->dimension(0);
     const size_t jump_to_next_row_output = _padding[0].first + _padding[0].second;
 
-    uint8_t       *output_row_ptr = _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size;
-    const uint8_t *input_it_ptr   = _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size;
-    const auto     pad_value      = _constant_value.get<uint8_t>();
+    uint8_t *output_row_ptr =
+        _output->buffer() + _output->info()->offset_first_element_in_bytes() + start_plane * output_plane_size;
+    const uint8_t *input_it_ptr =
+        _input->buffer() + _input->info()->offset_first_element_in_bytes() + start_plane_input * input_plane_size;
+    const auto pad_value = _constant_value.get<uint8_t>();
 
-    for(size_t z_i = start_plane; z_i < end_plane; ++z_i)
+    for (size_t z_i = start_plane; z_i < end_plane; ++z_i)
     {
-        if(_padding.size() > 2 && z_i < _padding[2].first)
+        if (_padding.size() > 2 && z_i < _padding[2].first)
         {
             memset(output_row_ptr, pad_value, output_plane_size);
             output_row_ptr += output_plane_size;
         }
-        else if(_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1))
+        else if (_padding.size() > 2 && z_i > (_input->info()->dimension(2) + _padding[2].first - 1))
         {
             memset(output_row_ptr, pad_value, output_plane_size);
             output_row_ptr += output_plane_size;
@@ -127,7 +138,7 @@
             output_row_ptr += pad_y_elems_top;
             size_t y_i = _input->info()->dimension(1);
             // Basic loop unrolling
-            for(; y_i > 3; y_i -= 4)
+            for (; y_i > 3; y_i -= 4)
             {
                 memset(output_row_ptr, pad_value, _padding[0].first);
                 output_row_ptr += _padding[0].first;
@@ -160,7 +171,7 @@
                 memset(output_row_ptr, pad_value, _padding[0].second);
                 output_row_ptr += _padding[0].second;
             }
-            for(; y_i > 0; --y_i)
+            for (; y_i > 0; --y_i)
             {
                 memset(output_row_ptr, pad_value, _padding[0].first);
                 output_row_ptr += _padding[0].first;
@@ -183,12 +194,17 @@
 {
 }
 
-void NEPadLayerKernel::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+void NEPadLayerKernel::configure(ITensor           *input,
+                                 ITensor           *output,
+                                 const PaddingList &padding,
+                                 const PixelValue   constant_value,
+                                 const PaddingMode  mode)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     // Auto-init
-    const TensorShape expected_output_shape = arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding);
-    const TensorInfo  expected_output_info  = input->info()->clone()->set_tensor_shape(expected_output_shape);
+    const TensorShape expected_output_shape =
+        arm_compute::misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), padding);
+    const TensorInfo expected_output_info = input->info()->clone()->set_tensor_shape(expected_output_shape);
     auto_init_if_empty(*output->info(), expected_output_info);
 
     // Perform validation step
@@ -200,14 +216,14 @@
     _constant_value = constant_value;
     _mode           = mode;
 
-    if(_mode == PaddingMode::CONSTANT)
+    if (_mode == PaddingMode::CONSTANT)
     {
-        switch(_input->info()->element_size())
+        switch (_input->info()->element_size())
         {
             case 1:
-                if(_input->info()->num_dimensions() == 3 &&                           // Is 3D
-                   padding.size() <= 3 &&                                             // Has 3D padding
-                   !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding
+                if (_input->info()->num_dimensions() == 3 &&                           // Is 3D
+                    padding.size() <= 3 &&                                             // Has 3D padding
+                    !_input->info()->has_padding() && !_output->info()->has_padding()) // Input & Output have no padding
                 {
                     _func = &NEPadLayerKernel::run_pad_constant_uint8_3Dinput_3Dpad;
                 }
@@ -240,7 +256,11 @@
     ICPPKernel::configure(win);
 }
 
-Status NEPadLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+Status NEPadLayerKernel::validate(const ITensorInfo *input,
+                                  const ITensorInfo *output,
+                                  const PaddingList &padding,
+                                  const PixelValue   constant_value,
+                                  const PaddingMode  mode)
 {
     ARM_COMPUTE_UNUSED(constant_value);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, padding, mode));
@@ -253,7 +273,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    if(_func != nullptr)
+    if (_func != nullptr)
     {
         (this->*_func)(window);
     }
@@ -263,7 +283,7 @@
 {
     ARM_COMPUTE_UNUSED(thread_count);
     ARM_COMPUTE_UNUSED(platform);
-    
+
     return ICPPKernel::default_mws;
 }
 
diff --git a/src/core/NEON/kernels/NEPadLayerKernel.h b/src/core/NEON/kernels/NEPadLayerKernel.h
index f82af15..d432887 100644
--- a/src/core/NEON/kernels/NEPadLayerKernel.h
+++ b/src/core/NEON/kernels/NEPadLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NEPADLAYERKERNEL_H
 
 #include "arm_compute/core/PixelValue.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
@@ -62,7 +63,11 @@
      * @param[in]  mode           (Optional) Controls whether the padding should be filled with @p constant_value using CONSTANT.
      *                           Only CONSTANT padding mode is currently supported
      */
-    void configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+    void configure(ITensor           *input,
+                   ITensor           *output,
+                   const PaddingList &padding,
+                   const PixelValue   constant_value = PixelValue(),
+                   const PaddingMode  mode           = PaddingMode::CONSTANT);
     /**  Static function to check if given info will lead to a valid configuration of @ref NEPadLayer.
      *
      * @param[in] input          Source tensor info. Data types supported: All.
@@ -75,7 +80,11 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value = PixelValue(), const PaddingMode mode = PaddingMode::CONSTANT);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           const PaddingList &padding,
+                           const PixelValue   constant_value = PixelValue(),
+                           const PaddingMode  mode           = PaddingMode::CONSTANT);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
index 3d89933..15e933e 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -36,7 +37,10 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status validate_arguments(const ITensorInfo       *input1,
+                          const ITensorInfo       *input2,
+                          const ITensorInfo       *output,
+                          const PriorBoxLayerInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32);
@@ -45,10 +49,10 @@
 
     // Check variances
     const int var_size = info.variances().size();
-    if(var_size > 1)
+    if (var_size > 1)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size != 4, "Must provide 4 variance values");
-        for(int i = 0; i < var_size; ++i)
+        for (int i = 0; i < var_size; ++i)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(var_size <= 0, "Must be greater than 0");
         }
@@ -56,17 +60,19 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[0] < 0.f, "Step x should be greater or equal to 0");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.steps()[1] < 0.f, "Step y should be greater or equal to 0");
 
-    if(!info.max_sizes().empty())
+    if (!info.max_sizes().empty())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(), "Max and min sizes dimensions should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes().size() != info.min_sizes().size(),
+                                        "Max and min sizes dimensions should match");
     }
 
-    for(unsigned int i = 0; i < info.max_sizes().size(); ++i)
+    for (unsigned int i = 0; i < info.max_sizes().size(); ++i)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i], "Max size should be greater than min size");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_sizes()[i] < info.min_sizes()[i],
+                                        "Max size should be greater than min size");
     }
 
-    if(output != nullptr && output->total_size() != 0)
+    if (output != nullptr && output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != 2);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, output);
@@ -76,21 +82,26 @@
 }
 } // namespace
 
-NEPriorBoxLayerKernel::NEPriorBoxLayerKernel()
-    : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
+NEPriorBoxLayerKernel::NEPriorBoxLayerKernel() : _input1(nullptr), _input2(nullptr), _output(nullptr), _info()
 {
 }
 
-void NEPriorBoxLayerKernel::store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width,
-                                              const int height)
+void NEPriorBoxLayerKernel::store_coordinates(float      *out,
+                                              const int   offset,
+                                              const float center_x,
+                                              const float center_y,
+                                              const float box_width,
+                                              const float box_height,
+                                              const int   width,
+                                              const int   height)
 {
     float xmin = (center_x - box_width / 2.f) / width;
     float ymin = (center_y - box_height / 2.f) / height;
     float xmax = (center_x + box_width / 2.f) / width;
     float ymax = (center_y + box_height / 2.f) / height;
 
-    float32x4_t vec_elements = { xmin, ymin, xmax, ymax };
-    if(_info.clip())
+    float32x4_t vec_elements = {xmin, ymin, xmax, ymax};
+    if (_info.clip())
     {
         static const float32x4_t CONST_0 = vdupq_n_f32(0.f);
         static const float32x4_t CONST_1 = vdupq_n_f32(1.f);
@@ -112,7 +123,7 @@
 
     int img_width  = _info.img_size().x;
     int img_height = _info.img_size().y;
-    if(img_width == 0 || img_height == 0)
+    if (img_width == 0 || img_height == 0)
     {
         img_width  = _input2->info()->dimension(width_idx);
         img_height = _input2->info()->dimension(height_idx);
@@ -120,7 +131,7 @@
 
     float step_x = _info.steps()[0];
     float step_y = _info.steps()[1];
-    if(step_x == 0.f || step_y == 0.f)
+    if (step_x == 0.f || step_y == 0.f)
     {
         step_x = static_cast<float>(img_width) / layer_width;
         step_y = static_cast<float>(img_height) / layer_height;
@@ -130,74 +141,80 @@
     slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1), 2));
 
     Iterator output(_output, slice);
-    execute_window_loop(slice, [&](const Coordinates & id)
-    {
-        float center_x = 0;
-        float center_y = 0;
-        int   idx      = id.x() / (4 * num_priors);
-        center_x       = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
-        center_y       = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
-
-        float box_width;
-        float box_height;
-        int   offset = 0;
-
-        auto out = reinterpret_cast<float *>(output.ptr());
-        for(unsigned int i = 0; i < _info.min_sizes().size(); ++i)
+    execute_window_loop(
+        slice,
+        [&](const Coordinates &id)
         {
-            const float min_size = _info.min_sizes().at(i);
-            box_width            = min_size;
-            box_height           = min_size;
-            store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
-            offset += 4;
+            float center_x = 0;
+            float center_y = 0;
+            int   idx      = id.x() / (4 * num_priors);
+            center_x       = (static_cast<float>(idx % layer_width) + _info.offset()) * step_x;
+            center_y       = (static_cast<float>(idx / layer_width) + _info.offset()) * step_y;
 
-            if(!_info.max_sizes().empty())
+            float box_width;
+            float box_height;
+            int   offset = 0;
+
+            auto out = reinterpret_cast<float *>(output.ptr());
+            for (unsigned int i = 0; i < _info.min_sizes().size(); ++i)
             {
-                const float max_size = _info.max_sizes().at(i);
-                box_width            = std::sqrt(min_size * max_size);
-                box_height           = box_width;
-
+                const float min_size = _info.min_sizes().at(i);
+                box_width            = min_size;
+                box_height           = min_size;
                 store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
                 offset += 4;
-            }
 
-            // rest of priors
-            for(auto ar : _info.aspect_ratios())
-            {
-                if(fabs(ar - 1.) < 1e-6)
+                if (!_info.max_sizes().empty())
                 {
-                    continue;
+                    const float max_size = _info.max_sizes().at(i);
+                    box_width            = std::sqrt(min_size * max_size);
+                    box_height           = box_width;
+
+                    store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+                    offset += 4;
                 }
 
-                box_width  = min_size * sqrt(ar);
-                box_height = min_size / sqrt(ar);
+                // rest of priors
+                for (auto ar : _info.aspect_ratios())
+                {
+                    if (fabs(ar - 1.) < 1e-6)
+                    {
+                        continue;
+                    }
 
-                store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
-                offset += 4;
+                    box_width  = min_size * sqrt(ar);
+                    box_height = min_size / sqrt(ar);
+
+                    store_coordinates(out, offset, center_x, center_y, box_width, box_height, img_width, img_height);
+                    offset += 4;
+                }
             }
-        }
 
-        // set the variance
-        out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
-        float32x4_t var;
-        if(_info.variances().size() == 1)
-        {
-            var = vdupq_n_f32(_info.variances().at(0));
-        }
-        else
-        {
-            const float32x4_t vars = { _info.variances().at(0), _info.variances().at(1), _info.variances().at(2), _info.variances().at(3) };
-            var                    = vars;
-        }
-        for(int i = 0; i < num_priors; ++i)
-        {
-            vst1q_f32(out + 4 * i, var);
-        }
-    },
-    output);
+            // set the variance
+            out = reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(id.x(), 1)));
+            float32x4_t var;
+            if (_info.variances().size() == 1)
+            {
+                var = vdupq_n_f32(_info.variances().at(0));
+            }
+            else
+            {
+                const float32x4_t vars = {_info.variances().at(0), _info.variances().at(1), _info.variances().at(2),
+                                          _info.variances().at(3)};
+                var                    = vars;
+            }
+            for (int i = 0; i < num_priors; ++i)
+            {
+                vst1q_f32(out + 4 * i, var);
+            }
+        },
+        output);
 }
 
-void NEPriorBoxLayerKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
+void NEPriorBoxLayerKernel::configure(const ITensor           *input1,
+                                      const ITensor           *input2,
+                                      ITensor                 *output,
+                                      const PriorBoxLayerInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
 
@@ -215,7 +232,10 @@
     INEKernel::configure(win);
 }
 
-Status NEPriorBoxLayerKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status NEPriorBoxLayerKernel::validate(const ITensorInfo       *input1,
+                                       const ITensorInfo       *input2,
+                                       const ITensorInfo       *output,
+                                       const PriorBoxLayerInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, output, info));
@@ -231,4 +251,4 @@
     // Run function
     calculate_prior_boxes(window);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
index 430a47f..460f80e 100644
--- a/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
+++ b/src/core/NEON/kernels/NEPriorBoxLayerKernel.h
@@ -67,7 +67,10 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info);
+    static Status validate(const ITensorInfo       *input1,
+                           const ITensorInfo       *input2,
+                           const ITensorInfo       *output,
+                           const PriorBoxLayerInfo &info);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -84,7 +87,14 @@
      * @param[in]  width      Input width.
      * @param[in]  height     Input height.
      */
-    void store_coordinates(float *out, const int offset, const float center_x, const float center_y, const float box_width, const float box_height, const int width, const int height);
+    void store_coordinates(float      *out,
+                           const int   offset,
+                           const float center_x,
+                           const float center_y,
+                           const float box_width,
+                           const float box_height,
+                           const int   width,
+                           const int   height);
     /** Function to calculate prior boxes.
      *
      * @param[in] window Input region on which to execute the kernel.
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
index 46a0f62..8e1ed3a 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.cpp
@@ -26,17 +26,17 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
 #include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/NESymm.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
 
 #include <map>
 
@@ -72,8 +72,8 @@
     const int64_t b_3 = vgetlane(b_high, 1);
 
     int64x2x2_t     result;
-    const int64x2_t result_0{ a_0 * b_0, a_1 * b_1 };
-    const int64x2_t result_1{ a_2 * b_2, a_3 * b_3 };
+    const int64x2_t result_0{a_0 * b_0, a_1 * b_1};
+    const int64x2_t result_1{a_2 * b_2, a_3 * b_3};
     result.val[0] = vadd(vmovl(vgetlow(bias)), result_0);
     result.val[1] = vadd(vmovl(vgethigh(bias)), result_1);
 
@@ -81,15 +81,17 @@
 }
 } // namespace
 
-void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input, ITensor *output, const ITensor *weight, const ITensor *bias)
+void NEQLSTMLayerNormalizationKernel::configure(const ITensor *input,
+                                                ITensor       *output,
+                                                const ITensor *weight,
+                                                const ITensor *bias)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weight, bias, output);
     ARM_COMPUTE_ERROR_ON(input == output);
     ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), weight->info(), bias->info()));
 
-    static const std::map<DataType, ComputeFuncType> fn_map =
-    {
-        { DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16) },
+    static const std::map<DataType, ComputeFuncType> fn_map = {
+        {DataType::QSYMM16, std::mem_fn(&NEQLSTMLayerNormalizationKernel::compute_qsymm16)},
     };
 
     _input  = input;
@@ -102,10 +104,10 @@
     _output->info()->set_quantization_info(compute_output_qinfo());
 
     const UniformQuantizationInfo wq_info = _weight->info()->quantization_info().uniform();
-    const Status                  s       = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift);
+    const Status s = quantization::calculate_quantized_multiplier(wq_info.scale, &_output_multiplier, &_output_shift);
     _output_shift *= -1;
 
-    if(!bool(s))
+    if (!bool(s))
     {
         _output_multiplier = 0;
         _output_shift      = 0;
@@ -134,7 +136,10 @@
     return window;
 }
 
-Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias)
+Status NEQLSTMLayerNormalizationKernel::validate(const ITensorInfo *input,
+                                                 const ITensorInfo *output,
+                                                 const ITensorInfo *weight,
+                                                 const ITensorInfo *bias)
 {
     ARM_COMPUTE_UNUSED(output, bias, weight, input);
 
@@ -151,7 +156,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape().x() != weight->tensor_shape().x());
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(weight, bias);
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
@@ -182,11 +187,11 @@
     using AccType       = int64_t;
     using InputDataType = int16_t;
 
-    AccType sum{ 0 };
-    AccType sum_sq{ 0 };
+    AccType sum{0};
+    AccType sum_sq{0};
 
     int32_t x = _window_start_x;
-    for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
+    for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
     {
         using namespace wrapper;
         const int16x8_t val      = vloadq(input_ptr + x);
@@ -216,7 +221,7 @@
 #endif // __aarch64__
     }
 
-    for(; x < _window_end_x; ++x)
+    for (; x < _window_end_x; ++x)
     {
         const InputDataType val = input_ptr[x];
         sum += static_cast<AccType>(val);
@@ -230,7 +235,9 @@
                                                                 int16_t       *output_ptr,
                                                                 const int16_t *weight_ptr,
                                                                 const int32_t *bias_ptr,
-                                                                int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift)
+                                                                int32_t        mean,
+                                                                int32_t        inv_std_mul,
+                                                                int32_t        inv_std_shift)
 {
     using OutputDataType = int16_t;
 
@@ -238,7 +245,7 @@
     const int32x4_t mean_vec = vdup_n(mean, wrapper::traits::vector_128_tag{});
 
     int32_t x = _window_start_x;
-    for(; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
+    for (; x <= _window_end_x && _window_step_x <= (_window_end_x - x); x += _window_step_x)
     {
         const int16x8_t val = vloadq(input_ptr + x);
         int32x4x2_t     shifted;
@@ -267,16 +274,18 @@
         vstore(output_ptr + x + 4, vqmovn(out_val.val[1]));
     }
 
-    for(; x < _window_end_x; ++x)
+    for (; x < _window_end_x; ++x)
     {
-        const auto    val             = static_cast<int32_t>(input_ptr[x]);
-        const int32_t shifted         = (val << 10) - mean;
-        const int32_t rescaled        = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift);
-        const int64_t weighted        = rescaled * weight_ptr[x] + bias_ptr[x];
+        const auto    val      = static_cast<int32_t>(input_ptr[x]);
+        const int32_t shifted  = (val << 10) - mean;
+        const int32_t rescaled = quantization::multiply_by_quantized_multiplier(shifted, inv_std_mul, inv_std_shift);
+        const int64_t weighted = rescaled * weight_ptr[x] + bias_ptr[x];
         const auto    reverse_shifted = static_cast<int32_t>((weighted + 512) >> 10);
-        int32_t       out_val         = quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12);
-        out_val                       = utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min());
-        output_ptr[x]                 = static_cast<OutputDataType>(out_val);
+        int32_t       out_val =
+            quantization::multiply_by_quantized_multiplier(reverse_shifted, _output_multiplier, _output_shift + 12);
+        out_val =
+            utility::clamp<decltype(out_val), OutputDataType>(out_val, std::numeric_limits<OutputDataType>::min());
+        output_ptr[x] = static_cast<OutputDataType>(out_val);
     }
 }
 
@@ -287,35 +296,38 @@
     using BiasDataType   = int32_t;
     using AccType        = int64_t;
 
-    Iterator input_iterator{ _input, _inout_window };
-    Iterator output_iterator{ _output, _inout_window };
-    Iterator weight_iterator{ _weight, _weight_window };
-    Iterator bias_iterator{ _bias, _weight_window };
+    Iterator input_iterator{_input, _inout_window};
+    Iterator output_iterator{_output, _inout_window};
+    Iterator weight_iterator{_weight, _weight_window};
+    Iterator bias_iterator{_bias, _weight_window};
 
     const auto weight_ptr = reinterpret_cast<const InputDataType *>(weight_iterator.ptr());
     const auto bias_ptr   = reinterpret_cast<const BiasDataType *>(bias_iterator.ptr());
 
     const uint32_t column_size = _input->info()->tensor_shape()[0];
 
-    execute_window_loop(_inout_window, [ &, this](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const InputDataType *>(input_iterator.ptr());
-        auto       out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr());
+    execute_window_loop(
+        _inout_window,
+        [&, this](const Coordinates &)
+        {
+            const auto in_ptr  = reinterpret_cast<const InputDataType *>(input_iterator.ptr());
+            auto       out_ptr = reinterpret_cast<OutputDataType *>(output_iterator.ptr());
 
-        AccType sum{ 0 };
-        AccType sum_sq{ 0 };
-        std::tie(sum, sum_sq) = sum_qsymm16(in_ptr);
+            AccType sum{0};
+            AccType sum_sq{0};
+            std::tie(sum, sum_sq) = sum_qsymm16(in_ptr);
 
-        AccType mean{ 0 };
-        AccType variance{ 0 };
-        std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size);
+            AccType mean{0};
+            AccType variance{0};
+            std::tie(mean, variance) = compute_mean_variance(sum, sum_sq, column_size);
 
-        int32_t stddev_invsqrt_mul{};
-        int32_t stddev_invsqrt_shift{};
-        quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul, stddev_invsqrt_shift);
+            int32_t stddev_invsqrt_mul{};
+            int32_t stddev_invsqrt_shift{};
+            quantization::get_invsqrt_quantized_multiplier_exp(static_cast<int32_t>(variance), -1, stddev_invsqrt_mul,
+                                                               stddev_invsqrt_shift);
 
-        normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift);
-    },
-    input_iterator, output_iterator);
+            normalize_qasymm16(in_ptr, out_ptr, weight_ptr, bias_ptr, mean, stddev_invsqrt_mul, stddev_invsqrt_shift);
+        },
+        input_iterator, output_iterator);
 }
 } // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
index a3ff6e9..af5b6a0 100644
--- a/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
+++ b/src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NEQLSTMLAYERNORMALIZATIONKERNEL_H
 
 #include "src/core/NEON/INEKernel.h"
+
 #include <functional>
 
 namespace arm_compute
@@ -69,34 +70,26 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *weight, const ITensorInfo *bias);
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
     // constants
-    static constexpr uint32_t max_input_dimension{ 2 };  /**< The maximum input dimension supported */
-    static constexpr uint32_t max_weight_dimension{ 1 }; /**< The maximum weight dimension supported */
-    static constexpr uint32_t max_bias_dimension{ 1 };   /**< The maximum bias dimension supported */
-    static constexpr uint32_t vector_size_byte{ 16 };    /**< Computation vector size in byte */
+    static constexpr uint32_t max_input_dimension{2};  /**< The maximum input dimension supported */
+    static constexpr uint32_t max_weight_dimension{1}; /**< The maximum weight dimension supported */
+    static constexpr uint32_t max_bias_dimension{1};   /**< The maximum bias dimension supported */
+    static constexpr uint32_t vector_size_byte{16};    /**< Computation vector size in bytes */
 
     using ComputeFuncType = std::function<void(NEQLSTMLayerNormalizationKernel &)>;
 
     ComputeFuncType _fn{}; /**< Function pointer to computation function */
 
-    const ITensor *_input
-    {
-        nullptr
-    }; /**< Input tensor */
-    const ITensor *_weight
-    {
-        nullptr
-    }; /**< Weight tensor */
-    const ITensor *_bias
-    {
-        nullptr
-    };                           /**< Bias tensor */
-    ITensor *_output{ nullptr }; /**< Output tensor */
+    const ITensor *_input{nullptr};  /**< Input tensor */
+    const ITensor *_weight{nullptr}; /**< Weight tensor */
+    const ITensor *_bias{nullptr};   /**< Bias tensor */
+    ITensor       *_output{nullptr}; /**< Output tensor */
 
     int32_t _output_multiplier{}; /**< Multiplier for output values */
     int32_t _output_shift{};      /**< Shift value for output values */
@@ -138,7 +131,9 @@
                             int16_t       *output_ptr,
                             const int16_t *weight_ptr,
                             const int32_t *bias_ptr,
-                            int32_t mean, int32_t inv_std_mul, int32_t inv_std_shift);
+                            int32_t        mean,
+                            int32_t        inv_std_mul,
+                            int32_t        inv_std_shift);
     /** Function to compute output quantization information */
     QuantizationInfo compute_output_qinfo();
 };
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
index 802aebb..486cd6d 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.cpp
@@ -26,11 +26,12 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/misc/Utility.h"
-#include "src/core/CPP/Validate.h"
+#include "arm_compute/core/Window.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/roialign/list.h"
@@ -49,7 +50,12 @@
 };
 
 using ROIAlignSelctorPtr = std::add_pointer<bool(const ROIAlignSelectorData &data)>::type;
-using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)>::type;
+using ROIAlignUKernelPtr = std::add_pointer<void(const ITensor      *input,
+                                                 ITensor            *output,
+                                                 const ITensor      *rois,
+                                                 ROIPoolingLayerInfo pool_info,
+                                                 const Window       &window,
+                                                 const ThreadInfo   &info)>::type;
 
 struct ROIAlignKernel
 {
@@ -58,31 +64,18 @@
     ROIAlignUKernelPtr       ukernel;
 };
 
-static const ROIAlignKernel available_kernels[] =
-{
-    {
-        "fp32_neon_roialign",
-        [](const ROIAlignSelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)
-    },
+static const ROIAlignKernel available_kernels[] = {
+    {"fp32_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F32; },
+     REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_roialign)},
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-    {
-        "fp16_neon_roialign",
-        [](const ROIAlignSelectorData & data) { return data.dt == DataType::F16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)
-    },
+    {"fp16_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::F16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_roialign)},
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 #if defined(ARM_COMPUTE_ENABLE_NEON)
-    {
-        "qu8_neon_roialign",
-        [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8; },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)
-    },
-    {
-        "qs8_neon_roialign",
-        [](const ROIAlignSelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)
-    },
+    {"qu8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8; },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qu8_roialign)},
+    {"qs8_neon_roialign", [](const ROIAlignSelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qs8_roialign)},
 #endif //defined(ARM_COMPUTE_ENABLE_NEON)
 };
 
@@ -94,9 +87,9 @@
  */
 const ROIAlignKernel *get_implementation(const ROIAlignSelectorData &data)
 {
-    for(const auto &uk : available_kernels)
+    for (const auto &uk : available_kernels)
     {
-        if(uk.is_selected(data))
+        if (uk.is_selected(data))
         {
             return &uk;
         }
@@ -104,24 +97,29 @@
     return nullptr;
 }
 
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo         *input,
+                          const ITensorInfo         *rois,
+                          ITensorInfo               *output,
+                          const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, rois, output);
     ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(0) != 5);
     ARM_COMPUTE_RETURN_ERROR_ON(rois->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F32, DataType::F16);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC, DataLayout::NCHW);
     ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info), output->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(compute_roi_align_shape(*input, *rois, pool_info),
+                                                           output->tensor_shape());
     }
 
-    if(input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
+    if (input->data_type() == DataType::QASYMM8 || input->data_type() == DataType::QASYMM8_SIGNED)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rois, 1, DataType::QASYMM16);
 
@@ -143,13 +141,17 @@
 {
 }
 
-void NEROIAlignLayerKernel::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIAlignLayerKernel::configure(const ITensor             *input,
+                                      const ITensor             *rois,
+                                      ITensor                   *output,
+                                      const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
     // Output auto initialization if not yet initialized
     const TensorShape output_shape = compute_roi_align_shape(*input->info(), *rois->info(), pool_info);
-    auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+    auto_init_if_empty((*output->info()), output_shape, 1, input->info()->data_type(),
+                       input->info()->quantization_info());
     output->info()->set_data_layout(input->info()->data_layout());
 
     // Configure kernel window
@@ -167,7 +169,10 @@
     INEKernel::configure(window);
 }
 
-Status NEROIAlignLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIAlignLayerKernel::validate(const ITensorInfo         *input,
+                                       const ITensorInfo         *rois,
+                                       ITensorInfo               *output,
+                                       const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
     return Status{};
@@ -176,9 +181,9 @@
 void NEROIAlignLayerKernel::run(const Window &window, const ThreadInfo &info)
 {
     const DataLayout data_layout = _input->info()->data_layout();
-    if(data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC)
+    if (data_layout == DataLayout::NCHW || data_layout == DataLayout::NHWC)
     {
-        const auto *uk = get_implementation(ROIAlignSelectorData{ _input->info()->data_type() });
+        const auto *uk = get_implementation(ROIAlignSelectorData{_input->info()->data_type()});
         ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
         uk->ukernel(_input, _output, _rois, _pool_info, window, info);
diff --git a/src/core/NEON/kernels/NEROIAlignLayerKernel.h b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
index 48a3de7..9cc538b 100644
--- a/src/core/NEON/kernels/NEROIAlignLayerKernel.h
+++ b/src/core/NEON/kernels/NEROIAlignLayerKernel.h
@@ -83,7 +83,10 @@
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *rois,
+                           ITensorInfo               *output,
+                           const ROIPoolingLayerInfo &pool_info);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
index 400e829..1a3810f 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.cpp
@@ -22,9 +22,11 @@
  * SOFTWARE.
  */
 #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
+
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -36,7 +38,10 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo         *input,
+                          const ITensorInfo         *rois,
+                          const ITensorInfo         *output,
+                          const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, rois);
 
@@ -47,10 +52,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32, DataType::QASYMM8);
     ARM_COMPUTE_RETURN_ERROR_ON((pool_info.pooled_width() == 0) || (pool_info.pooled_height() == 0));
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) || (output->dimension(1) != pool_info.pooled_height()));
+        ARM_COMPUTE_RETURN_ERROR_ON((output->dimension(0) != pool_info.pooled_width()) ||
+                                    (output->dimension(1) != pool_info.pooled_height()));
         ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(2) != output->dimension(2));
         ARM_COMPUTE_RETURN_ERROR_ON(rois->dimension(1) != output->dimension(3));
     }
@@ -73,19 +79,28 @@
  * @param[in]  roi_indx       Index of the ROI in the output tensor where the value is stored
  */
 template <typename T>
-void template_eval(const ITensor *input, const ITensor *output, int region_start_x, int region_start_y,
-                   int region_end_x, int region_end_y, int fm, int px, int py, int roi_batch, int roi_indx)
+void template_eval(const ITensor *input,
+                   const ITensor *output,
+                   int            region_start_x,
+                   int            region_start_y,
+                   int            region_end_x,
+                   int            region_end_y,
+                   int            fm,
+                   int            px,
+                   int            py,
+                   int            roi_batch,
+                   int            roi_indx)
 {
-    if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+    if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
     {
         *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = 0;
     }
     else
     {
         T curr_max = std::numeric_limits<T>::lowest(); // Min value of typename T
-        for(int j = region_start_y; j < region_end_y; ++j)
+        for (int j = region_start_y; j < region_end_y; ++j)
         {
-            for(int i = region_start_x; i < region_end_x; ++i)
+            for (int i = region_start_x; i < region_end_x; ++i)
             {
                 const auto val = *reinterpret_cast<const T *>(input->ptr_to_element(Coordinates(i, j, fm, roi_batch)));
                 curr_max       = std::max(val, curr_max);
@@ -93,11 +108,13 @@
         }
 
         // if quantized datatype, requantize then store in output tensor
-        if(is_data_type_quantized(input->info()->data_type()))
+        if (is_data_type_quantized(input->info()->data_type()))
         {
+            // convert qasymm to new output quantization scale and offset
-            UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform());
-            *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) = quantize_qasymm8(curr_max, uqinfo);
+            UniformQuantizationInfo uqinfo = compute_requantization_scale_offset(
+                input->info()->quantization_info().uniform(), output->info()->quantization_info().uniform());
+            *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(px, py, fm, roi_indx))) =
+                quantize_qasymm8(curr_max, uqinfo);
         }
         else
         {
@@ -112,13 +129,19 @@
 {
 }
 
-Status NEROIPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIPoolingLayerKernel::validate(const ITensorInfo         *input,
+                                         const ITensorInfo         *rois,
+                                         const ITensorInfo         *output,
+                                         const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, rois, output, pool_info));
     return Status{};
 }
 
-void NEROIPoolingLayerKernel::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayerKernel::configure(const ITensor             *input,
+                                        const ITensor             *rois,
+                                        const ITensor             *output,
+                                        const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, rois);
 
@@ -126,12 +149,15 @@
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), rois->info(), output->info(), pool_info));
 
     // Output auto initialization if not yet initialized
-    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2), rois->info()->dimension(1));
+    TensorShape output_shape(pool_info.pooled_width(), pool_info.pooled_height(), input->info()->dimension(2),
+                             rois->info()->dimension(1));
 
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), output->info()->quantization_info());
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+                       output->info()->quantization_info());
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) || (output->info()->dimension(1) != pool_info.pooled_height()));
+    ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pool_info.pooled_width()) ||
+                         (output->info()->dimension(1) != pool_info.pooled_height()));
 
     // Set instance variables
     _input     = input;
@@ -167,7 +193,7 @@
     const auto *rois_ptr  = reinterpret_cast<const uint16_t *>(_rois->buffer());
     const auto  data_type = _input->info()->data_type();
 
-    for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
+    for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
     {
         const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
         const auto         x1        = rois_ptr[values_per_roi * roi_indx + 1];
@@ -182,30 +208,35 @@
         const int roi_height   = std::max(support::cpp11::round((y2 - y1) * spatial_scale), 1.f);
 
         // Iterate through all feature maps
-        for(int fm = 0; fm < fms; ++fm)
+        for (int fm = 0; fm < fms; ++fm)
         {
             // Iterate through all output pixels
-            for(int py = 0; py < pooled_h; ++py)
+            for (int py = 0; py < pooled_h; ++py)
             {
-                for(int px = 0; px < pooled_w; ++px)
+                for (int px = 0; px < pooled_w; ++px)
                 {
                     auto region_start_x = static_cast<int>(std::floor((static_cast<float>(px) / pooled_w) * roi_width));
-                    auto region_end_x   = static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
-                    auto region_start_y = static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
-                    auto region_end_y   = static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
+                    auto region_end_x =
+                        static_cast<int>(std::floor((static_cast<float>(px + 1) / pooled_w) * roi_width));
+                    auto region_start_y =
+                        static_cast<int>(std::floor((static_cast<float>(py) / pooled_h) * roi_height));
+                    auto region_end_y =
+                        static_cast<int>(std::floor((static_cast<float>(py + 1) / pooled_h) * roi_height));
 
                     region_start_x = std::min(std::max(region_start_x + roi_anchor_x, 0), width);
                     region_end_x   = std::min(std::max(region_end_x + roi_anchor_x, 0), width);
                     region_start_y = std::min(std::max(region_start_y + roi_anchor_y, 0), height);
                     region_end_y   = std::min(std::max(region_end_y + roi_anchor_y, 0), height);
 
-                    switch(data_type)
+                    switch (data_type)
                     {
                         case DataType::F32:
-                            template_eval<float>(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx);
+                            template_eval<float>(_input, _output, region_start_x, region_start_y, region_end_x,
+                                                 region_end_y, fm, px, py, roi_batch, roi_indx);
                             break;
                         case DataType::QASYMM8:
-                            template_eval<qasymm8_t>(_input, _output, region_start_x, region_start_y, region_end_x, region_end_y, fm, px, py, roi_batch, roi_indx);
+                            template_eval<qasymm8_t>(_input, _output, region_start_x, region_start_y, region_end_x,
+                                                     region_end_y, fm, px, py, roi_batch, roi_indx);
                             break;
                         default:
                             ARM_COMPUTE_ERROR("DataType not Supported");
@@ -216,4 +247,4 @@
         }
     }
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
index e7a7e90..81f6006 100644
--- a/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
+++ b/src/core/NEON/kernels/NEROIPoolingLayerKernel.h
@@ -63,7 +63,8 @@
      * @note The z dimensions of @p output tensor and @p input tensor must be the same.
      * @note The fourth dimension of @p output tensor must be the same as the number of elements in @p rois tensor.
      */
-    void configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
+    void
+    configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
@@ -82,7 +83,10 @@
      *
      * @return a Status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo         *input,
+                           const ITensorInfo         *rois,
+                           const ITensorInfo         *output,
+                           const ROIPoolingLayerInfo &pool_info);
 
 private:
     const ITensor      *_input;
diff --git a/src/core/NEON/kernels/NERangeKernel.cpp b/src/core/NEON/kernels/NERangeKernel.cpp
index ec63a35..87b7b76 100644
--- a/src/core/NEON/kernels/NERangeKernel.cpp
+++ b/src/core/NEON/kernels/NERangeKernel.cpp
@@ -29,11 +29,12 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/common/Registrars.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/kernels/range/list.h"
 
 namespace arm_compute
@@ -55,48 +56,23 @@
     RangeUKernelPtr        ukernel;
 };
 
-static const RangeUKernel available_kernels[] =
-{
-    {
-        "fp16_neon_range",
-        [](const RangeSelectorData & data) { return data.dt == DataType::F16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)
-    },
-    {
-        "f32_neon_range",
-        [](const RangeSelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)
-    },
-    {
-        "u8_neon_range",
-        [](const RangeSelectorData & data) { return data.dt == DataType::U8; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)
-    },
-    {
-        "u16_neon_range",
-        [](const RangeSelectorData & data) { return data.dt == DataType::U16; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)
-    },
-    {
-        "u32_neon_range",
-        [](const RangeSelectorData & data) { return data.dt == DataType::U32; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)
-    },
-    {
-        "s8_neon_range",
-        [](const RangeSelectorData & data) { return data.dt == DataType::S8; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)
-    },
-    {
-        "s16_neon_range",
-        [](const RangeSelectorData & data) { return data.dt == DataType::S16; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)
-    },
-    {
-        "s32_neon_range",
-        [](const RangeSelectorData & data) { return data.dt == DataType::S32; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)
-    },
+static const RangeUKernel available_kernels[] = {
+    {"fp16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_range_function)},
+    {"f32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::F32; },
+     REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_range_function)},
+    {"u8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U8; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_range_function)},
+    {"u16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U16; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::u16_neon_range_function)},
+    {"u32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::U32; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::u32_neon_range_function)},
+    {"s8_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S8; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_range_function)},
+    {"s16_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S16; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_range_function)},
+    {"s32_neon_range", [](const RangeSelectorData &data) { return data.dt == DataType::S32; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::s32_neon_range_function)},
 };
 
 /** Micro-kernel selector
@@ -107,9 +83,9 @@
  */
 const RangeUKernel *get_implementation(const RangeSelectorData &data)
 {
-    for(const auto &uk : available_kernels)
+    for (const auto &uk : available_kernels)
     {
-        if(uk.is_selected(data))
+        if (uk.is_selected(data))
         {
             return &uk;
         }
@@ -119,28 +95,31 @@
 
 Status validate_arguments(const ITensorInfo &output, const float start, const float end, const float step)
 {
-    const auto *uk = get_implementation(RangeSelectorData{ output.data_type() });
+    const auto *uk = get_implementation(RangeSelectorData{output.data_type()});
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start < end) && (step <= 0)), "step must be greater than 0 when start < end");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(((start > end) && (step >= 0)), "step must be less than 0 when start > end");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()), "start value is outside the range of the data type");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()), "end value is outside the range of the data type");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()), "step value is outside the range of the data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(start, output.data_type(), output.quantization_info()),
+                                    "start value is outside the range of the data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(end, output.data_type(), output.quantization_info()),
+                                    "end value is outside the range of the data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!check_value_range(step, output.data_type(), output.quantization_info()),
+                                    "step value is outside the range of the data type");
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((start == end), "start of the requested sequence must not be equal to the end");
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.num_dimensions() != 1, "Output has to be a 1-D tensor");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step), "Output tensor size is incorrect");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output.tensor_shape().total_size() < num_of_elements_in_range(start, end, step),
+                                    "Output tensor size is incorrect");
 
     return Status{};
 }
 } // namespace
 
-NERangeKernel::NERangeKernel()
-    : _start(0), _end(1), _step(1), _output(nullptr)
+NERangeKernel::NERangeKernel() : _start(0), _end(1), _step(1), _output(nullptr)
 {
 }
 
@@ -151,7 +130,8 @@
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*(output->info()), start, end, step));
 
     // Auto initialize output if not initialized
-    auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1, output->info()->data_type(), output->info()->quantization_info());
+    auto_init_if_empty(*output->info(), TensorShape(num_of_elements_in_range(start, end, step)), 1,
+                       output->info()->data_type(), output->info()->quantization_info());
 
     // Configure kernel window
     Window win = calculate_max_window(*output->info(), Steps());
@@ -178,7 +158,7 @@
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    const auto *uk = get_implementation(RangeSelectorData{ _output->info()->data_type() });
+    const auto *uk = get_implementation(RangeSelectorData{_output->info()->data_type()});
 
     uk->ukernel(_output, _start, _step, window);
 }
diff --git a/src/core/NEON/kernels/NERangeKernel.h b/src/core/NEON/kernels/NERangeKernel.h
index 9056099..fa555c2 100644
--- a/src/core/NEON/kernels/NERangeKernel.h
+++ b/src/core/NEON/kernels/NERangeKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NERANGEKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
index 19955af..455d604 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp
@@ -28,16 +28,17 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/CPP/Validate.h"
-#include "src/core/NEON/INEKernel.h"
-#include "src/core/NEON/NEMath.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/INEKernel.h"
+#include "src/core/NEON/NEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "support/SaturateCast.h"
 
-#include "src/core/NEON/wrapper/wrapper.h"
 #include <arm_neon.h>
 
 namespace arm_compute
@@ -48,7 +49,7 @@
 template <typename T>
 void combine_and_store(int16x8_t t1, int16x8_t t2, Iterator &output, int offset = 0)
 {
-    if(std::is_same<T, uint8_t>::value)
+    if (std::is_same<T, uint8_t>::value)
     {
         auto res = wrapper::vcombine(wrapper::vqmovun(t1), wrapper::vqmovun(t2));
         wrapper::vstore(output.ptr() + offset, res);
@@ -63,8 +64,8 @@
 template <typename T>
 uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
 {
-    uint32x4_t mask{ 0 };
-    if(op == ReductionOperation::ARG_IDX_MIN)
+    uint32x4_t mask{0};
+    if (op == ReductionOperation::ARG_IDX_MIN)
     {
         mask = wrapper::vcgt(b, a);
     }
@@ -73,12 +74,12 @@
         mask = wrapper::vclt(b, a);
     }
 
-    uint32x4_t vec_idx = { idx, idx + 1, idx + 2, idx + 3 };
-    if(axis != 0)
+    uint32x4_t vec_idx = {idx, idx + 1, idx + 2, idx + 3};
+    if (axis != 0)
     {
         vec_idx = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
     }
-    uint32x4x4_t res = { { wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0 } };
+    uint32x4x4_t res = {{wrapper::vbsl(mask, vec_idx, c.val[0]), 0, 0, 0}};
 
     return res;
 }
@@ -86,9 +87,9 @@
 template <typename T>
 uint32x4x4_t calculate_index_quantized(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOperation op, int axis)
 {
-    uint32x4x4_t mask{ { 0 } };
-    uint8x16_t   mask_u8{ 0 };
-    if(op == ReductionOperation::ARG_IDX_MIN)
+    uint32x4x4_t mask{{0}};
+    uint8x16_t   mask_u8{0};
+    if (op == ReductionOperation::ARG_IDX_MIN)
     {
         mask_u8 = wrapper::vcgt(b, a);
     }
@@ -96,44 +97,43 @@
     {
         mask_u8 = wrapper::vclt(b, a);
     }
-    auto wide_u16_1 = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
-    auto wide_u16_2 = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
-    mask.val[0]     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
-    mask.val[1]     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
-    mask.val[2]     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
-    mask.val[3]     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+    auto wide_u16_1 =
+        wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+    auto wide_u16_2 =
+        wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+    mask.val[0] =
+        wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+    mask.val[1] =
+        wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+    mask.val[2] =
+        wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+    mask.val[3] =
+        wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
 
-    uint32x4x4_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
-            { idx + 4, idx + 5, idx + 6, idx + 7 },
-            { idx + 8, idx + 9, idx + 10, idx + 11 },
-            { idx + 12, idx + 13, idx + 14, idx + 15 }
-        }
-    };
-    if(axis != 0)
+    uint32x4x4_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3},
+                             {idx + 4, idx + 5, idx + 6, idx + 7},
+                             {idx + 8, idx + 9, idx + 10, idx + 11},
+                             {idx + 12, idx + 13, idx + 14, idx + 15}}};
+    if (axis != 0)
     {
         vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
         vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
         vec_idx.val[2] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
         vec_idx.val[3] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
     }
-    uint32x4x4_t res =
-    {
-        {
-            vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]),
-            vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
-            vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]),
-            vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])
-        }
-    };
+    uint32x4x4_t res = {
+        {vbslq_u32(mask.val[0], vec_idx.val[0], c.val[0]), vbslq_u32(mask.val[1], vec_idx.val[1], c.val[1]),
+         vbslq_u32(mask.val[2], vec_idx.val[2], c.val[2]), vbslq_u32(mask.val[3], vec_idx.val[3], c.val[3])}};
 
     return res;
 }
 
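The widening sequence above turns the 16-lane 0x00/0xFF byte mask produced by vcgt/vclt into four all-zero/all-one 32-bit lane masks, which then drive the per-lane index selection in vbslq_u32. A minimal standalone sketch of the same trick with raw NEON intrinsics, assuming an AArch64 toolchain (widen_mask_u8_to_u32 is an illustrative name, not a library helper):

#include <arm_neon.h>

// vshll_n_u8(m, 8) puts each mask byte in the high half of a u16 lane;
// OR-ing with the zero-extending vmovl keeps the low half, so a 0xFF byte
// becomes 0xFFFF and a 0x00 byte stays 0x0000. Repeating the step on the
// u16 lanes yields 0x00000000/0xFFFFFFFF masks fit for vbslq_u32.
static inline uint32x4x4_t widen_mask_u8_to_u32(uint8x16_t mask_u8)
{
    const uint16x8_t w1 = vorrq_u16(vshll_n_u8(vget_low_u8(mask_u8), 8), vmovl_u8(vget_low_u8(mask_u8)));
    const uint16x8_t w2 = vorrq_u16(vshll_n_u8(vget_high_u8(mask_u8), 8), vmovl_u8(vget_high_u8(mask_u8)));
    uint32x4x4_t mask;
    mask.val[0] = vorrq_u32(vshll_n_u16(vget_low_u16(w1), 16), vmovl_u16(vget_low_u16(w1)));
    mask.val[1] = vorrq_u32(vshll_n_u16(vget_high_u16(w1), 16), vmovl_u16(vget_high_u16(w1)));
    mask.val[2] = vorrq_u32(vshll_n_u16(vget_low_u16(w2), 16), vmovl_u16(vget_low_u16(w2)));
    mask.val[3] = vorrq_u32(vshll_n_u16(vget_high_u16(w2), 16), vmovl_u16(vget_high_u16(w2)));
    return mask;
}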
 // Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
 template <typename T>
-inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
-       typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type
-       calculate_min(T in)
+inline typename std::enable_if<
+    std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
+    typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type
+calculate_min(T in)
 {
     auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
     return wrapper::vpmin(pmin, pmin);
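calculate_min folds the 128-bit vector down with pairwise minima: one vpmin halves the problem, a second leaves the global minimum in every lane. The same idea as a self-contained snippet (horizontal_min_f32 is an illustrative name, not the library's):

#include <arm_neon.h>

// Pass 1 gives {min(v2,v3), min(v0,v1)}; pass 2 leaves the global minimum
// in both lanes, so lane 0 is the horizontal min of the input vector.
static inline float horizontal_min_f32(float32x4_t v)
{
    float32x2_t pmin = vpmin_f32(vget_high_f32(v), vget_low_f32(v));
    pmin             = vpmin_f32(pmin, pmin);
    return vget_lane_f32(pmin, 0);
}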
@@ -141,9 +141,10 @@
 
 // Helper function to calculate the minimum value of the input vector. All the elements in the output vector contain the min value.
 template <typename T>
-inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
-       typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type
-       calculate_min(T in)
+inline typename std::enable_if<
+    std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
+    typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type
+calculate_min(T in)
 {
     auto pmin = wrapper::vpmin(wrapper::vgethigh(in), wrapper::vgetlow(in));
     pmin      = wrapper::vpmin(pmin, pmin);
@@ -153,9 +154,10 @@
 
 // Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
 template <typename T>
-inline typename std::enable_if < std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
-       typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type >::type
-       calculate_max(T in)
+inline typename std::enable_if<
+    std::is_same<T, float32x4_t>::value || std::is_same<T, int32x4_t>::value,
+    typename std::conditional<std::is_same<T, float32x4_t>::value, float32x2_t, int32x2_t>::type>::type
+calculate_max(T in)
 {
     auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
     return wrapper::vpmax(pmax, pmax);
@@ -163,9 +165,10 @@
 
 // Helper function to calculate the maximum value of the input vector. All the elements in the output vector contain the max value.
 template <typename T>
-inline typename std::enable_if < std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
-       typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type >::type
-       calculate_max(T in)
+inline typename std::enable_if<
+    std::is_same<T, uint8x16_t>::value || std::is_same<T, int8x16_t>::value,
+    typename std::conditional<std::is_same<T, uint8x16_t>::value, uint8x8_t, int8x8_t>::type>::type
+calculate_max(T in)
 {
     auto pmax = wrapper::vpmax(wrapper::vgethigh(in), wrapper::vgetlow(in));
     pmax      = wrapper::vpmax(pmax, pmax);
@@ -176,10 +179,10 @@
 template <typename T>
 uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
 {
-    uint32x4_t res_idx_mask{ 0 };
+    uint32x4_t res_idx_mask{0};
     uint32x4_t mask_ones = vdupq_n_u32(0xFFFFFFFF);
 
-    if(op == ReductionOperation::ARG_IDX_MIN)
+    if (op == ReductionOperation::ARG_IDX_MIN)
     {
         auto pmin    = calculate_min(vec_res_value);
         auto mask    = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
@@ -203,10 +206,10 @@
 template <typename T>
 uint32_t calculate_vector_index_quantized(uint32x4x4_t vec_res_idx, T vec_res_value, ReductionOperation op)
 {
-    uint32x4x4_t res_idx_mask{ { 0 } };
+    uint32x4x4_t res_idx_mask{{0}};
     uint32x4_t   mask_ones = vdupq_n_u32(0xFFFFFFFF);
-    uint8x16_t   mask_u8{ 0 };
-    if(op == ReductionOperation::ARG_IDX_MIN)
+    uint8x16_t   mask_u8{0};
+    if (op == ReductionOperation::ARG_IDX_MIN)
     {
         auto pmin = calculate_min(vec_res_value);
         mask_u8   = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
@@ -218,12 +221,18 @@
     }
 
     // Widen vectors
-    auto wide_u16_1     = wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
-    auto wide_u16_2     = wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
-    auto wide_u32_1     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
-    auto wide_u32_2     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
-    auto wide_u32_3     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
-    auto wide_u32_4     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
+    auto wide_u16_1 =
+        wrapper::vorr(vshll_n_u8(wrapper::vgetlow(mask_u8), 8), wrapper::vmovl(wrapper::vgetlow(mask_u8)));
+    auto wide_u16_2 =
+        wrapper::vorr(vshll_n_u8(wrapper::vgethigh(mask_u8), 8), wrapper::vmovl(wrapper::vgethigh(mask_u8)));
+    auto wide_u32_1 =
+        wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_1), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_1)));
+    auto wide_u32_2 =
+        wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_1), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_1)));
+    auto wide_u32_3 =
+        wrapper::vorr(vshll_n_u16(wrapper::vgetlow(wide_u16_2), 16), wrapper::vmovl(wrapper::vgetlow(wide_u16_2)));
+    auto wide_u32_4 =
+        wrapper::vorr(vshll_n_u16(wrapper::vgethigh(wide_u16_2), 16), wrapper::vmovl(wrapper::vgethigh(wide_u16_2)));
     res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
     res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
     res_idx_mask.val[2] = wrapper::vand(vec_res_idx.val[2], wide_u32_3);
@@ -241,19 +250,19 @@
         pmin      = wrapper::vpmin(pmin, pmin);
         res       = std::min(wrapper::vgetlane(pmin, 0), res);
         iter++;
-    }
-    while(iter < 4);
+    } while (iter < 4);
 
     return (res - 0xFFFFFFFF);
 }
 
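The `res - 0xFFFFFFFF` at the end of calculate_vector_index_quantized undoes a deliberate bias: after AND-ing each index vector with its mask, 0xFFFFFFFF is added so that non-matching lanes become 0xFFFFFFFF while matching lanes become index - 1, letting the pairwise minimum pick the smallest matching index. A scalar sketch of the same arithmetic (illustrative code, not the library's):

#include <algorithm>
#include <cstdint>

// A matching lane (mask all-ones) carries its index; others carry zero.
// Adding 0xFFFFFFFF mod 2^32 maps index i to i - 1 and zero to 0xFFFFFFFF,
// so the minimum selects the smallest matching index; subtracting
// 0xFFFFFFFF (i.e. adding 1) recovers it.
static uint32_t smallest_matching_index(const uint32_t idx[4], const uint32_t mask[4])
{
    uint32_t res = 0xFFFFFFFF;
    for (int i = 0; i < 4; ++i)
    {
        res = std::min(res, (idx[i] & mask[i]) + 0xFFFFFFFFu); // wraps mod 2^32
    }
    return res - 0xFFFFFFFFu; // == res + 1 (mod 2^32)
}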
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 template <>
-uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
+uint32x4x4_t
+calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis)
 {
-    uint32x4x2_t mask{ 0 };
-    uint16x8_t   mask_u16{ 0 };
-    if(op == ReductionOperation::ARG_IDX_MIN)
+    uint32x4x2_t mask{0};
+    uint16x8_t   mask_u16{0};
+    if (op == ReductionOperation::ARG_IDX_MIN)
     {
         mask_u16 = wrapper::vcgt(b, a);
     }
@@ -263,19 +272,14 @@
     }
     mask.val[0]          = wrapper::vmovl(wrapper::vgetlow(mask_u16));
     mask.val[1]          = wrapper::vmovl(wrapper::vgethigh(mask_u16));
-    uint32x4x2_t vec_idx = { { { idx + 0, idx + 1, idx + 2, idx + 3 },
-            { idx + 4, idx + 5, idx + 6, idx + 7 }
-        }
-    };
-    if(axis != 0)
+    uint32x4x2_t vec_idx = {{{idx + 0, idx + 1, idx + 2, idx + 3}, {idx + 4, idx + 5, idx + 6, idx + 7}}};
+    if (axis != 0)
     {
         vec_idx.val[0] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
         vec_idx.val[1] = wrapper::vdup_n(idx, wrapper::traits::vector_128_tag{});
     }
-    uint32x4x4_t res = { wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
-                         wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]),
-                         0, 0
-                       };
+    uint32x4x4_t res = {wrapper::vbsl(mask.val[0], vec_idx.val[0], c.val[0]),
+                        wrapper::vbsl(mask.val[1], vec_idx.val[1], c.val[1]), 0, 0};
 
     return res;
 }
@@ -298,10 +302,10 @@
 template <>
 uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, float16x8_t vec_res_value, ReductionOperation op)
 {
-    uint32x4x2_t res_idx_mask{ 0 };
+    uint32x4x2_t res_idx_mask{0};
     uint32x4_t   mask_ones = vdupq_n_u32(0xFFFFFFFF);
     uint16x8_t   mask_u16;
-    if(op == ReductionOperation::ARG_IDX_MIN)
+    if (op == ReductionOperation::ARG_IDX_MIN)
     {
         auto pmin = calculate_min(vec_res_value);
         mask_u16  = wrapper::vceq(vec_res_value, wrapper::vcombine(pmin, pmin));
@@ -313,8 +317,10 @@
     }
 
     // Widen vectors
-    auto wide_u32_1     = wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
-    auto wide_u32_2     = wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
+    auto wide_u32_1 =
+        wrapper::vorr(vshll_n_u16(wrapper::vgetlow(mask_u16), 8), wrapper::vmovl(wrapper::vgetlow(mask_u16)));
+    auto wide_u32_2 =
+        wrapper::vorr(vshll_n_u16(wrapper::vgethigh(mask_u16), 8), wrapper::vmovl(wrapper::vgethigh(mask_u16)));
     res_idx_mask.val[0] = wrapper::vand(vec_res_idx.val[0], wide_u32_1);
     res_idx_mask.val[1] = wrapper::vand(vec_res_idx.val[1], wide_u32_2);
     res_idx_mask.val[0] = wrapper::vadd(res_idx_mask.val[0], mask_ones);
@@ -328,8 +334,7 @@
         pmin      = wrapper::vpmin(pmin, pmin);
         res       = std::min(wrapper::vgetlane(pmin, 0), res);
         iter++;
-    }
-    while(iter < 2);
+    } while (iter < 2);
 
     return (res - 0xFFFFFFFF);
 }
@@ -388,7 +393,8 @@
     /** SIMD vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
 
-    inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
+    inline void operator()(
+        const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
     {
         const size_t input_dim_0    = in->info()->dimension(0);
         const int    window_step_x  = 16 / sizeof(T);
@@ -402,211 +408,217 @@
         Iterator output(out, out_window);
 
         execute_window_loop(
-            in_win_no_pad, [&](const Coordinates &)
-        {
-            const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
-
-            auto init_res_value = static_cast<T>(0.f);
-            switch(op)
+            in_win_no_pad,
+            [&](const Coordinates &)
             {
-                case ReductionOperation::ARG_IDX_MAX:
-                case ReductionOperation::ARG_IDX_MIN:
-                case ReductionOperation::MIN:
-                case ReductionOperation::MAX:
-                {
-                    init_res_value = static_cast<T>(*input_ptr);
-                    break;
-                }
-                case ReductionOperation::PROD:
-                {
-                    init_res_value = static_cast<T>(1.f);
-                    break;
-                }
-                default:
-                    break;
-            }
-            auto         vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
-            uint32x4x4_t vec_res_idx{ { 0 } };
+                const auto input_ptr = reinterpret_cast<const T *>(input.ptr());
 
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vec_elements = wrapper::vloadq(input_ptr + x);
-                switch(op)
+                auto init_res_value = static_cast<T>(0.f);
+                switch (op)
                 {
-                    case ReductionOperation::SUM_SQUARE:
-                        vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+                    case ReductionOperation::ARG_IDX_MAX:
+                    case ReductionOperation::ARG_IDX_MIN:
+                    case ReductionOperation::MIN:
+                    case ReductionOperation::MAX:
+                    {
+                        init_res_value = static_cast<T>(*input_ptr);
                         break;
-                    case ReductionOperation::MEAN_SUM:
-                    case ReductionOperation::SUM:
-                        vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
-                        break;
+                    }
                     case ReductionOperation::PROD:
-                        vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+                    {
+                        init_res_value = static_cast<T>(1.f);
                         break;
+                    }
+                    default:
+                        break;
+                }
+                auto         vec_res_value = wrapper::vdup_n(init_res_value, ExactTagType{});
+                uint32x4x4_t vec_res_idx{{0}};
+
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const auto vec_elements = wrapper::vloadq(input_ptr + x);
+                    switch (op)
+                    {
+                        case ReductionOperation::SUM_SQUARE:
+                            vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+                            break;
+                        case ReductionOperation::MEAN_SUM:
+                        case ReductionOperation::SUM:
+                            vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+                            break;
+                        case ReductionOperation::PROD:
+                            vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+                            break;
+                        case ReductionOperation::ARG_IDX_MIN:
+                        {
+                            auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                            vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value,
+                                                                                   vec_res_idx, op, 0);
+                            vec_res_value = temp_vec_res_value;
+                            break;
+                        }
+                        case ReductionOperation::ARG_IDX_MAX:
+                        {
+                            auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                            vec_res_idx = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value,
+                                                                                   vec_res_idx, op, 0);
+                            vec_res_value = temp_vec_res_value;
+                            break;
+                        }
+                        case ReductionOperation::MIN:
+                        {
+                            vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                            break;
+                        }
+                        case ReductionOperation::MAX:
+                        {
+                            vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                            break;
+                        }
+                        default:
+                            ARM_COMPUTE_ERROR("Not supported");
+                    }
+                }
+
+                switch (op)
+                {
+                    case ReductionOperation::SUM:
+                    case ReductionOperation::MEAN_SUM:
+                    case ReductionOperation::SUM_SQUARE:
+                    {
+#ifdef ARM_COMPUTE_DEBUG_ENABLED
+                        auto res = static_cast<T>(0.f);
+                        for (int i = 0; i < S; ++i)
+                        {
+                            res += wrapper::vgetlane(vec_res_value, i);
+                        }
+#else  // ARM_COMPUTE_DEBUG_ENABLED
+                        auto carry_res =
+                            wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+                        for (int i = 0; i < S / 4; ++i)
+                        {
+                            carry_res = wrapper::vpadd(carry_res, carry_res);
+                        }
+                        auto res = wrapper::vgetlane(carry_res, 0);
+#endif // ARM_COMPUTE_DEBUG_ENABLED
+                        if (op == ReductionOperation::SUM_SQUARE)
+                        {
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                res += (*(input_ptr + x)) * (*(input_ptr + x));
+                            }
+                        }
+                        else
+                        {
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                res += *(input_ptr + x);
+                            }
+                        }
+
+                        if (op == ReductionOperation::MEAN_SUM)
+                        {
+                            res /= input_dim_0;
+                        }
+
+                        *(reinterpret_cast<T *>(output.ptr())) = res;
+                        break;
+                    }
+                    case ReductionOperation::PROD:
+                    {
+                        auto carry_res =
+                            wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
+                        T res = 1;
+                        for (int i = 0; i < S / 2; ++i)
+                        {
+                            res *= wrapper::vgetlane(carry_res, i);
+                        }
+
+                        // Compute left-over elements
+                        for (; x < window_end_x; ++x)
+                        {
+                            res *= *(input_ptr + x);
+                        }
+
+                        *(reinterpret_cast<T *>(output.ptr())) = res;
+                        break;
+                    }
                     case ReductionOperation::ARG_IDX_MIN:
                     {
-                        auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
-                        vec_res_idx             = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
-                        vec_res_value           = temp_vec_res_value;
+                        auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+                        auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+                        // Compute left-over elements
+                        for (; x < window_end_x; ++x)
+                        {
+                            if (*(input_ptr + x) < res)
+                            {
+                                idx = x;
+                                res = *(input_ptr + x);
+                            }
+                        }
+                        *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
                         break;
                     }
                     case ReductionOperation::ARG_IDX_MAX:
                     {
-                        auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
-                        vec_res_idx             = calculate_index<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
-                        vec_res_value           = temp_vec_res_value;
+                        auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+                        auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+                        // Compute left-over elements
+                        for (; x < window_end_x; ++x)
+                        {
+                            if (*(input_ptr + x) > res)
+                            {
+                                idx = x;
+                                res = *(input_ptr + x);
+                            }
+                        }
+                        *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
                         break;
                     }
                     case ReductionOperation::MIN:
                     {
-                        vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                        auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+                        // Compute left-over elements
+                        for (; x < window_end_x; ++x)
+                        {
+                            res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
+                        }
+                        *(reinterpret_cast<T *>(output.ptr())) = res;
                         break;
                     }
                     case ReductionOperation::MAX:
                     {
-                        vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                        auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+                        // Compute left-over elements
+                        for (; x < window_end_x; ++x)
+                        {
+                            res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
+                        }
+                        *(reinterpret_cast<T *>(output.ptr())) = res;
                         break;
                     }
                     default:
                         ARM_COMPUTE_ERROR("Not supported");
                 }
-            }
-
-            switch(op)
-            {
-                case ReductionOperation::SUM:
-                case ReductionOperation::MEAN_SUM:
-                case ReductionOperation::SUM_SQUARE:
-                {
-#ifdef ARM_COMPUTE_DEBUG_ENABLED
-                    auto res = static_cast<T>(0.f);
-                    for(int i = 0; i < S; ++i)
-                    {
-                        res += wrapper::vgetlane(vec_res_value, i);
-                    }
-#else  // ARM_COMPUTE_DEBUG_ENABLED
-                    auto carry_res = wrapper::vpadd(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
-                    for(int i = 0; i < S / 4; ++i)
-                    {
-                        carry_res = wrapper::vpadd(carry_res, carry_res);
-                    }
-                    auto res = wrapper::vgetlane(carry_res, 0);
-#endif // ARM_COMPUTE_DEBUG_ENABLED
-                    if(op == ReductionOperation::SUM_SQUARE)
-                    {
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            res += (*(input_ptr + x)) * (*(input_ptr + x));
-                        }
-                    }
-                    else
-                    {
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            res += *(input_ptr + x);
-                        }
-                    }
-
-                    if(op == ReductionOperation::MEAN_SUM)
-                    {
-                        res /= input_dim_0;
-                    }
-
-                    *(reinterpret_cast<T *>(output.ptr())) = res;
-                    break;
-                }
-                case ReductionOperation::PROD:
-                {
-                    auto carry_res = wrapper::vmul(wrapper::vgethigh(vec_res_value), wrapper::vgetlow(vec_res_value));
-                    T    res       = 1;
-                    for(int i = 0; i < S / 2; ++i)
-                    {
-                        res *= wrapper::vgetlane(carry_res, i);
-                    }
-
-                    // Compute left-over elements
-                    for(; x < window_end_x; ++x)
-                    {
-                        res *= *(input_ptr + x);
-                    }
-
-                    *(reinterpret_cast<T *>(output.ptr())) = res;
-                    break;
-                }
-                case ReductionOperation::ARG_IDX_MIN:
-                {
-                    auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
-                    auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
-                    // Compute left-over elements
-                    for(; x < window_end_x; ++x)
-                    {
-                        if(*(input_ptr + x) < res)
-                        {
-                            idx = x;
-                            res = *(input_ptr + x);
-                        }
-                    }
-                    *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
-                    break;
-                }
-                case ReductionOperation::ARG_IDX_MAX:
-                {
-                    auto idx = calculate_vector_index<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
-                    auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
-                    // Compute left-over elements
-                    for(; x < window_end_x; ++x)
-                    {
-                        if(*(input_ptr + x) > res)
-                        {
-                            idx = x;
-                            res = *(input_ptr + x);
-                        }
-                    }
-                    *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
-                    break;
-                }
-                case ReductionOperation::MIN:
-                {
-                    auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
-                    // Compute left-over elements
-                    for(; x < window_end_x; ++x)
-                    {
-                        res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
-                    }
-                    *(reinterpret_cast<T *>(output.ptr())) = res;
-                    break;
-                }
-                case ReductionOperation::MAX:
-                {
-                    auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
-                    // Compute left-over elements
-                    for(; x < window_end_x; ++x)
-                    {
-                        res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
-                    }
-                    *(reinterpret_cast<T *>(output.ptr())) = res;
-                    break;
-                }
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        },
-        input, output);
+            },
+            input, output);
     }
 };
 
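RedOpX shows the loop shape used throughout this file: a vector body consuming window_step_x elements per iteration, a pairwise horizontal reduction of the accumulator, then a scalar loop over the left-over tail. The same shape as a self-contained float sum, assuming NEON (sum_with_tail is an illustrative name):

#include <arm_neon.h>

// The vector body handles len rounded down to a multiple of 4; the scalar
// tail mirrors the "Compute left-over elements" loops above.
static float sum_with_tail(const float *ptr, int len)
{
    float32x4_t acc = vdupq_n_f32(0.f);
    int         x   = 0;
    for (; x <= len - 4; x += 4)
    {
        acc = vaddq_f32(acc, vld1q_f32(ptr + x));
    }
    float32x2_t pair = vpadd_f32(vget_high_f32(acc), vget_low_f32(acc));
    pair             = vpadd_f32(pair, pair);
    float res        = vget_lane_f32(pair, 0);
    for (; x < len; ++x)
    {
        res += ptr[x];
    }
    return res;
}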
 template <typename T>
 struct RedOpX_quantized
 {
-    inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
+    inline void operator()(
+        const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, const ReductionOperation op)
     {
         using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
 
@@ -637,335 +649,102 @@
         const float B = out_offset - (in_scale * in_offset) / (out_scale);
 
         execute_window_loop(
-            in_win_no_pad, [&](const Coordinates &)
-        {
-            const auto input_ptr = reinterpret_cast<T *>(input.ptr());
-
-            auto vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
-            auto vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
-            auto vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
-            auto vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
-
-            auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
-            auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
-            auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
-            auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
-
-            typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = { 0 };
-
-            if(op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::MIN || op == ReductionOperation::MAX)
+            in_win_no_pad,
+            [&](const Coordinates &)
             {
-                vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
-            }
+                const auto input_ptr = reinterpret_cast<T *>(input.ptr());
 
-            uint32x4x4_t vec_res_idx{ { 0 } };
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const auto vec_elements = wrapper::vloadq(input_ptr + x);
-                switch(op)
+                auto vec_res_value1 =
+                    wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+                auto vec_res_value2 =
+                    wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+                auto vec_res_value3 =
+                    wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+                auto vec_res_value4 =
+                    wrapper::vdup_n(static_cast<PromotedType>(0.f), wrapper::traits::vector_128_tag{});
+
+                auto vec_res_value1_f = vdupq_n_f32(static_cast<float>(1.f));
+                auto vec_res_value2_f = vdupq_n_f32(static_cast<float>(1.f));
+                auto vec_res_value3_f = vdupq_n_f32(static_cast<float>(1.f));
+                auto vec_res_value4_f = vdupq_n_f32(static_cast<float>(1.f));
+
+                typename wrapper::traits::neon_vector<T, 16>::type vec_res_value = {0};
+
+                if (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN ||
+                    op == ReductionOperation::MIN || op == ReductionOperation::MAX)
                 {
-                    case ReductionOperation::SUM:
-                    case ReductionOperation::MEAN_SUM:
-                    {
-                        const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
-                        const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
-                        const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
-                        const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
-                        const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
-                        const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
-                        vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
-                        vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
-                        vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
-                        vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
-                        break;
-                    }
-                    case ReductionOperation::PROD:
-                    {
-                        const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
-                        const auto scale32x4f_4  = vdupq_n_f32(iq_info.scale);
-
-                        const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
-                        const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
-
-                        const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
-                        const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
-                        const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
-                        const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
-
-                        auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
-                        auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
-                        auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
-                        auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
-
-                        //de-quantize vec_elements
-                        temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
-                        temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
-                        temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
-                        temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
-
-                        vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
-                        vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
-                        vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
-                        vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
-                        break;
-                    }
-                    case ReductionOperation::ARG_IDX_MIN:
-                    {
-                        auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
-                        vec_res_idx             = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
-                        vec_res_value           = temp_vec_res_value;
-                        break;
-                    }
-                    case ReductionOperation::ARG_IDX_MAX:
-                    {
-                        auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
-                        vec_res_idx             = calculate_index_quantized<decltype(vec_res_value)>(x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
-                        vec_res_value           = temp_vec_res_value;
-                        break;
-                    }
-                    case ReductionOperation::MIN:
-                    {
-                        vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
-                        break;
-                    }
-                    case ReductionOperation::MAX:
-                    {
-                        vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
-                        break;
-                    }
-                    default:
-                        ARM_COMPUTE_ERROR("Not supported");
+                    vec_res_value = wrapper::vdup_n(*input_ptr, wrapper::traits::vector_128_tag{});
                 }
-            }
 
-            switch(op)
-            {
-                case ReductionOperation::ARG_IDX_MIN:
+                uint32x4x4_t vec_res_idx{{0}};
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
-                    auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
-                    // Compute left-over elements
-                    for(; x < window_end_x; ++x)
-                    {
-                        if(*(input_ptr + x) < res)
-                        {
-                            idx = x;
-                            res = *(input_ptr + x);
-                        }
-                    }
-                    *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
-                    break;
-                }
-                case ReductionOperation::ARG_IDX_MAX:
-                {
-                    auto idx = calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
-                    auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
-                    // Compute left-over elements
-                    for(; x < window_end_x; ++x)
-                    {
-                        if(*(input_ptr + x) > res)
-                        {
-                            idx = x;
-                            res = *(input_ptr + x);
-                        }
-                    }
-                    *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
-                    break;
-                }
-                case ReductionOperation::MIN:
-                {
-                    auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
-
-                    // Compute left-over elements
-                    for(; x < window_end_x; ++x)
-                    {
-                        res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
-                    }
-                    *(reinterpret_cast<T *>(output.ptr())) = res;
-                    break;
-                }
-                case ReductionOperation::MAX:
-                {
-                    auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
-
-                    // Compute left-over elements
-                    for(; x < window_end_x; ++x)
-                    {
-                        res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
-                    }
-                    *(reinterpret_cast<T *>(output.ptr())) = res;
-                    break;
-                }
-                case ReductionOperation::PROD:
-                {
-                    auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
-                    carry_res      = wrapper::vmul(carry_res, vec_res_value3_f);
-                    carry_res      = wrapper::vmul(carry_res, vec_res_value4_f);
-
-                    float res = wrapper::vgetlane(carry_res, 0);
-                    res *= wrapper::vgetlane(carry_res, 1);
-                    res *= wrapper::vgetlane(carry_res, 2);
-                    res *= wrapper::vgetlane(carry_res, 3);
-
-                    // Compute left-over elements
-                    for(; x < window_end_x; ++x)
-                    {
-                        //de-quantize input
-                        if(std::is_same<T, uint8_t>::value)
-                        {
-                            res *= dequantize_qasymm8(*(input_ptr + x), iq_info);
-                        }
-                        else
-                        {
-                            res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info);
-                        }
-                    }
-
-                    //re-quantize result
-                    if(std::is_same<T, uint8_t>::value)
-                    {
-                        res = quantize_qasymm8(res, iq_info);
-                    }
-                    else
-                    {
-                        res = quantize_qasymm8_signed(res, iq_info);
-                    }
-
-                    *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res);
-                    break;
-                }
-                case ReductionOperation::SUM:
-                case ReductionOperation::MEAN_SUM:
-                {
-                    auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
-                    carry_res      = wrapper::vadd(carry_res, vec_res_value3);
-                    carry_res      = wrapper::vadd(carry_res, vec_res_value4);
-
-                    auto carry_paddition = wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
-                    carry_paddition      = wrapper::vpadd(carry_paddition, carry_paddition);
-                    auto res             = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));
-
-                    // Compute left-over elements
-                    for(; x < window_end_x; ++x)
-                    {
-                        res += *(input_ptr + x);
-                    }
-
-                    if(op == ReductionOperation::MEAN_SUM)
-                    {
-                        const int32_t resFinal = A * (static_cast<float>(res)) + B;
-
-                        *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal);
-                    }
-                    else
-                    {
-                        // Subtract accumulated offsets
-                        res -= (in_info.dimension(0) - 1) * iq_info.offset;
-                        *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
-                    }
-
-                    break;
-                }
-                default:
-                    ARM_COMPUTE_ERROR("Not supported");
-            }
-        },
-        input, output);
-    }
-};
-
-template <typename T, int S>
-struct RedOpYZW
-{
-    /** SIMD vector tag type. */
-    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
-    using neon_vector  = typename wrapper::traits::neon_vector<T, S>::type;
-
-    inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
-    {
-        const TensorInfo in_info            = *(in->info());
-        const int        window_step_x      = 16 / sizeof(T);
-        const auto       window_start_x_tmp = static_cast<int>(in_window.x().start());
-        const auto       window_end_x_tmp   = static_cast<int>(in_window.x().end());
-        // As it is split over the x-axis, we need to set the correct split window start and end.
-        const auto window_start_x = static_cast<int>(0);
-        const auto window_end_x   = static_cast<int>(in_window.shape().x());
-
-        Window in_win_no_pad = in_window;
-        in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
-        Window out_win_no_pad = out_window;
-        out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
-
-        Iterator input(in, in_win_no_pad);
-        Iterator output(out, out_win_no_pad);
-
-        execute_window_loop(
-            in_win_no_pad, [&](const Coordinates &)
-        {
-            const auto input_ptr = reinterpret_cast<T *>(input.ptr());
-
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                neon_vector vec_res_value = { 0 };
-                switch(op)
-                {
-                    case ReductionOperation::ARG_IDX_MAX:
-                    case ReductionOperation::ARG_IDX_MIN:
-                    case ReductionOperation::MIN:
-                    case ReductionOperation::MAX:
-                    {
-                        vec_res_value = wrapper::vloadq(input_ptr + x);
-                        break;
-                    }
-                    case ReductionOperation::PROD:
-                    {
-                        vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
-                        break;
-                    }
-                    default:
-                    {
-                        vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-                        break;
-                    }
-                }
-                uint32x4x4_t vec_res_idx{ { 0 } };
-
-                for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
-                {
-                    const T   *in_ptr       = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
-                    const auto vec_elements = wrapper::vloadq(in_ptr);
-                    switch(op)
+                    const auto vec_elements = wrapper::vloadq(input_ptr + x);
+                    switch (op)
                     {
                         case ReductionOperation::SUM:
                         case ReductionOperation::MEAN_SUM:
-                            vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+                        {
+                            const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+                            const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+                            const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+                            const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+                            const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+                            const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+                            vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+                            vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+                            vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+                            vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
                             break;
-                        case ReductionOperation::SUM_SQUARE:
-                            vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
-                            break;
+                        }
                         case ReductionOperation::PROD:
-                            vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+                        {
+                            const auto offset32x4f_4 = vdupq_n_f32(iq_info.offset);
+                            const auto scale32x4f_4  = vdupq_n_f32(iq_info.scale);
+
+                            const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+                            const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+                            const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+                            const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+                            const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+                            const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+                            auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
+                            auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
+                            auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
+                            auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+
+                            //de-quantize vec_elements
+                            temp32x4f_1 = vmulq_f32(vsubq_f32(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+                            temp32x4f_2 = vmulq_f32(vsubq_f32(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+                            temp32x4f_3 = vmulq_f32(vsubq_f32(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+                            temp32x4f_4 = vmulq_f32(vsubq_f32(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+                            vec_res_value1_f = vmulq_f32(temp32x4f_1, vec_res_value1_f);
+                            vec_res_value2_f = vmulq_f32(temp32x4f_2, vec_res_value2_f);
+                            vec_res_value3_f = vmulq_f32(temp32x4f_3, vec_res_value3_f);
+                            vec_res_value4_f = vmulq_f32(temp32x4f_4, vec_res_value4_f);
                             break;
+                        }
                         case ReductionOperation::ARG_IDX_MIN:
                         {
                             auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
-                            vec_res_idx             = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
-                            vec_res_value           = temp_vec_res_value;
+                            vec_res_idx             = calculate_index_quantized<decltype(vec_res_value)>(
+                                x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+                            vec_res_value = temp_vec_res_value;
                             break;
                         }
                         case ReductionOperation::ARG_IDX_MAX:
                         {
                             auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
-                            vec_res_idx             = calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
-                            vec_res_value           = temp_vec_res_value;
+                            vec_res_idx             = calculate_index_quantized<decltype(vec_res_value)>(
+                                x, temp_vec_res_value, vec_res_value, vec_res_idx, op, 0);
+                            vec_res_value = temp_vec_res_value;
                             break;
                         }
                         case ReductionOperation::MIN:
@@ -983,120 +762,376 @@
                     }
                 }
 
-                if(op == ReductionOperation::MEAN_SUM)
+                switch (op)
                 {
-                    auto vec_width_inv = wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
-                    vec_res_value      = wrapper::vmul(vec_res_value, vec_width_inv);
-                }
-
-                if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
-                {
-                    wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]);
-#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                    if(std::is_same<T, float16_t>::value)
-                    {
-                        wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]);
-                    }
-#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-                }
-                else
-                {
-                    wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value);
-                }
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                auto res_value = 0.f;
-                switch(op)
-                {
-                    case ReductionOperation::ARG_IDX_MAX:
                     case ReductionOperation::ARG_IDX_MIN:
+                    {
+                        auto idx =
+                            calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+                        auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+                        // Compute left-over elements
+                        for (; x < window_end_x; ++x)
+                        {
+                            if (*(input_ptr + x) < res)
+                            {
+                                idx = x;
+                                res = *(input_ptr + x);
+                            }
+                        }
+                        *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+                        break;
+                    }
+                    case ReductionOperation::ARG_IDX_MAX:
+                    {
+                        auto idx =
+                            calculate_vector_index_quantized<decltype(vec_res_value)>(vec_res_idx, vec_res_value, op);
+                        auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+                        // Compute left-over elements
+                        for (; x < window_end_x; ++x)
+                        {
+                            if (*(input_ptr + x) > res)
+                            {
+                                idx = x;
+                                res = *(input_ptr + x);
+                            }
+                        }
+                        *(reinterpret_cast<uint32_t *>(output.ptr())) = idx;
+                        break;
+                    }
                     case ReductionOperation::MIN:
+                    {
+                        auto res = static_cast<T>(wrapper::vgetlane(calculate_min(vec_res_value), 0));
+
+                        // Compute left-over elements
+                        for (; x < window_end_x; ++x)
+                        {
+                            res = *(input_ptr + x) < res ? *(input_ptr + x) : res;
+                        }
+                        *(reinterpret_cast<T *>(output.ptr())) = res;
+                        break;
+                    }
                     case ReductionOperation::MAX:
                     {
-                        res_value = *(input_ptr + x);
+                        auto res = static_cast<T>(wrapper::vgetlane(calculate_max(vec_res_value), 0));
+
+                        // Compute left-over elements
+                        for (; x < window_end_x; ++x)
+                        {
+                            res = *(input_ptr + x) > res ? *(input_ptr + x) : res;
+                        }
+                        *(reinterpret_cast<T *>(output.ptr())) = res;
                         break;
                     }
                     case ReductionOperation::PROD:
                     {
-                        res_value = static_cast<T>(1.f);
+                        auto carry_res = wrapper::vmul(vec_res_value1_f, vec_res_value2_f);
+                        carry_res      = wrapper::vmul(carry_res, vec_res_value3_f);
+                        carry_res      = wrapper::vmul(carry_res, vec_res_value4_f);
+
+                        float res = wrapper::vgetlane(carry_res, 0);
+                        res *= wrapper::vgetlane(carry_res, 1);
+                        res *= wrapper::vgetlane(carry_res, 2);
+                        res *= wrapper::vgetlane(carry_res, 3);
+
+                        // Compute left-over elements
+                        for (; x < window_end_x; ++x)
+                        {
+                            //de-quantize input
+                            if (std::is_same<T, uint8_t>::value)
+                            {
+                                res *= dequantize_qasymm8(*(input_ptr + x), iq_info);
+                            }
+                            else
+                            {
+                                res *= dequantize_qasymm8_signed(*(input_ptr + x), iq_info);
+                            }
+                        }
+
+                        //re-quantize result
+                        if (std::is_same<T, uint8_t>::value)
+                        {
+                            res = quantize_qasymm8(res, iq_info);
+                        }
+                        else
+                        {
+                            res = quantize_qasymm8_signed(res, iq_info);
+                        }
+
+                        *reinterpret_cast<T *>(output.ptr()) = static_cast<T>(res);
+                        break;
+                    }
+                    case ReductionOperation::SUM:
+                    case ReductionOperation::MEAN_SUM:
+                    {
+                        auto carry_res = wrapper::vadd(vec_res_value1, vec_res_value2);
+                        carry_res      = wrapper::vadd(carry_res, vec_res_value3);
+                        carry_res      = wrapper::vadd(carry_res, vec_res_value4);
+
+                        auto carry_paddition =
+                            wrapper::vpadd(wrapper::vgethigh(carry_res), wrapper::vgetlow(carry_res));
+                        carry_paddition = wrapper::vpadd(carry_paddition, carry_paddition);
+                        auto res        = static_cast<int32_t>(wrapper::vgetlane(carry_paddition, 0));
+
+                        // Compute left-over elements
+                        for (; x < window_end_x; ++x)
+                        {
+                            res += *(input_ptr + x);
+                        }
+
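+                        // A and B are presumably the mean/requantization coefficients computed
+                        // earlier in this function; the mean is then applied as res * A + B.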
+                        if (op == ReductionOperation::MEAN_SUM)
+                        {
+                            const int32_t resFinal = A * (static_cast<float>(res)) + B;
+
+                            *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(resFinal);
+                        }
+                        else
+                        {
+                            // Subtract accumulated offsets
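+                            // Summing N quantized inputs accumulates the zero-point N times;
+                            // dropping (N - 1) of them leaves a single, correctly placed offset.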
+                            res -= (in_info.dimension(0) - 1) * iq_info.offset;
+                            *reinterpret_cast<T *>(output.ptr()) = utils::cast::saturate_cast<T>(res);
+                        }
+
                         break;
                     }
                     default:
-                    {
-                        res_value = static_cast<T>(0.f);
-                        break;
-                    }
+                        ARM_COMPUTE_ERROR("Not supported");
                 }
+            },
+            input, output);
+    }
+};
 
-                uint32_t res_idx = 0;
-                for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+template <typename T, int S>
+struct RedOpYZW
+{
+    /** SIMD vector tag type. */
+    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
+    using neon_vector  = typename wrapper::traits::neon_vector<T, S>::type;
+
+    inline void operator()(const Window            &in_window,
+                           Window                  &out_window,
+                           const ITensor           *in,
+                           ITensor                 *out,
+                           int                      axis,
+                           const ReductionOperation op)
+    {
+        const TensorInfo in_info            = *(in->info());
+        const int        window_step_x      = 16 / sizeof(T);
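+        // One 128-bit NEON register holds 16 / sizeof(T) elements per step.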
+        const auto       window_start_x_tmp = static_cast<int>(in_window.x().start());
+        const auto       window_end_x_tmp   = static_cast<int>(in_window.x().end());
+        // As it is split over the x-axis, we need to set the correct split window start and end.
+        const auto window_start_x = static_cast<int>(0);
+        const auto window_end_x   = static_cast<int>(in_window.shape().x());
+
+        Window in_win_no_pad = in_window;
+        in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
+        Window out_win_no_pad = out_window;
+        out_win_no_pad.set(Window::DimX,
+                           Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+
+        Iterator input(in, in_win_no_pad);
+        Iterator output(out, out_win_no_pad);
+
+        execute_window_loop(
+            in_win_no_pad,
+            [&](const Coordinates &)
+            {
+                const auto input_ptr = reinterpret_cast<T *>(input.ptr());
+
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
-
-                    switch(op)
+                    neon_vector vec_res_value = {0};
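+                    // Seed the accumulator with the operation's identity: the first slice of the
+                    // reduced axis for MIN/MAX/ARG_IDX_*, 1 for PROD and 0 for the sums.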
+                    switch (op)
                     {
-                        case ReductionOperation::SUM:
-                        case ReductionOperation::MEAN_SUM:
-                            res_value += *in_ptr;
-                            break;
-                        case ReductionOperation::SUM_SQUARE:
-                            res_value += *in_ptr * *in_ptr;
-                            break;
-                        case ReductionOperation::PROD:
-                            res_value *= *in_ptr;
-                            break;
-                        case ReductionOperation::ARG_IDX_MIN:
-                        {
-                            if(*in_ptr < res_value)
-                            {
-                                res_value = *in_ptr;
-                                res_idx   = dim;
-                            }
-                            break;
-                        }
                         case ReductionOperation::ARG_IDX_MAX:
-                        {
-                            if(*in_ptr > res_value)
-                            {
-                                res_value = *in_ptr;
-                                res_idx   = dim;
-                            }
-                            break;
-                        }
+                        case ReductionOperation::ARG_IDX_MIN:
                         case ReductionOperation::MIN:
-                        {
-                            res_value = *in_ptr < res_value ? *in_ptr : res_value;
-                            break;
-                        }
                         case ReductionOperation::MAX:
                         {
-                            res_value = *in_ptr > res_value ? *in_ptr : res_value;
+                            vec_res_value = wrapper::vloadq(input_ptr + x);
+                            break;
+                        }
+                        case ReductionOperation::PROD:
+                        {
+                            vec_res_value = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
                             break;
                         }
                         default:
-                            ARM_COMPUTE_ERROR("Not supported");
+                        {
+                            vec_res_value = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+                            break;
+                        }
+                    }
+                    uint32x4x4_t vec_res_idx{{0}};
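+                    // Candidate indices for the ARG_IDX_* ops; four uint32x4_t cover up to
+                    // 16 value lanes.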
+
+                    for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+                    {
+                        const T *in_ptr =
+                            reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
+                        const auto vec_elements = wrapper::vloadq(in_ptr);
+                        switch (op)
+                        {
+                            case ReductionOperation::SUM:
+                            case ReductionOperation::MEAN_SUM:
+                                vec_res_value = wrapper::vadd(vec_elements, vec_res_value);
+                                break;
+                            case ReductionOperation::SUM_SQUARE:
+                                vec_res_value = wrapper::vadd(wrapper::vmul(vec_elements, vec_elements), vec_res_value);
+                                break;
+                            case ReductionOperation::PROD:
+                                vec_res_value = wrapper::vmul(vec_elements, vec_res_value);
+                                break;
+                            case ReductionOperation::ARG_IDX_MIN:
+                            {
+                                auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                                vec_res_idx =
+                                    calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+                                vec_res_value = temp_vec_res_value;
+                                break;
+                            }
+                            case ReductionOperation::ARG_IDX_MAX:
+                            {
+                                auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                                vec_res_idx =
+                                    calculate_index(dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
+                                vec_res_value = temp_vec_res_value;
+                                break;
+                            }
+                            case ReductionOperation::MIN:
+                            {
+                                vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                                break;
+                            }
+                            case ReductionOperation::MAX:
+                            {
+                                vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                                break;
+                            }
+                            default:
+                                ARM_COMPUTE_ERROR("Not supported");
+                        }
+                    }
+
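+                    // MEAN_SUM divides by the reduced-axis length by multiplying with its
+                    // vector reciprocal instead of a per-lane division.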
+                    if (op == ReductionOperation::MEAN_SUM)
+                    {
+                        auto vec_width_inv =
+                            wrapper::vinv(wrapper::vdup_n(static_cast<T>(in_info.dimension(axis)), ExactTagType{}));
+                        vec_res_value = wrapper::vmul(vec_res_value, vec_width_inv);
+                    }
+
+                    if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+                    {
+                        wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x, vec_res_idx.val[0]);
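+                        // An 8-lane fp16 vector needs eight 32-bit indices, hence the second
+                        // index vector stored below.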
+#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                        if (std::is_same<T, float16_t>::value)
+                        {
+                            wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr()) + x + 4, vec_res_idx.val[1]);
+                        }
+#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
+                    }
+                    else
+                    {
+                        wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x * sizeof(T)), vec_res_value);
                     }
                 }
 
-                if(op == ReductionOperation::MEAN_SUM)
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
                 {
-                    res_value /= in_info.dimension(axis);
-                }
+                    auto res_value = 0.f;
+                    switch (op)
+                    {
+                        case ReductionOperation::ARG_IDX_MAX:
+                        case ReductionOperation::ARG_IDX_MIN:
+                        case ReductionOperation::MIN:
+                        case ReductionOperation::MAX:
+                        {
+                            res_value = *(input_ptr + x);
+                            break;
+                        }
+                        case ReductionOperation::PROD:
+                        {
+                            res_value = static_cast<T>(1.f);
+                            break;
+                        }
+                        default:
+                        {
+                            res_value = static_cast<T>(0.f);
+                            break;
+                        }
+                    }
 
-                if(op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
-                {
-                    *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx;
+                    uint32_t res_idx = 0;
+                    for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+                    {
+                        const T *in_ptr =
+                            reinterpret_cast<T *>(input.ptr() + x * sizeof(T) + in_info.strides_in_bytes()[axis] * dim);
+
+                        switch (op)
+                        {
+                            case ReductionOperation::SUM:
+                            case ReductionOperation::MEAN_SUM:
+                                res_value += *in_ptr;
+                                break;
+                            case ReductionOperation::SUM_SQUARE:
+                                res_value += *in_ptr * *in_ptr;
+                                break;
+                            case ReductionOperation::PROD:
+                                res_value *= *in_ptr;
+                                break;
+                            case ReductionOperation::ARG_IDX_MIN:
+                            {
+                                if (*in_ptr < res_value)
+                                {
+                                    res_value = *in_ptr;
+                                    res_idx   = dim;
+                                }
+                                break;
+                            }
+                            case ReductionOperation::ARG_IDX_MAX:
+                            {
+                                if (*in_ptr > res_value)
+                                {
+                                    res_value = *in_ptr;
+                                    res_idx   = dim;
+                                }
+                                break;
+                            }
+                            case ReductionOperation::MIN:
+                            {
+                                res_value = *in_ptr < res_value ? *in_ptr : res_value;
+                                break;
+                            }
+                            case ReductionOperation::MAX:
+                            {
+                                res_value = *in_ptr > res_value ? *in_ptr : res_value;
+                                break;
+                            }
+                            default:
+                                ARM_COMPUTE_ERROR("Not supported");
+                        }
+                    }
+
+                    if (op == ReductionOperation::MEAN_SUM)
+                    {
+                        res_value /= in_info.dimension(axis);
+                    }
+
+                    if (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX)
+                    {
+                        *(reinterpret_cast<uint32_t *>(output.ptr()) + x) = res_idx;
+                    }
+                    else
+                    {
+                        *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value;
+                    }
                 }
-                else
-                {
-                    *(reinterpret_cast<T *>(output.ptr() + x * sizeof(T))) = res_value;
-                }
-            }
-        },
-        input, output);
+            },
+            input, output);
     }
 };
 
@@ -1107,7 +1142,8 @@
     using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
     using neon_vector  = typename wrapper::traits::neon_vector<T, S>::type;
 
-    inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation)
+    inline void operator()(
+        const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int, const ReductionOperation)
     {
         ARM_COMPUTE_ERROR_ON(axis != 2);
         ARM_COMPUTE_ERROR_ON(op != ReductionOperation::SUM);
@@ -1124,70 +1160,77 @@
         Window in_win_no_pad = in_window;
         in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
         Window out_win_no_pad = out_window;
-        out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+        out_win_no_pad.set(Window::DimX,
+                           Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
 
         Iterator input(in, in_win_no_pad);
         Iterator output(out, out_win_no_pad);
 
         execute_window_loop(
-            in_win_no_pad, [&](const Coordinates &)
-        {
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            in_win_no_pad,
+            [&](const Coordinates &)
             {
-                neon_vector vec_res_value_0 = { 0 };
-                neon_vector vec_res_value_1 = { 0 };
-
-                vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-                vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
-
-                T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
-                for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
-                    T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim);
+                    neon_vector vec_res_value_0 = {0};
+                    neon_vector vec_res_value_1 = {0};
 
-                    const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
-                    const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);
+                    vec_res_value_0 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
+                    vec_res_value_1 = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
 
-                    vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
-                    vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
+                    T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
+                    for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+                    {
+                        T *in_ptr_0 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
+                        T *in_ptr_1 = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + 16 + stride_z * dim);
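+                        // Complex values are stored interleaved (real, imag), hence the factor 2
+                        // in the offsets; in_ptr_1 starts one full vector (16 bytes) after in_ptr_0.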
+
+                        const auto vec_elements_0 = wrapper::vloadq(in_ptr_0);
+                        const auto vec_elements_1 = wrapper::vloadq(in_ptr_1);
+
+                        vec_res_value_0 = wrapper::vadd(vec_elements_0, vec_res_value_0);
+                        vec_res_value_1 = wrapper::vadd(vec_elements_1, vec_res_value_1);
+                    }
+
+                    wrapper::vstore(out_ptr, vec_res_value_0);
+                    wrapper::vstore(out_ptr + 4, vec_res_value_1);
                 }
 
-                wrapper::vstore(out_ptr, vec_res_value_0);
-                wrapper::vstore(out_ptr + 4, vec_res_value_1);
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                auto res_value_0 = 0.f;
-                auto res_value_1 = 0.f;
-
-                T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
-                for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
                 {
-                    T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
-                    res_value_0 += *in_ptr;
-                    res_value_1 += *(in_ptr + 1);
+                    auto res_value_0 = 0.f;
+                    auto res_value_1 = 0.f;
+
+                    T *out_ptr = reinterpret_cast<T *>(output.ptr() + 2 * x * sizeof(T));
+                    for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+                    {
+                        T *in_ptr = reinterpret_cast<T *>(input.ptr() + 2 * x * sizeof(T) + stride_z * dim);
+                        res_value_0 += *in_ptr;
+                        res_value_1 += *(in_ptr + 1);
+                    }
+                    *out_ptr       = res_value_0;
+                    *(out_ptr + 1) = res_value_1;
                 }
-                *out_ptr       = res_value_0;
-                *(out_ptr + 1) = res_value_1;
-            }
-        },
-        input, output);
+            },
+            input, output);
     }
 };
 
 template <typename T>
 struct RedOpYZW_quantized
 {
-    inline void operator()(const Window &in_window, Window &out_window, const ITensor *in, ITensor *out, int axis, const ReductionOperation op)
+    inline void operator()(const Window            &in_window,
+                           Window                  &out_window,
+                           const ITensor           *in,
+                           ITensor                 *out,
+                           int                      axis,
+                           const ReductionOperation op)
     {
         const TensorInfo              in_info = *(in->info());
         const UniformQuantizationInfo iq_info = in_info.quantization_info().uniform();
-        using PromotedType                    = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
+        using PromotedType = typename wrapper::traits::promote<typename wrapper::traits::promote<T>::type>::type;
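+        // Promote twice: 8-bit -> 16-bit -> 32-bit, giving a 32-bit accumulator type.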
 
         const auto oq_info = out->info()->quantization_info().uniform();
 
@@ -1201,12 +1244,14 @@
         Window in_win_no_pad = in_window;
         in_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, in_window.shape().x()));
         Window out_win_no_pad = out_window;
-        out_win_no_pad.set(Window::DimX, Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
+        out_win_no_pad.set(Window::DimX,
+                           Window::Dimension(window_start_x_tmp, window_end_x_tmp, out_window.shape().x()));
 
         Iterator input(in, in_win_no_pad);
         Iterator output(out, out_win_no_pad);
 
-        using vector_type   = typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type;
+        using vector_type =
+            typename wrapper::traits::neon_bitvector<PromotedType, wrapper::traits::BitWidth::W128>::type;
         using vector_type_f = typename wrapper::traits::neon_vector<float, 4>::type;
 
         vector_type vec_res_value1{};
@@ -1234,102 +1279,211 @@
         const auto vec_B = wrapper::vdup_n(static_cast<float>(B), wrapper::traits::vector_128_tag{});
 
         execute_window_loop(
-            in_win_no_pad, [&](const Coordinates &)
-        {
-            const auto input_ptr = reinterpret_cast<T *>(input.ptr());
-
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            in_win_no_pad,
+            [&](const Coordinates &)
             {
-                uint32x4x4_t vec_res_idx{ { 0 } };
-                vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
-                vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
-                vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
-                vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+                const auto input_ptr = reinterpret_cast<T *>(input.ptr());
 
-                vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
-                vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
-                vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
-                vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
-
-                auto vec_res_value = wrapper::vloadq(input_ptr + x);
-
-                for(unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    const T   *in_ptr       = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
-                    const auto vec_elements = wrapper::vloadq(in_ptr);
-                    switch(op)
+                    uint32x4x4_t vec_res_idx{{0}};
+                    vec_res_value1 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+                    vec_res_value2 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+                    vec_res_value3 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+                    vec_res_value4 = wrapper::vdup_n(static_cast<PromotedType>(0), wrapper::traits::vector_128_tag{});
+
+                    vec_res_value1_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+                    vec_res_value2_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+                    vec_res_value3_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+                    vec_res_value4_f = wrapper::vdup_n(static_cast<float>(1), wrapper::traits::vector_128_tag{});
+
+                    auto vec_res_value = wrapper::vloadq(input_ptr + x);
+
+                    for (unsigned int index_dim = 0; index_dim < in_info.dimension(axis); ++index_dim)
                     {
+                        const T   *in_ptr       = input_ptr + x + in_info.strides_in_bytes()[axis] * index_dim;
+                        const auto vec_elements = wrapper::vloadq(in_ptr);
+                        switch (op)
+                        {
+                            case ReductionOperation::SUM:
+                            case ReductionOperation::MEAN_SUM:
+                            {
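+                                // Widen the quantized 8-bit lanes stepwise (8 -> 16 -> 32 bit)
+                                // into four 32-bit accumulators so the running sum cannot overflow.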
+                                const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+                                const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+                                const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+                                const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+                                const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+                                const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+                                vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
+                                vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
+                                vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
+                                vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+                                break;
+                            }
+                            case ReductionOperation::PROD:
+                            {
+                                const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset),
+                                                                           wrapper::traits::vector_128_tag{});
+                                const auto scale32x4f_4 =
+                                    wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});
+
+                                const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
+                                const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+
+                                const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
+                                const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
+                                const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
+                                const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+
+                                auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
+                                auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
+                                auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
+                                auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+
+                                //de-quantize vec_elements
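+                                // value = (q - offset) * scale, evaluated per lane in float.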
+                                temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4);
+                                temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4);
+                                temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4);
+                                temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4);
+
+                                vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f);
+                                vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f);
+                                vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f);
+                                vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f);
+                                break;
+                            }
+                            case ReductionOperation::ARG_IDX_MIN:
+                            {
+                                auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                                vec_res_idx   = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value,
+                                                                          vec_res_idx, op, axis);
+                                vec_res_value = temp_vec_res_value;
+                                break;
+                            }
+                            case ReductionOperation::ARG_IDX_MAX:
+                            {
+                                auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                                vec_res_idx   = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value,
+                                                                          vec_res_idx, op, axis);
+                                vec_res_value = temp_vec_res_value;
+                                break;
+                            }
+                            case ReductionOperation::MIN:
+                            {
+                                vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
+                                break;
+                            }
+                            case ReductionOperation::MAX:
+                            {
+                                vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                                break;
+                            }
+                            default:
+                                ARM_COMPUTE_ERROR("Not supported");
+                        }
+                    }
+
+                    switch (op)
+                    {
+                        case ReductionOperation::ARG_IDX_MIN:
+                        case ReductionOperation::ARG_IDX_MAX:
+                        {
+                            wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]);
+                            wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
+                            wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
+                            wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12,
+                                            vec_res_idx.val[3]);
+                            break;
+                        }
+                        case ReductionOperation::MIN:
+                        case ReductionOperation::MAX:
+                        {
+                            wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value);
+                            break;
+                        }
                         case ReductionOperation::SUM:
+                        {
+                            // Subtract offsets
+                            auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
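+                            // As in RedOpX_quantized above: the vector sum accumulated the
+                            // zero-point once per input, so (N - 1) surplus offsets are removed.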
+
+                            auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1);
+                            auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2);
+                            auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3);
+                            auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4);
+
+                            vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
+                            vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
+                            vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
+                            vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);
+
+                            const auto temp16x8t_1 =
+                                wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
+                            const auto temp16x8t_2 =
+                                wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));
+
+                            combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
+                            break;
+                        }
                         case ReductionOperation::MEAN_SUM:
                         {
-                            const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
-                            const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+                            vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
+                            vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
+                            vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
+                            vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
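+                            // vmla(vec_B, v, vec_A) computes vec_B + v * vec_A, so the mean and
+                            // requantization are fused into the precomputed vec_A/vec_B.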
 
-                            const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
-                            const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
-                            const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
-                            const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+#ifdef __aarch64__
+                            vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
+                            vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
+                            vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
+                            vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
+#else  // defined(__aarch64__)
+                            vec_res_value1    = wrapper::vcvt<PromotedType>(vec_res_value1_f);
+                            vec_res_value2    = wrapper::vcvt<PromotedType>(vec_res_value2_f);
+                            vec_res_value3    = wrapper::vcvt<PromotedType>(vec_res_value3_f);
+                            vec_res_value4    = wrapper::vcvt<PromotedType>(vec_res_value4_f);
+#endif // __aarch64__
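+                            // vcvta rounds to nearest; the vcvt fallback truncates toward zero,
+                            // so non-aarch64 builds may differ by one quantization step.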
 
-                            vec_res_value1 = wrapper::vadd(temp32x4t_1, vec_res_value1);
-                            vec_res_value2 = wrapper::vadd(temp32x4t_2, vec_res_value2);
-                            vec_res_value3 = wrapper::vadd(temp32x4t_3, vec_res_value3);
-                            vec_res_value4 = wrapper::vadd(temp32x4t_4, vec_res_value4);
+                            const auto temp16x8t_1 =
+                                wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+                            const auto temp16x8t_2 =
+                                wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+                            auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
+
+                            wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
                             break;
                         }
                         case ReductionOperation::PROD:
                         {
-                            const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
-                            const auto scale32x4f_4  = wrapper::vdup_n(iq_info.scale, wrapper::traits::vector_128_tag{});
+                            const auto offset32x4f_4 =
+                                wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
+                            const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));
 
-                            const auto temp16x8t_1 = wrapper::vmovl(wrapper::vgetlow(vec_elements));
-                            const auto temp16x8t_2 = wrapper::vmovl(wrapper::vgethigh(vec_elements));
+                            //re-quantize
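+                            // q = value / scale + offset, with the division done as a multiply
+                            // by the reciprocal scale (iscale32x4f_4) computed above.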
+                            vec_res_value1_f =
+                                wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
+                            vec_res_value2_f =
+                                wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
+                            vec_res_value3_f =
+                                wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
+                            vec_res_value4_f =
+                                wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
 
-                            const auto temp32x4t_1 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_1));
-                            const auto temp32x4t_2 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_1));
-                            const auto temp32x4t_3 = wrapper::vmovl(wrapper::vgetlow(temp16x8t_2));
-                            const auto temp32x4t_4 = wrapper::vmovl(wrapper::vgethigh(temp16x8t_2));
+                            vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
+                            vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
+                            vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
+                            vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
 
-                            auto temp32x4f_1 = wrapper::vcvt<float>(temp32x4t_1);
-                            auto temp32x4f_2 = wrapper::vcvt<float>(temp32x4t_2);
-                            auto temp32x4f_3 = wrapper::vcvt<float>(temp32x4t_3);
-                            auto temp32x4f_4 = wrapper::vcvt<float>(temp32x4t_4);
+                            const auto temp16x8t_1 =
+                                wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
+                            const auto temp16x8t_2 =
+                                wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
+                            auto res = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
 
-                            //de-quantize vec_elements
-                            temp32x4f_1 = wrapper::vmul(wrapper::vsub(temp32x4f_1, offset32x4f_4), scale32x4f_4);
-                            temp32x4f_2 = wrapper::vmul(wrapper::vsub(temp32x4f_2, offset32x4f_4), scale32x4f_4);
-                            temp32x4f_3 = wrapper::vmul(wrapper::vsub(temp32x4f_3, offset32x4f_4), scale32x4f_4);
-                            temp32x4f_4 = wrapper::vmul(wrapper::vsub(temp32x4f_4, offset32x4f_4), scale32x4f_4);
-
-                            vec_res_value1_f = wrapper::vmul(temp32x4f_1, vec_res_value1_f);
-                            vec_res_value2_f = wrapper::vmul(temp32x4f_2, vec_res_value2_f);
-                            vec_res_value3_f = wrapper::vmul(temp32x4f_3, vec_res_value3_f);
-                            vec_res_value4_f = wrapper::vmul(temp32x4f_4, vec_res_value4_f);
-                            break;
-                        }
-                        case ReductionOperation::ARG_IDX_MIN:
-                        {
-                            auto temp_vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
-                            vec_res_idx             = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
-                            vec_res_value           = temp_vec_res_value;
-                            break;
-                        }
-                        case ReductionOperation::ARG_IDX_MAX:
-                        {
-                            auto temp_vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
-                            vec_res_idx             = calculate_index_quantized(index_dim, temp_vec_res_value, vec_res_value, vec_res_idx, op, axis);
-                            vec_res_value           = temp_vec_res_value;
-                            break;
-                        }
-                        case ReductionOperation::MIN:
-                        {
-                            vec_res_value = wrapper::vmin(vec_elements, vec_res_value);
-                            break;
-                        }
-                        case ReductionOperation::MAX:
-                        {
-                            vec_res_value = wrapper::vmax(vec_elements, vec_res_value);
+                            wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
                             break;
                         }
                         default:
@@ -1337,259 +1491,172 @@
                     }
                 }
 
-                switch(op)
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
                 {
-                    case ReductionOperation::ARG_IDX_MIN:
-                    case ReductionOperation::ARG_IDX_MAX:
+                    float   res_value   = 0.f;
+                    int32_t res_value_q = 0;
+
+                    switch (op)
                     {
-                        wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x), vec_res_idx.val[0]);
-                        wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 4, vec_res_idx.val[1]);
-                        wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 8, vec_res_idx.val[2]);
-                        wrapper::vstore(reinterpret_cast<uint32_t *>(output.ptr() + 4 * x) + 12, vec_res_idx.val[3]);
-                        break;
-                    }
-                    case ReductionOperation::MIN:
-                    case ReductionOperation::MAX:
-                    {
-                        wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), vec_res_value);
-                        break;
-                    }
-                    case ReductionOperation::SUM:
-                    {
-                        // Subtract offsets
-                        auto offsets = vdupq_n_s32((in_info.dimension(axis) - 1) * iq_info.offset);
-
-                        auto vec_res_s_value1 = wrapper::vreinterpret(vec_res_value1);
-                        auto vec_res_s_value2 = wrapper::vreinterpret(vec_res_value2);
-                        auto vec_res_s_value3 = wrapper::vreinterpret(vec_res_value3);
-                        auto vec_res_s_value4 = wrapper::vreinterpret(vec_res_value4);
-
-                        vec_res_s_value1 = wrapper::vsub(vec_res_s_value1, offsets);
-                        vec_res_s_value2 = wrapper::vsub(vec_res_s_value2, offsets);
-                        vec_res_s_value3 = wrapper::vsub(vec_res_s_value3, offsets);
-                        vec_res_s_value4 = wrapper::vsub(vec_res_s_value4, offsets);
-
-                        const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value1), wrapper::vqmovn(vec_res_s_value2));
-                        const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_s_value3), wrapper::vqmovn(vec_res_s_value4));
-
-                        combine_and_store<T>(temp16x8t_1, temp16x8t_2, output, x);
-                        break;
-                    }
-                    case ReductionOperation::MEAN_SUM:
-                    {
-                        vec_res_value1_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value1), vec_A);
-                        vec_res_value2_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value2), vec_A);
-                        vec_res_value3_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value3), vec_A);
-                        vec_res_value4_f = wrapper::vmla(vec_B, wrapper::vcvt<float>(vec_res_value4), vec_A);
-
-#ifdef __aarch64__
-                        vec_res_value1 = wrapper::vcvta<PromotedType>(vec_res_value1_f);
-                        vec_res_value2 = wrapper::vcvta<PromotedType>(vec_res_value2_f);
-                        vec_res_value3 = wrapper::vcvta<PromotedType>(vec_res_value3_f);
-                        vec_res_value4 = wrapper::vcvta<PromotedType>(vec_res_value4_f);
-#else  // defined(__aarch64__)
-                        vec_res_value1 = wrapper::vcvt<PromotedType>(vec_res_value1_f);
-                        vec_res_value2 = wrapper::vcvt<PromotedType>(vec_res_value2_f);
-                        vec_res_value3 = wrapper::vcvt<PromotedType>(vec_res_value3_f);
-                        vec_res_value4 = wrapper::vcvt<PromotedType>(vec_res_value4_f);
-#endif // __aarch64__
-
-                        const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
-                        const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
-                        auto       res         = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-
-                        wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
-                        break;
-                    }
-                    case ReductionOperation::PROD:
-                    {
-                        const auto offset32x4f_4 = wrapper::vdup_n(static_cast<float>(iq_info.offset), wrapper::traits::vector_128_tag{});
-                        const auto iscale32x4f_4 = vinvq_f32(vdupq_n_f32(iq_info.scale));
-
-                        //re-quantize
-                        vec_res_value1_f = wrapper::vadd(wrapper::vmul(vec_res_value1_f, iscale32x4f_4), offset32x4f_4);
-                        vec_res_value2_f = wrapper::vadd(wrapper::vmul(vec_res_value2_f, iscale32x4f_4), offset32x4f_4);
-                        vec_res_value3_f = wrapper::vadd(wrapper::vmul(vec_res_value3_f, iscale32x4f_4), offset32x4f_4);
-                        vec_res_value4_f = wrapper::vadd(wrapper::vmul(vec_res_value4_f, iscale32x4f_4), offset32x4f_4);
-
-                        vec_res_value1 = wrapper::vcvt<T>(vec_res_value1_f);
-                        vec_res_value2 = wrapper::vcvt<T>(vec_res_value2_f);
-                        vec_res_value3 = wrapper::vcvt<T>(vec_res_value3_f);
-                        vec_res_value4 = wrapper::vcvt<T>(vec_res_value4_f);
-
-                        const auto temp16x8t_1 = wrapper::vcombine(wrapper::vqmovn(vec_res_value1), wrapper::vqmovn(vec_res_value2));
-                        const auto temp16x8t_2 = wrapper::vcombine(wrapper::vqmovn(vec_res_value3), wrapper::vqmovn(vec_res_value4));
-                        auto       res         = wrapper::vcombine(wrapper::vqmovn(temp16x8t_1), wrapper::vqmovn(temp16x8t_2));
-
-                        wrapper::vstore(reinterpret_cast<T *>(output.ptr() + x), res);
-                        break;
-                    }
-                    default:
-                        ARM_COMPUTE_ERROR("Not supported");
-                }
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                float   res_value   = 0.f;
-                int32_t res_value_q = 0;
-
-                switch(op)
-                {
-                    case ReductionOperation::ARG_IDX_MAX:
-                    case ReductionOperation::ARG_IDX_MIN:
-                    case ReductionOperation::MIN:
-                    case ReductionOperation::MAX:
-                    {
-                        res_value = *(input_ptr + x);
-                        break;
-                    }
-                    case ReductionOperation::PROD:
-                    {
-                        res_value = static_cast<T>(1.0f);
-                        break;
-                    }
-                    default:
-                    {
-                        res_value = static_cast<T>(0.0f);
-                        break;
-                    }
-                }
-                uint32_t res_idx = 0;
-
-                for(unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
-                {
-                    const T *in_ptr = reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
-                    switch(op)
-                    {
-                        case ReductionOperation::SUM:
+                        case ReductionOperation::ARG_IDX_MAX:
+                        case ReductionOperation::ARG_IDX_MIN:
+                        case ReductionOperation::MIN:
+                        case ReductionOperation::MAX:
                         {
-                            res_value += *in_ptr;
-                            break;
-                        }
-                        case ReductionOperation::MEAN_SUM:
-                        {
-                            res_value_q += *in_ptr;
-                            break;
-                        }
-                        case ReductionOperation::SUM_SQUARE:
-                        {
-                            res_value += *in_ptr * *in_ptr;
+                            res_value = *(input_ptr + x);
                             break;
                         }
                         case ReductionOperation::PROD:
                         {
-                            //de-quantize input
-                            if(std::is_same<T, uint8_t>::value)
+                            res_value = static_cast<T>(1.0f);
+                            break;
+                        }
+                        default:
+                        {
+                            res_value = static_cast<T>(0.0f);
+                            break;
+                        }
+                    }
+                    uint32_t res_idx = 0;
+
+                    for (unsigned int dim = 0; dim < in_info.dimension(axis); ++dim)
+                    {
+                        const T *in_ptr =
+                            reinterpret_cast<T *>(input.ptr() + x + in_info.strides_in_bytes()[axis] * dim);
+                        switch (op)
+                        {
+                            case ReductionOperation::SUM:
                             {
-                                res_value *= dequantize_qasymm8(*in_ptr, iq_info);
+                                res_value += *in_ptr;
+                                break;
+                            }
+                            case ReductionOperation::MEAN_SUM:
+                            {
+                                res_value_q += *in_ptr;
+                                break;
+                            }
+                            case ReductionOperation::SUM_SQUARE:
+                            {
+                                res_value += *in_ptr * *in_ptr;
+                                break;
+                            }
+                            case ReductionOperation::PROD:
+                            {
+                                //de-quantize input
+                                if (std::is_same<T, uint8_t>::value)
+                                {
+                                    res_value *= dequantize_qasymm8(*in_ptr, iq_info);
+                                }
+                                else
+                                {
+                                    res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info);
+                                }
+                                break;
+                            }
+                            case ReductionOperation::ARG_IDX_MIN:
+                            {
+                                if (*in_ptr < res_value)
+                                {
+                                    res_value = *in_ptr;
+                                    res_idx   = dim;
+                                }
+                                break;
+                            }
+                            case ReductionOperation::ARG_IDX_MAX:
+                            {
+                                if (*in_ptr > res_value)
+                                {
+                                    res_value = *in_ptr;
+                                    res_idx   = dim;
+                                }
+                                break;
+                            }
+                            case ReductionOperation::MIN:
+                            {
+                                res_value = *in_ptr < res_value ? *in_ptr : res_value;
+                                break;
+                            }
+                            case ReductionOperation::MAX:
+                            {
+                                res_value = *in_ptr > res_value ? *in_ptr : res_value;
+                                break;
+                            }
+                            default:
+                                ARM_COMPUTE_ERROR("Not supported");
+                        }
+                    }
+
+                    switch (op)
+                    {
+                        case ReductionOperation::MEAN_SUM:
+                        {
+                        // Apply previously calculated coefficients (with rounding on aarch64)
+#ifdef __aarch64__
+                            const int32_t res =
+                                arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B);
+#else  // defined(__aarch64__)
+                            const int32_t res = A * (static_cast<float>(res_value_q)) + B;
+#endif // __aarch64__
+                            *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
+                            break;
+                        }
+                        case ReductionOperation::SUM:
+                        {
+                            // Subtract accumulated offsets
+                            res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
+                            *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
+                            break;
+                        }
+                        case ReductionOperation::PROD:
+                        {
+                            //re-quantize result
+                            T res = 0;
+                            if (std::is_same<T, uint8_t>::value)
+                            {
+                                res = quantize_qasymm8(res_value, iq_info);
                             }
                             else
                             {
-                                res_value *= dequantize_qasymm8_signed(*in_ptr, iq_info);
+                                res = quantize_qasymm8_signed(res_value, iq_info);
                             }
+                            *(reinterpret_cast<T *>(output.ptr() + x)) = res;
                             break;
                         }
                         case ReductionOperation::ARG_IDX_MIN:
-                        {
-                            if(*in_ptr < res_value)
-                            {
-                                res_value = *in_ptr;
-                                res_idx   = dim;
-                            }
-                            break;
-                        }
                         case ReductionOperation::ARG_IDX_MAX:
                         {
-                            if(*in_ptr > res_value)
-                            {
-                                res_value = *in_ptr;
-                                res_idx   = dim;
-                            }
-                            break;
-                        }
-                        case ReductionOperation::MIN:
-                        {
-                            res_value = *in_ptr < res_value ? *in_ptr : res_value;
-                            break;
-                        }
-                        case ReductionOperation::MAX:
-                        {
-                            res_value = *in_ptr > res_value ? *in_ptr : res_value;
+                            *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx;
                             break;
                         }
                         default:
-                            ARM_COMPUTE_ERROR("Not supported");
+                            *(reinterpret_cast<T *>(output.ptr() + x)) = res_value;
                     }
                 }
-
-                switch(op)
-                {
-                    case ReductionOperation::MEAN_SUM:
-                    {
-                        // Apply previously calculated coefficients (with rounding on aarch64)
-#ifdef  __aarch64__
-                        const int32_t res                        = arm_compute::support::cpp11::round(A * (static_cast<float>(res_value_q)) + B);
-#else   // defined(__aarch64__)
-                        const int32_t res                        = A * (static_cast<float>(res_value_q)) + B;
-#endif  // __aarch64__
-                        *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res);
-                        break;
-                    }
-                    case ReductionOperation::SUM:
-                    {
-                        // Subtract accumulated offsets
-                        res_value -= (in_info.dimension(axis) - 1) * iq_info.offset;
-                        *reinterpret_cast<T *>(output.ptr() + x) = utils::cast::saturate_cast<T>(res_value);
-                        break;
-                    }
-                    case ReductionOperation::PROD:
-                    {
-                        //re-quantize result
-                        T res = 0;
-                        if(std::is_same<T, uint8_t>::value)
-                        {
-                            res = quantize_qasymm8(res_value, iq_info);
-                        }
-                        else
-                        {
-                            res = quantize_qasymm8_signed(res_value, iq_info);
-                        }
-                        *(reinterpret_cast<T *>(output.ptr() + x)) = res;
-                        break;
-                    }
-                    case ReductionOperation::ARG_IDX_MIN:
-                    case ReductionOperation::ARG_IDX_MAX:
-                    {
-                        *(reinterpret_cast<uint32_t *>(output.ptr() + x * 4)) = res_idx;
-                        break;
-                    }
-                    default:
-                        *(reinterpret_cast<T *>(output.ptr() + x)) = res_value;
-                }
-            }
-        },
-        input, output);
+            },
+            input, output);
     }
 };
 
-void reduce_op(const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
+void reduce_op(
+    const Window &window, const ITensor *input, ITensor *output, unsigned int axis, const ReductionOperation op)
 {
     const bool is_complex = (input->info()->num_channels() == 2);
 
-    if(is_complex)
+    if (is_complex)
     {
-        switch(axis)
+        switch (axis)
         {
             case 2:
-                switch(input->info()->data_type())
+                switch (input->info()->data_type())
                 {
                     case DataType::F32:
-                        switch(op)
+                        switch (op)
                         {
                             case ReductionOperation::SUM:
-                                return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(), op);
+                                return Reducer<RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>>::reduceZ(
+                                    window, input, output, RedOpYZW_complex<float, 4, 2, ReductionOperation::SUM>(),
+                                    op);
                             default:
                                 ARM_COMPUTE_ERROR("Not supported");
                         }
@@ -1602,19 +1669,21 @@
         return;
     }
 
-    switch(axis)
+    switch (axis)
     {
         case 0:
         {
-            switch(input->info()->data_type())
+            switch (input->info()->data_type())
             {
                 case DataType::QASYMM8:
                 {
-                    return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output, RedOpX_quantized<uint8_t>(), op);
+                    return Reducer<RedOpX_quantized<uint8_t>>::reduceX(window, input, output,
+                                                                       RedOpX_quantized<uint8_t>(), op);
                 }
                 case DataType::QASYMM8_SIGNED:
                 {
-                    return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(), op);
+                    return Reducer<RedOpX_quantized<int8_t>>::reduceX(window, input, output, RedOpX_quantized<int8_t>(),
+                                                                      op);
                 }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
@@ -1635,19 +1704,22 @@
             }
         }
         case 1:
-            switch(input->info()->data_type())
+            switch (input->info()->data_type())
             {
                 case DataType::QASYMM8:
                 {
-                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceY(window, input, output,
+                                                                         RedOpYZW_quantized<uint8_t>(), op);
                 }
                 case DataType::QASYMM8_SIGNED:
                 {
-                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceY(window, input, output,
+                                                                        RedOpYZW_quantized<int8_t>(), op);
                 }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(), op);
+                    return Reducer<RedOpYZW<float16_t, 8>>::reduceY(window, input, output, RedOpYZW<float16_t, 8>(),
+                                                                    op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
                     return Reducer<RedOpYZW<float, 4>>::reduceY(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1657,15 +1729,18 @@
                     ARM_COMPUTE_ERROR("Not supported");
             }
         case 2:
-            switch(input->info()->data_type())
+            switch (input->info()->data_type())
             {
                 case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceZ(window, input, output,
+                                                                         RedOpYZW_quantized<uint8_t>(), op);
                 case DataType::QASYMM8_SIGNED:
-                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceZ(window, input, output,
+                                                                        RedOpYZW_quantized<int8_t>(), op);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(), op);
+                    return Reducer<RedOpYZW<float16_t, 8>>::reduceZ(window, input, output, RedOpYZW<float16_t, 8>(),
+                                                                    op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
                     return Reducer<RedOpYZW<float, 4>>::reduceZ(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1675,15 +1750,18 @@
                     ARM_COMPUTE_ERROR("Not supported");
             }
         case 3:
-            switch(input->info()->data_type())
+            switch (input->info()->data_type())
             {
                 case DataType::QASYMM8:
-                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output, RedOpYZW_quantized<uint8_t>(), op);
+                    return Reducer<RedOpYZW_quantized<uint8_t>>::reduceW(window, input, output,
+                                                                         RedOpYZW_quantized<uint8_t>(), op);
                 case DataType::QASYMM8_SIGNED:
-                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output, RedOpYZW_quantized<int8_t>(), op);
+                    return Reducer<RedOpYZW_quantized<int8_t>>::reduceW(window, input, output,
+                                                                        RedOpYZW_quantized<int8_t>(), op);
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F16:
-                    return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(), op);
+                    return Reducer<RedOpYZW<float16_t, 8>>::reduceW(window, input, output, RedOpYZW<float16_t, 8>(),
+                                                                    op);
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                 case DataType::F32:
                     return Reducer<RedOpYZW<float, 4>>::reduceW(window, input, output, RedOpYZW<float, 4>(), op);
@@ -1704,9 +1782,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
 
-    if(input->num_channels() == 1)
+    if (input->num_channels() == 1)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::S32, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                             DataType::S32, DataType::F16, DataType::F32);
     }
     else
     {
@@ -1715,13 +1794,14 @@
         ARM_COMPUTE_RETURN_ERROR_ON(axis != 2);
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+                                    "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         bool is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX || op == ReductionOperation::ARG_IDX_MIN);
-        if(!is_arg_min_max)
+        if (!is_arg_min_max)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
             ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != output->num_channels());
@@ -1731,8 +1811,9 @@
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32, DataType::S32);
         }
 
-        const TensorShape output_shape         = arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
-        const TensorInfo  tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
+        const TensorShape output_shape =
+            arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis);
+        const TensorInfo tensor_info_reshaped = input->clone()->set_tensor_shape(output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_reshaped);
     }
 
@@ -1745,7 +1826,10 @@
 {
 }
 
-void NEReductionOperationKernel::configure(const ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op)
+void NEReductionOperationKernel::configure(const ITensor     *input,
+                                           ITensor           *output,
+                                           unsigned int       axis,
+                                           ReductionOperation op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -1761,14 +1845,23 @@
     INEKernel::configure(win);
 
     // Calculate output shape and set if empty
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
+    const TensorShape output_shape =
+        arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
     // Output auto initialization if not yet initialized
     const bool is_arg_min_max   = (op == ReductionOperation::ARG_IDX_MIN || op == ReductionOperation::ARG_IDX_MAX);
     DataType   output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+    auto_init_if_empty(*output->info(), input->info()
+                                            ->clone()
+                                            ->set_tensor_shape(output_shape)
+                                            .set_data_type(output_data_type)
+                                            .reset_padding()
+                                            .set_is_resizable(true));
 }
 
-Status NEReductionOperationKernel::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op)
+Status NEReductionOperationKernel::validate(const ITensorInfo *input,
+                                            const ITensorInfo *output,
+                                            unsigned int       axis,
+                                            ReductionOperation op)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, op));
 
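
A note on the quantized finalization reformatted above: MEAN_SUM rescales the raw integer accumulator with precomputed float coefficients, rounding to nearest on aarch64 but relying on float-to-int truncation on 32-bit builds, while SUM removes the quantization offsets that were accumulated once per summed element. A minimal sketch, assuming A, B and res_value_q carry the same meaning as in the kernel, with std::lround standing in for support::cpp11::round:

    #include <cmath>
    #include <cstdint>

    int32_t finalize_mean_sum(float A, float B, int32_t res_value_q)
    {
    #ifdef __aarch64__
        // aarch64 builds round to nearest before the saturating cast.
        return static_cast<int32_t>(std::lround(A * static_cast<float>(res_value_q) + B));
    #else
        // 32-bit builds fall back to the truncating float-to-int conversion.
        return static_cast<int32_t>(A * static_cast<float>(res_value_q) + B);
    #endif
    }

    int32_t finalize_sum(int32_t res_value, int axis_len, int32_t offset)
    {
        // Each of the axis_len summed elements carried one quantization offset;
        // all but one are subtracted before the saturating cast back to T.
        return res_value - (axis_len - 1) * offset;
    }
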
diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.h b/src/core/NEON/kernels/NEReductionOperationKernel.h
index 08e654f..78bec62 100644
--- a/src/core/NEON/kernels/NEReductionOperationKernel.h
+++ b/src/core/NEON/kernels/NEReductionOperationKernel.h
@@ -77,7 +77,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
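
The conventions visible in these hunks point at a small set of clang-format options: a space after control-statement keywords, right-aligned pointers, braces on their own line, a 120-column limit, aligned consecutive assignments and declarations, aligned trailing comments, braced lists without inner padding, one parameter per line in wrapped declarations, and regrouped, case-insensitively sorted includes. The following is a hypothesized subset for clang-format 14.0.6, inferred only from the visible changes and not the project's actual configuration:

    # Hypothesized subset only; inferred from the diff, not the shipped file.
    ColumnLimit:                  120
    BreakBeforeBraces:            Allman
    SpaceBeforeParens:            ControlStatements    # if (...), switch (...)
    PointerAlignment:             Right                # const ITensor *input
    AlignConsecutiveAssignments:  Consecutive
    AlignConsecutiveDeclarations: Consecutive
    AlignTrailingComments:        true
    Cpp11BracedListStyle:         true                 # {AclCpu, AclGpuOcl}
    BinPackParameters:            false                # one parameter per line when wrapped
    IncludeBlocks:                Regroup
    SortIncludes:                 CaseInsensitive
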
diff --git a/src/core/NEON/kernels/NEReorderKernel.cpp b/src/core/NEON/kernels/NEReorderKernel.cpp
index 1a7f58b..f92a4c8 100644
--- a/src/core/NEON/kernels/NEReorderKernel.cpp
+++ b/src/core/NEON/kernels/NEReorderKernel.cpp
@@ -24,11 +24,13 @@
 #if defined(__aarch64__)
 
 #include "src/core/NEON/kernels/NEReorderKernel.h"
-#include "src/common/utils/Log.h"
-#include "src/core/NEON/kernels/arm_gemm/transform.hpp"
+
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
 
+#include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/arm_gemm/transform.hpp"
+
 namespace arm_compute
 {
 
@@ -37,29 +39,32 @@
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-    switch(_input->info()->data_type())
+    switch (_input->info()->data_type())
     {
         case DataType::F32:
         {
             const int ksize_rows_elements = _xmax * _ksize;
-            const int jump_rows = ksize_rows_elements * window.x().start();
-            const int k_start = window.x().start() * _ksize;
-            const int k_end = std::min(window.x().end() * _ksize, _kmax);
-            const int stride = _kmax;
-            if(k_start < k_end)
+            const int jump_rows           = ksize_rows_elements * window.x().start();
+            const int k_start             = window.x().start() * _ksize;
+            const int k_end               = std::min(window.x().end() * _ksize, _kmax);
+            const int stride              = _kmax;
+            if (k_start < k_end)
             {
-
-                switch(_output_wf)
+                switch (_output_wf)
                 {
                     case WeightFormat::OHWIo4:
                     {
-                        arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(reinterpret_cast<float *>(_output->buffer()) + jump_rows, reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+                        arm_gemm::Transform<4, 1, true, arm_gemm::VLType::None>(
+                            reinterpret_cast<float *>(_output->buffer()) + jump_rows,
+                            reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
                         break;
                     }
 #if defined(ARM_COMPUTE_ENABLE_SVE)
                     case WeightFormat::OHWIo8:
                     {
-                        arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(reinterpret_cast<float *>(_output->buffer()) + jump_rows, reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
+                        arm_gemm::Transform<1, 1, true, arm_gemm::VLType::SVE>(
+                            reinterpret_cast<float *>(_output->buffer()) + jump_rows,
+                            reinterpret_cast<float *>(_input->buffer()), stride, k_start, k_end, 0, _xmax);
                         break;
                     }
 #endif /* ARM_COMPUTE_ENABLE_SVE */
@@ -78,11 +83,20 @@
 }
 
 NEReorderKernel::NEReorderKernel()
-    : _input(nullptr), _output(nullptr), _ksize(0), _kmax(0), _xmax(0), _input_wf(WeightFormat::ANY), _output_wf(WeightFormat::ANY)
+    : _input(nullptr),
+      _output(nullptr),
+      _ksize(0),
+      _kmax(0),
+      _xmax(0),
+      _input_wf(WeightFormat::ANY),
+      _output_wf(WeightFormat::ANY)
 {
 }
 
-void NEReorderKernel::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+void NEReorderKernel::configure(const ITensor            *input,
+                                ITensor                  *output,
+                                arm_compute::WeightFormat input_wf,
+                                arm_compute::WeightFormat output_wf)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, input_wf, output_wf);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -96,7 +110,7 @@
 
     // Setting parameters for transform
     auto dims = input->info()->num_dimensions();
-    switch(dims)
+    switch (dims)
     {
         case 2:
         {
@@ -120,7 +134,7 @@
     // Window size is set by rows / _ksize
     Window win;
     int    window_size = 0;
-    switch(_output_wf)
+    switch (_output_wf)
     {
 #if defined(ARM_COMPUTE_ENABLE_SVE)
         case WeightFormat::OHWIo8:
@@ -142,7 +156,7 @@
             break;
         }
     }
-    if(_kmax % _ksize != 0)
+    if (_kmax % _ksize != 0)
     {
         window_size += 1;
     }
@@ -152,11 +166,14 @@
     INEKernel::configure(win);
 }
 
-Status NEReorderKernel::validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+Status NEReorderKernel::validate(const ITensorInfo        *input,
+                                 const ITensorInfo        *output,
+                                 arm_compute::WeightFormat input_wf,
+                                 arm_compute::WeightFormat output_wf)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-    if(output->tensor_shape().total_size() != 0)
+    if (output->tensor_shape().total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -167,20 +184,20 @@
         int  output_x_dim;
         int  output_k_dim;
         auto dims = output->num_dimensions();
-        switch(dims)
+        switch (dims)
         {
             case 2:
             {
-                input_x_dim = input->dimension(0); // Number of columns in input matrix
-                input_k_dim = input->dimension(1); // Number of rows in input matrix
+                input_x_dim  = input->dimension(0);  // Number of columns in input matrix
+                input_k_dim  = input->dimension(1);  // Number of rows in input matrix
                 output_x_dim = output->dimension(0); // Number of columns in output matrix
                 output_k_dim = output->dimension(1); // Number of rows in output matrix
                 break;
             }
             case 4:
             {
-                input_x_dim = input->dimension(2); // Number of columns in input matrix
-                input_k_dim = input->dimension(3); // Number of rows in input matrix
+                input_x_dim  = input->dimension(2);  // Number of columns in input matrix
+                input_k_dim  = input->dimension(3);  // Number of rows in input matrix
                 output_x_dim = output->dimension(2); // Number of columns in output matrix
                 output_k_dim = output->dimension(3); // Number of rows in output matrix
                 break;
@@ -192,7 +209,7 @@
         }
 
         int ksize;
-        switch(output_wf)
+        switch (output_wf)
         {
             case WeightFormat::OHWIo8:
             {
@@ -216,11 +233,10 @@
         ARM_COMPUTE_RETURN_ERROR_ON(rnd_up_input_kdim != output_k_dim);
         // output x_dim needs to be same as input
         ARM_COMPUTE_RETURN_ERROR_ON(input_x_dim != output_x_dim);
-
     }
     return Status{};
 }
 
 } // namespace arm_compute
 
-#endif  // defined(__aarch64__)
\ No newline at end of file
+#endif // defined(__aarch64__)
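
The arm_gemm::Transform<4, 1, true, VLType::None> calls wrapped above produce the OHWIo4 weight layout by interleaving rows of the source matrix in blocks of four. A conceptual sketch of that blocking, assuming a row-major kmax x xmax input and zero padding for a final partial block; the real implementation is arm_gemm::Transform, and this helper is only illustrative:

    // Interleave rows of in[kmax][xmax] in groups of 4, column by column.
    void reorder_block4(const float *in, float *out, int kmax, int xmax)
    {
        int idx = 0;
        for (int k0 = 0; k0 < kmax; k0 += 4)      // one row-block of 4 per step
        {
            for (int x = 0; x < xmax; ++x)        // walk the columns
            {
                for (int k = k0; k < k0 + 4; ++k) // emit the 4 rows interleaved
                {
                    out[idx++] = (k < kmax) ? in[k * xmax + x] : 0.0f;
                }
            }
        }
    }

This is also why the kernel's window is sized in blocks of _ksize rows, with one extra step when _kmax is not a multiple of _ksize.
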
diff --git a/src/core/NEON/kernels/NEReorderKernel.h b/src/core/NEON/kernels/NEReorderKernel.h
index 0790889..4528b25 100644
--- a/src/core/NEON/kernels/NEReorderKernel.h
+++ b/src/core/NEON/kernels/NEReorderKernel.h
@@ -26,9 +26,10 @@
 #ifndef ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL
 #define ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL
 
-#include "src/core/NEON/INEKernel.h"
 #include "arm_compute/core/Types.h"
 
+#include "src/core/NEON/INEKernel.h"
+
 namespace arm_compute
 {
 
@@ -36,7 +37,6 @@
 class NEReorderKernel : public INEKernel
 {
 public:
-
     const char *name() const override
     {
         return "NEReorderKernel";
@@ -62,7 +62,10 @@
      * @param[in]  input_wf  WeightFormat of input.
      * @param[in]  output_wf WeightFormat of output.
      */
-    void configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf);
+    void configure(const ITensor            *input,
+                   ITensor                  *output,
+                   arm_compute::WeightFormat input_wf,
+                   arm_compute::WeightFormat output_wf);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEReorderKernel
      *
@@ -73,25 +76,27 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf);
+    static Status validate(const ITensorInfo        *input,
+                           const ITensorInfo        *output,
+                           arm_compute::WeightFormat input_wf,
+                           arm_compute::WeightFormat output_wf);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
 
-
-/*****************************************************************************/
+    /*****************************************************************************/
 
 private:
-    const ITensor *_input{nullptr}; // Input tensor
-    ITensor *_output{nullptr}; // Output tensor
-    int32_t  _ksize{0}; // Blocking parameter, how many rows kernel reorders on each call
-    int32_t  _kmax{0}; // Rows in input tensor
-    int32_t  _xmax{0}; // Columns in input tensor
-    WeightFormat _input_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of input tensor
-    WeightFormat _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor
+    const ITensor *_input{nullptr};  // Input tensor
+    ITensor       *_output{nullptr}; // Output tensor
+    int32_t        _ksize{0};        // Blocking parameter, how many rows kernel reorders on each call
+    int32_t        _kmax{0};         // Rows in input tensor
+    int32_t        _xmax{0};         // Columns in input tensor
+    WeightFormat   _input_wf{WeightFormat::UNSPECIFIED};  // WeightFormat of input tensor
+    WeightFormat   _output_wf{WeightFormat::UNSPECIFIED}; // WeightFormat of output tensor
 };
 
 } // namespace arm_compute
 #endif /* ACL_SRC_CORE_NEON_KERNELS_NEREORDERKERNEL */
 
-#endif  // defined(__aarch64__)
\ No newline at end of file
+#endif // defined(__aarch64__)
diff --git a/src/core/NEON/kernels/NEReorgLayerKernel.cpp b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
index a7b830c..2275704 100644
--- a/src/core/NEON/kernels/NEReorgLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEReorgLayerKernel.cpp
@@ -28,8 +28,9 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -50,13 +51,16 @@
     const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
     ARM_COMPUTE_RETURN_ERROR_ON(stride <= 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0, "The width of the input tensor must be a multiple of stride");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0, "The height of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_width] % stride) != 0,
+                                    "The width of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input->tensor_shape()[idx_height] % stride) != 0,
+                                    "The height of the input tensor must be a multiple of stride");
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
+        const TensorInfo tensor_info_output =
+            output->clone()->set_tensor_shape(misc::shape_calculator::compute_reorg_output_shape(*input, stride));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
@@ -65,8 +69,7 @@
 }
 } // namespace
 
-NEReorgLayerKernel::NEReorgLayerKernel()
-    : _input(nullptr), _output(nullptr), _stride(1)
+NEReorgLayerKernel::NEReorgLayerKernel() : _input(nullptr), _output(nullptr), _stride(1)
 {
 }
 
@@ -121,23 +124,26 @@
     Iterator out(_output, collapsed_window);
 
     // Perform reorg
-    execute_window_loop(collapsed_window, [&](const Coordinates & id)
-    {
-        // Get spatial coords and channels
-        const unsigned int w = id[idx_w];
-        const unsigned int h = id[idx_h];
-        const unsigned int c = id[idx_c];
+    execute_window_loop(
+        collapsed_window,
+        [&](const Coordinates &id)
+        {
+            // Get spatial coords and channels
+            const unsigned int w = id[idx_w];
+            const unsigned int h = id[idx_h];
+            const unsigned int c = id[idx_c];
 
-        // Calculate mapping
-        const unsigned int offset     = c / out_c;
-        Coordinates        map_coords = id;
-        map_coords.set(idx_w, w * stride + offset % stride);
-        map_coords.set(idx_h, h * stride + offset / stride);
-        map_coords.set(idx_c, c % out_c);
+            // Calculate mapping
+            const unsigned int offset     = c / out_c;
+            Coordinates        map_coords = id;
+            map_coords.set(idx_w, w * stride + offset % stride);
+            map_coords.set(idx_h, h * stride + offset / stride);
+            map_coords.set(idx_c, c % out_c);
 
-        // Perform mapping
-        std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords), _input->info()->element_size());
-    },
-    out);
+            // Perform mapping
+            std::memcpy(out.ptr(), in_ptr + _input->info()->offset_element_in_bytes(map_coords),
+                        _input->info()->element_size());
+        },
+        out);
 }
 } // namespace arm_compute
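
The coordinate mapping reformatted above is the core of the reorg: every output element is gathered from a strided input location, with the output channel index selecting one of the stride x stride sub-grids. A small sketch, assuming out_c is the output channel count divided by stride squared (i.e. the input channel count), matching how the kernel uses it:

    struct Coord { unsigned int w, h, c; };

    // Map an output coordinate to the input coordinate it is copied from.
    Coord reorg_source(Coord out, unsigned int stride, unsigned int out_c)
    {
        const unsigned int offset = out.c / out_c;     // which sub-grid this channel reads
        return Coord{out.w * stride + offset % stride, // column inside the sub-grid
                     out.h * stride + offset / stride, // row inside the sub-grid
                     out.c % out_c};                   // original input channel
    }

For example, with stride = 2 and out_c = 4, output (w=1, h=0, c=6) reads input (w=3, h=0, c=2).
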
diff --git a/src/core/NEON/kernels/NEReverseKernel.cpp b/src/core/NEON/kernels/NEReverseKernel.cpp
index ca6c117..d2437ee 100644
--- a/src/core/NEON/kernels/NEReverseKernel.cpp
+++ b/src/core/NEON/kernels/NEReverseKernel.cpp
@@ -26,15 +26,17 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
+Status
+validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
 {
     ARM_COMPUTE_UNUSED(use_inverted_axis);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, axis);
@@ -42,11 +44,12 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(axis, 1, DataType::U32, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->num_dimensions() > 1, "Axis must be a 1D tensor");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4, "Current implementation only supports up to 4 dimensions.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() > 4,
+                                    "Current implementation only supports up to 4 dimensions.");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis->dimension(0) > 4, "Only up to 4 dimensions can be reversed");
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -57,8 +60,7 @@
 }
 } // namespace
 
-NEReverseKernel::NEReverseKernel()
-    : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false)
+NEReverseKernel::NEReverseKernel() : _input(nullptr), _output(nullptr), _axis(nullptr), _use_inverted_axis(false)
 {
 }
 
@@ -80,7 +82,10 @@
     INEKernel::configure(calculate_max_window(*output->info()));
 }
 
-Status NEReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
+Status NEReverseKernel::validate(const ITensorInfo *input,
+                                 const ITensorInfo *output,
+                                 const ITensorInfo *axis,
+                                 bool               use_inverted_axis)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, axis, use_inverted_axis));
 
@@ -88,29 +93,30 @@
 }
 
 template <typename T>
-void run_reverse(const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis)
+void run_reverse(
+    const Window &window, const ITensor *input, const ITensor *axis, ITensor *output, bool use_inverted_axis)
 {
     unsigned int axis_bit = 0;
     const int    rank     = input->info()->num_dimensions();
 
-    for(unsigned int i = 0; i < axis->info()->dimension(0); ++i)
+    for (unsigned int i = 0; i < axis->info()->dimension(0); ++i)
     {
         int axis_i = *(reinterpret_cast<const int *>(axis->buffer()) + i);
 
         // The values of the axis tensor must be between [-rank, rank-1].
-        if((axis_i < -rank) || (axis_i >= rank))
+        if ((axis_i < -rank) || (axis_i >= rank))
         {
             ARM_COMPUTE_ERROR("the valuses of the axis tensor must be within [-rank, rank-1].");
         }
 
         // In case of negative axis value i.e targeted axis(i) = rank + axis(i)
-        if(axis_i < 0)
+        if (axis_i < 0)
         {
             axis_i = rank + axis_i;
         }
 
         // Reverse ACL axis indices convention i.e. (inverted)axis = (tensor_rank - 1) - axis
-        if(use_inverted_axis)
+        if (use_inverted_axis)
         {
             axis_i = (rank - 1) - axis_i;
         }
@@ -127,43 +133,47 @@
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     Iterator input_it(input, win);
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()) + x);
-
-            // Reverse 0 axis
-            if(axis_bit & 0x1)
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                in = wrapper::vrev64(in);
-                in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+                auto in = wrapper::vloadq(reinterpret_cast<T *>(input_it.ptr()) + x);
+
+                // Reverse 0 axis
+                if (axis_bit & 0x1)
+                {
+                    in = wrapper::vrev64(in);
+                    in = wrapper::vcombine(wrapper::vgethigh(in), wrapper::vgetlow(in));
+                }
+
+                const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x;
+                const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+                const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+                const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+
+                auto out_ptr =
+                    reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
+                wrapper::vstore(out_ptr, in);
             }
 
-            const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - window_step_x : x;
-            const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
-            const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
-            const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                const auto in = *(reinterpret_cast<T *>(input_it.ptr()) + x);
 
-            auto out_ptr = reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w)));
-            wrapper::vstore(out_ptr, in);
-        }
+                const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x;
+                const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
+                const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
+                const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            const auto in = *(reinterpret_cast<T *>(input_it.ptr()) + x);
-
-            const int offset_x = (axis_bit & 0x1) ? output->info()->dimension(0) - x - 1 : x;
-            const int offset_y = (axis_bit & 0x2) ? output->info()->dimension(1) - id.y() - 1 : id.y();
-            const int offset_z = (axis_bit & 0x4) ? output->info()->dimension(2) - id.z() - 1 : id.z();
-            const int offset_w = (axis_bit & 0x8) ? output->info()->dimension(3) - id[3] - 1 : id[3];
-
-            *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) = in;
-        }
-    },
-    input_it);
+                *reinterpret_cast<T *>(output->ptr_to_element(Coordinates(offset_x, offset_y, offset_z, offset_w))) =
+                    in;
+            }
+        },
+        input_it);
 }
 
 void NEReverseKernel::run(const Window &window, const ThreadInfo &info)
@@ -172,7 +182,7 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    switch(_input->info()->element_size())
+    switch (_input->info()->element_size())
     {
         case 4:
             run_reverse<uint32_t>(window, _input, _axis, _output, _use_inverted_axis);
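
run_reverse above first normalizes every requested axis into [0, rank), wrapping negative values and optionally flipping to ACL's inverted dimension order, and then folds the result into a bitmask so the inner loops can test each dimension with a single AND. A minimal sketch of that normalization, assuming rank <= 4 as the kernel enforces:

    // Build the reversed-axes bitmask consumed by the vector loop.
    unsigned int make_axis_bitmask(const int *axes, unsigned int num_axes, int rank, bool use_inverted_axis)
    {
        unsigned int axis_bit = 0;
        for (unsigned int i = 0; i < num_axes; ++i)
        {
            int axis = axes[i]; // must lie in [-rank, rank - 1]
            if (axis < 0)
            {
                axis += rank;   // wrap negative indices
            }
            if (use_inverted_axis)
            {
                axis = (rank - 1) - axis; // framework order -> ACL order
            }
            axis_bit |= 1u << axis;       // one bit per reversed dimension
        }
        return axis_bit;                  // e.g. axes {0, 2} -> 0b101
    }
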
diff --git a/src/core/NEON/kernels/NEReverseKernel.h b/src/core/NEON/kernels/NEReverseKernel.h
index 7d9ec46..9226188 100644
--- a/src/core/NEON/kernels/NEReverseKernel.h
+++ b/src/core/NEON/kernels/NEReverseKernel.h
@@ -68,7 +68,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
+    static Status
+    validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NESelectKernel.cpp b/src/core/NEON/kernels/NESelectKernel.cpp
index b8c9b24..7789b82 100644
--- a/src/core/NEON/kernels/NESelectKernel.cpp
+++ b/src/core/NEON/kernels/NESelectKernel.cpp
@@ -29,13 +29,12 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
 
 #include "src/core/common/Registrars.h"
-
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/kernels/select/list.h"
 
 #include <arm_neon.h>
@@ -54,7 +53,8 @@
 };
 
 using SelectorPtr = std::add_pointer<bool(const SelectKernelSelectorData &data)>::type;
-using KernelPtr   = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
+using KernelPtr =
+    std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
 
 struct SelectKernelSelector
 {
@@ -63,95 +63,62 @@
     KernelPtr         ukernel;
 };
 
-static const SelectKernelSelector available_kernels[] =
-{
-    {
-        "neon_s8_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)
-    },
-    {
-        "neon_s16_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)
-    },
-    {
-        "neon_s32_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)
-    },
-    {
-        "neon_u8_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)
-    },
-    {
-        "neon_u16_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)
-    },
-    {
-        "neon_u32_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)
-    },
-    {
-        "neon_s8_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)
-    },
-    {
-        "neon_s16_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)
-    },
-    {
-        "neon_s32_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)
-    },
-    {
-        "neon_u8_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)
-    },
-    {
-        "neon_u16_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)
-    },
-    {
-        "neon_u32_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)
-    },
-    {
-        "neon_f16_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)
-    },
-    {
-        "neon_f16_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)
-    },
-    {
-        "neon_f32_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)
-    },
-    {
-        "neon_f32_not_same_rank",
-        [](const SelectKernelSelectorData & data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)
-    },
+static const SelectKernelSelector available_kernels[] = {
+    {"neon_s8_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == true; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_same_rank)},
+    {"neon_s16_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == true; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_same_rank)},
+    {"neon_s32_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == true; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_same_rank)},
+    {"neon_u8_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == true; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_same_rank)},
+    {"neon_u16_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == true; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_same_rank)},
+    {"neon_u32_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == true; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_same_rank)},
+    {"neon_s8_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::S8 && data.is_same_rank == false; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s8_select_not_same_rank)},
+    {"neon_s16_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::S16 && data.is_same_rank == false; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s16_select_not_same_rank)},
+    {"neon_s32_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::S32 && data.is_same_rank == false; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_s32_select_not_same_rank)},
+    {"neon_u8_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::U8 && data.is_same_rank == false; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u8_select_not_same_rank)},
+    {"neon_u16_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::U16 && data.is_same_rank == false; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u16_select_not_same_rank)},
+    {"neon_u32_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::U32 && data.is_same_rank == false; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::neon_u32_select_not_same_rank)},
+    {"neon_f16_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == true; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_same_rank)},
+    {"neon_f16_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::F16 && data.is_same_rank == false; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_f16_select_not_same_rank)},
+    {"neon_f32_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == true; },
+     REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_same_rank)},
+    {"neon_f32_not_same_rank",
+     [](const SelectKernelSelectorData &data) { return data.dt == DataType::F32 && data.is_same_rank == false; },
+     REGISTER_FP32_NEON(arm_compute::cpu::neon_f32_select_not_same_rank)},
 };
 
 const SelectKernelSelector *get_implementation(const SelectKernelSelectorData &data)
 {
-    for(const auto &uk : available_kernels)
+    for (const auto &uk : available_kernels)
     {
-        if(uk.is_selected(data))
+        if (uk.is_selected(data))
         {
             return &uk;
         }
@@ -184,7 +151,8 @@
     INEKernel::configure(win);
 }
 
-Status NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
+Status
+NESelectKernel::validate(const ITensorInfo *c, const ITensorInfo *x, const ITensorInfo *y, const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(c, x, y);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(x);
@@ -195,9 +163,11 @@
 
     const bool is_same_rank = (c->tensor_shape().num_dimensions() == x->tensor_shape().num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(is_same_rank && (x->tensor_shape() != c->tensor_shape()));
-    ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank && ((c->tensor_shape().num_dimensions() > 1) || (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
+    ARM_COMPUTE_RETURN_ERROR_ON(!is_same_rank &&
+                                ((c->tensor_shape().num_dimensions() > 1) ||
+                                 (c->tensor_shape().x() != x->tensor_shape()[x->tensor_shape().num_dimensions() - 1])));
 
-    if(output != nullptr && output->total_size() != 0)
+    if (output != nullptr && output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(x, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(x, output);
@@ -214,7 +184,7 @@
     ARM_COMPUTE_ERROR_ON(_output == nullptr);
     ARM_COMPUTE_ERROR_ON(_output->info() == nullptr);
 
-    const auto *uk = get_implementation(SelectKernelSelectorData{ _output->info()->data_type(), _has_same_rank });
+    const auto *uk = get_implementation(SelectKernelSelectorData{_output->info()->data_type(), _has_same_rank});
     ARM_COMPUTE_ERROR_ON(uk == nullptr);
     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
     uk->ukernel(_c, _x, _y, _output, window);
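
The table repacked above is a micro-kernel registry: each entry pairs a name, a selection predicate over {data type, rank equality} and a function pointer, and get_implementation walks the array returning the first match. A stripped-down sketch of the same dispatch pattern, using hypothetical stand-ins rather than the ACL types:

    struct SelectorData
    {
        int  dt;           // data type tag
        bool is_same_rank; // condition tensor has the inputs' rank
    };

    struct KernelEntry
    {
        const char *name;
        bool (*is_selected)(const SelectorData &);
        void (*ukernel)(); // real entries take the tensors and a window
    };

    static const KernelEntry kernels[] = {
        {"u8_same_rank", [](const SelectorData &d) { return d.dt == 0 && d.is_same_rank; }, nullptr},
        {"u8_not_same_rank", [](const SelectorData &d) { return d.dt == 0 && !d.is_same_rank; }, nullptr},
    };

    const KernelEntry *get_impl(const SelectorData &data)
    {
        for (const auto &uk : kernels) // first match wins, so order entries
        {                              // from most to least specific
            if (uk.is_selected(data))
            {
                return &uk;
            }
        }
        return nullptr;
    }
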
diff --git a/src/core/NEON/kernels/NESelectKernel.h b/src/core/NEON/kernels/NESelectKernel.h
index e82105a..4fec42b 100644
--- a/src/core/NEON/kernels/NESelectKernel.h
+++ b/src/core/NEON/kernels/NESelectKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NESELECTKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
@@ -82,7 +83,6 @@
     void run(const Window &window, const ThreadInfo &info) override;
 
 private:
-
     const ITensor *_c;             /**< Condition tensor */
     const ITensor *_x;             /**< Source tensor 1 */
     const ITensor *_y;             /**< Source tensor 2 */
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
index 673eace..da023ae 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.cpp
@@ -26,11 +26,12 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 #include <cstdint>
@@ -41,19 +42,22 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_info, const ITensorInfo *paddings, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *block_info,
+                          const ITensorInfo *paddings,
+                          const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, paddings, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
     ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{ 2 });
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(block_info->tensor_shape(), TensorShape{2});
     ARM_COMPUTE_RETURN_ERROR_ON(paddings->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{ 2, 2 });
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(paddings->tensor_shape(), TensorShape{2, 2});
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const DataLayout data_layout = input->data_layout();
         const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
@@ -64,7 +68,11 @@
 
     return Status{};
 }
-Status validate_arguments_static(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status validate_arguments_static(const ITensorInfo *input,
+                                 const int          block_shape_x,
+                                 const int          block_shape_y,
+                                 const Size2D      &padding_left,
+                                 const Size2D      &padding_right,
                                  const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
@@ -73,9 +81,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON(block_shape_x < 1 || block_shape_y < 1);
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(input, block_shape_x, block_shape_y, padding_left, padding_right);
+        TensorShape expected_output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+            input, block_shape_x, block_shape_y, padding_left, padding_right);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), expected_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -86,14 +95,25 @@
 } // namespace
 
 NESpaceToBatchLayerKernel::NESpaceToBatchLayerKernel()
-    : _input(nullptr), _block_shape(nullptr), _paddings(nullptr), _output(nullptr), _data_layout(DataLayout::UNKNOWN), _padding_left(), _block_shape_x(), _block_shape_y()
+    : _input(nullptr),
+      _block_shape(nullptr),
+      _paddings(nullptr),
+      _output(nullptr),
+      _data_layout(DataLayout::UNKNOWN),
+      _padding_left(),
+      _block_shape_x(),
+      _block_shape_y()
 {
 }
 
-void NESpaceToBatchLayerKernel::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+void NESpaceToBatchLayerKernel::configure(const ITensor *input,
+                                          const ITensor *block_shape,
+                                          const ITensor *paddings,
+                                          ITensor       *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(input->info(), block_shape->info(), paddings->info(), output->info()));
 
     _input       = input;
     _block_shape = block_shape;
@@ -106,15 +126,22 @@
     ICPPKernel::configure(win);
 }
 
-void NESpaceToBatchLayerKernel::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
-                                          ITensor *output)
+void NESpaceToBatchLayerKernel::configure(const ITensor *input,
+                                          const int      block_shape_x,
+                                          const int      block_shape_y,
+                                          const Size2D  &padding_left,
+                                          const Size2D  &padding_right,
+                                          ITensor       *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+    TensorShape output_shape = misc::shape_calculator::compute_space_to_batch_shape(
+        input->info(), block_shape_x, block_shape_y, padding_left, padding_right);
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+                       input->info()->quantization_info());
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left, padding_right, output->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_static(input->info(), block_shape_x, block_shape_y, padding_left,
+                                                         padding_right, output->info()));
 
     _input         = input;
     _output        = output;
@@ -128,15 +155,23 @@
     INEKernel::configure(win);
 }
 
-Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+                                           const ITensorInfo *block_shape,
+                                           const ITensorInfo *paddings,
+                                           const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, block_shape, paddings, output));
     return Status{};
 }
-Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status NESpaceToBatchLayerKernel::validate(const ITensorInfo *input,
+                                           const int          block_shape_x,
+                                           const int          block_shape_y,
+                                           const Size2D      &padding_left,
+                                           const Size2D      &padding_right,
                                            const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments_static(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
     return Status{};
 }
 
@@ -146,17 +181,17 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICPPKernel::window(), window);
 
-    if(_block_shape != nullptr)
+    if (_block_shape != nullptr)
     {
         // Retrieve the block shapes dynamically
         _block_shape_x = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(0)));
         _block_shape_y = *(reinterpret_cast<const int *>(_block_shape->ptr_to_element(1)));
     }
 
-    if(_paddings != nullptr)
+    if (_paddings != nullptr)
     {
-        const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 0, 0 }));
-        const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({ 1, 0 }));
+        const size_t pad_left_x = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({0, 0}));
+        const size_t pad_left_y = *reinterpret_cast<const size_t *>(_paddings->ptr_to_element({1, 0}));
         _padding_left           = Size2D(pad_left_x, pad_left_y);
     }
     const int height_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
@@ -173,57 +208,61 @@
     int batch_id = 0;
 
     // Main loop for NCHW and NHWC
-    if(_data_layout == DataLayout::NCHW)
+    if (_data_layout == DataLayout::NCHW)
     {
         do
         {
             Iterator out(_output, slice_out);
-            execute_window_loop(slice_out, [&](const Coordinates & id)
-            {
-                const size_t out_x = id.x();
-                const size_t out_y = id.y();
-                const size_t z     = id.z();
-                const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
-                const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
-                if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+            execute_window_loop(
+                slice_out,
+                [&](const Coordinates &id)
                 {
-                    const int   w    = batch_id % batch_size;
-                    const int   in_x = pos_x - _padding_left.x();
-                    const int   in_y = pos_y - _padding_left.y();
-                    Coordinates input_coords{ in_x, in_y, z, w };
-                    memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
-                }
-            },
-            out);
+                    const size_t out_x = id.x();
+                    const size_t out_y = id.y();
+                    const size_t z     = id.z();
+                    const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+                    const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+                    if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height &&
+                        pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+                    {
+                        const int   w    = batch_id % batch_size;
+                        const int   in_x = pos_x - _padding_left.x();
+                        const int   in_y = pos_y - _padding_left.y();
+                        Coordinates input_coords{in_x, in_y, z, w};
+                        memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+                    }
+                },
+                out);
             ++batch_id;
-        }
-        while(window.slide_window_slice_3D(slice_out));
+        } while (window.slide_window_slice_3D(slice_out));
     }
     else
     {
         do
         {
             Iterator out(_output, slice_out);
-            execute_window_loop(slice_out, [&](const Coordinates & id)
-            {
-                const size_t out_x = id.y();
-                const size_t out_y = id.z();
-                const size_t z     = id.x();
-                const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
-                const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
-                if(pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height && pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+            execute_window_loop(
+                slice_out,
+                [&](const Coordinates &id)
                 {
-                    const int   w    = batch_id % batch_size;
-                    const int   in_x = pos_x - _padding_left.x();
-                    const int   in_y = pos_y - _padding_left.y();
-                    Coordinates input_coords{ z, in_x, in_y, w };
-                    memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
-                }
-            },
-            out);
+                    const size_t out_x = id.y();
+                    const size_t out_y = id.z();
+                    const size_t z     = id.x();
+                    const size_t pos_x = out_x * _block_shape_x + (batch_id / batch_size) % _block_shape_x;
+                    const size_t pos_y = out_y * _block_shape_y + (batch_id / batch_size) / _block_shape_x;
+                    if (pos_y >= _padding_left.y() && pos_y < _padding_left.y() + height &&
+                        pos_x >= _padding_left.x() && pos_x < _padding_left.x() + width)
+                    {
+                        const int   w    = batch_id % batch_size;
+                        const int   in_x = pos_x - _padding_left.x();
+                        const int   in_y = pos_y - _padding_left.y();
+                        Coordinates input_coords{z, in_x, in_y, w};
+                        memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+                    }
+                },
+                out);
             ++batch_id;
-        }
-        while(window.slide_window_slice_3D(slice_out));
+        } while (window.slide_window_slice_3D(slice_out));
     }
 }
 } // namespace arm_compute
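
For reference, a minimal standalone sketch of the coordinate mapping that the reflowed NCHW loop above implements: each output element either falls in the zero-padded border or reads exactly one input element. All names here are illustrative and not part of the library.

    #include <cstddef>
    #include <iostream>

    // Illustrative sketch of the space-to-batch (NCHW) index arithmetic used in
    // the kernel loop above. Returns false when the output element lies in the
    // padded border (the kernel then leaves that output element untouched).
    bool map_space_to_batch(std::size_t out_x, std::size_t out_y, int batch_id, int batch_size,
                            int block_x, int block_y, std::size_t pad_left_x, std::size_t pad_left_y,
                            std::size_t in_width, std::size_t in_height,
                            std::size_t &in_x, std::size_t &in_y, int &in_batch)
    {
        const std::size_t pos_x = out_x * block_x + (batch_id / batch_size) % block_x;
        const std::size_t pos_y = out_y * block_y + (batch_id / batch_size) / block_x;
        if (pos_x < pad_left_x || pos_x >= pad_left_x + in_width ||
            pos_y < pad_left_y || pos_y >= pad_left_y + in_height)
        {
            return false;
        }
        in_x     = pos_x - pad_left_x;
        in_y     = pos_y - pad_left_y;
        in_batch = batch_id % batch_size;
        return true;
    }

    int main()
    {
        std::size_t ix = 0, iy = 0;
        int         ib = 0;
        // 4x4 input, 2x2 blocks, no padding: element (1, 1) of output batch 3
        // reads input element (3, 3) of batch 0.
        if (map_space_to_batch(1, 1, 3, 1, 2, 2, 0, 0, 4, 4, ix, iy, ib))
            std::cout << ix << ',' << iy << ',' << ib << '\n'; // prints 3,3,0
    }
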
diff --git a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
index 44b8cbb..6292c07 100644
--- a/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
+++ b/src/core/NEON/kernels/NESpaceToBatchLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NESPACETOBATCHLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
@@ -69,7 +70,12 @@
      * @param[in]  padding_right The padding at the end of every dimension of the output tensor.
      * @param[out] output        Tensor output. Data types supported: same as @p input
      */
-    void configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output);
+    void configure(const ITensor *input,
+                   const int      block_shape_x,
+                   const int      block_shape_y,
+                   const Size2D  &padding_left,
+                   const Size2D  &padding_right,
+                   ITensor       *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel
      *
      * @param[in] input       Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -79,7 +85,10 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *block_shape,
+                           const ITensorInfo *paddings,
+                           const ITensorInfo *output);
     /** Static function to check if given info will lead to a valid configuration of @ref NESpaceToBatchLayerKernel (Static block shape and paddings)
      *
      * @param[in] input         Tensor input. Supported tensor rank: 4. Data types supported: All.
@@ -91,7 +100,12 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input,
+                           const int          block_shape_x,
+                           const int          block_shape_y,
+                           const Size2D      &padding_left,
+                           const Size2D      &padding_right,
+                           const ITensorInfo *output);
 
     // Inherited methods overridden:
     void run(const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
index 7687c50..b49c5ee 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.cpp
@@ -26,11 +26,12 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 #include <cstdint>
@@ -50,7 +51,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(block_shape < 1);
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const DataLayout data_layout = input->data_layout();
         const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -115,43 +116,45 @@
     int batch_id = 0;
 
     // Main loop for NCHW and NHWC
-    if(_data_layout == DataLayout::NCHW)
+    if (_data_layout == DataLayout::NCHW)
     {
         do
         {
             Iterator out(_output, slice_out);
-            execute_window_loop(slice_out, [&](const Coordinates & id)
-            {
-                const size_t channel_id = id.z();
-                const size_t in_x       = id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
-                const size_t in_y       = id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
-                const int    z          = channel_id % channel_size;
-                Coordinates  input_coords{ in_x, in_y, z, batch_id };
-                memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
-            },
-            out);
+            execute_window_loop(
+                slice_out,
+                [&](const Coordinates &id)
+                {
+                    const size_t channel_id = id.z();
+                    const size_t in_x       = id.x() * _block_shape + (channel_id / channel_size) % _block_shape;
+                    const size_t in_y       = id.y() * _block_shape + (channel_id / channel_size) / _block_shape;
+                    const int    z          = channel_id % channel_size;
+                    Coordinates  input_coords{in_x, in_y, z, batch_id};
+                    memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+                },
+                out);
             ++batch_id;
-        }
-        while(window.slide_window_slice_3D(slice_out));
+        } while (window.slide_window_slice_3D(slice_out));
     }
     else
     {
         do
         {
             Iterator out(_output, slice_out);
-            execute_window_loop(slice_out, [&](const Coordinates & id)
-            {
-                const size_t channel_id = id.x();
-                const size_t in_x       = id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
-                const size_t in_y       = id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
-                const int    z          = channel_id % channel_size;
-                Coordinates  input_coords{ z, in_x, in_y, batch_id };
-                memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
-            },
-            out);
+            execute_window_loop(
+                slice_out,
+                [&](const Coordinates &id)
+                {
+                    const size_t channel_id = id.x();
+                    const size_t in_x       = id.y() * _block_shape + (channel_id / channel_size) % _block_shape;
+                    const size_t in_y       = id.z() * _block_shape + (channel_id / channel_size) / _block_shape;
+                    const int    z          = channel_id % channel_size;
+                    Coordinates  input_coords{z, in_x, in_y, batch_id};
+                    memcpy(out.ptr(), _input->ptr_to_element(input_coords), element_size);
+                },
+                out);
             ++batch_id;
-        }
-        while(window.slide_window_slice_3D(slice_out));
+        } while (window.slide_window_slice_3D(slice_out));
     }
 }
 } // namespace arm_compute
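
The NESpaceToDepthLayerKernel loop above follows the same pattern with a single block size; a minimal sketch of its NCHW source-coordinate computation, with illustrative names only:

    #include <cstddef>
    #include <iostream>

    // Illustrative sketch: source coordinates for one space-to-depth (NCHW)
    // output element, mirroring the loop above. channel_size is the input
    // channel count.
    void map_space_to_depth(std::size_t out_x, std::size_t out_y, std::size_t out_channel,
                            std::size_t channel_size, std::size_t block,
                            std::size_t &in_x, std::size_t &in_y, std::size_t &in_z)
    {
        in_x = out_x * block + (out_channel / channel_size) % block;
        in_y = out_y * block + (out_channel / channel_size) / block;
        in_z = out_channel % channel_size;
    }

    int main()
    {
        std::size_t x = 0, y = 0, z = 0;
        // One input channel, block 2: output channel 3 gathers pixel (1, 1) of
        // every 2x2 block.
        map_space_to_depth(0, 0, 3, 1, 2, x, y, z);
        std::cout << x << ',' << y << ',' << z << '\n'; // prints 1,1,0
    }
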
diff --git a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
index 953b68a..7d147c5b 100644
--- a/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
+++ b/src/core/NEON/kernels/NESpaceToDepthLayerKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NESPACETODEPTHLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.cpp b/src/core/NEON/kernels/NEStackLayerKernel.cpp
index 93080e2..e23b40a 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEStackLayerKernel.cpp
@@ -25,13 +25,13 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -41,7 +41,11 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status validate_arguments(const ITensorInfo *input,
+                          unsigned int       axis,
+                          unsigned int       idx_input,
+                          unsigned int       num_tensors,
+                          const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
@@ -50,9 +54,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON(axis > input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_stack_shape(*input, axis, num_tensors));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
+                                                           compute_stack_shape(*input, axis, num_tensors));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
@@ -60,7 +65,8 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, unsigned int axis, unsigned int num_tensors, ITensorInfo *output)
 {
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_stack_shape(*input, axis, num_tensors)));
@@ -71,11 +77,12 @@
     return std::make_pair(Status{}, win);
 }
 
-inline Coordinates shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input)
+inline Coordinates
+shift_from_axis_and_replace_coordinate(const Coordinates &id, unsigned int axis, unsigned int idx_input)
 {
     constexpr int max_out_coord = 5; // Input shape is at most 4D; output is at most 5D
     Coordinates   id_out        = id;
-    for(unsigned int i = max_out_coord - 1; i > axis; --i)
+    for (unsigned int i = max_out_coord - 1; i > axis; --i)
     {
         id_out.set(i, id[i - 1]);
     }
@@ -84,12 +91,12 @@
 }
 } // namespace
 
-NEStackLayerKernel::NEStackLayerKernel()
-    : _input(nullptr), _output(nullptr), _axis(), _idx_input()
+NEStackLayerKernel::NEStackLayerKernel() : _input(nullptr), _output(nullptr), _axis(), _idx_input()
 {
 }
 
-void NEStackLayerKernel::configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output)
+void NEStackLayerKernel::configure(
+    const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), axis, idx_input, num_tensors, output->info()));
@@ -106,10 +113,15 @@
     INEKernel::configure(win_config.second);
 }
 
-Status NEStackLayerKernel::validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output)
+Status NEStackLayerKernel::validate(const ITensorInfo *input,
+                                    unsigned int       axis,
+                                    unsigned int       idx_input,
+                                    unsigned int       num_tensors,
+                                    const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, axis, idx_input, num_tensors, output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(input->clone().get(), axis, num_tensors, output->clone().get()).first);
     return Status{};
 }
 
@@ -131,12 +143,15 @@
     const int stride_w = _output->info()->num_dimensions() >= 3 ? _output->info()->strides_in_bytes()[3] : 0;
     const int stride_k = _output->info()->num_dimensions() >= 4 ? _output->info()->strides_in_bytes()[4] : 0;
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
-        const int   idx    = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w + id_out[4] * stride_k;
-        std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
-    },
-    input);
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
+            Coordinates id_out = shift_from_axis_and_replace_coordinate(id, _axis, _idx_input);
+            const int idx = id_out[0] * stride_x + id_out[1] * stride_y + id_out[2] * stride_z + id_out[3] * stride_w +
+                            id_out[4] * stride_k;
+            std::memcpy(output.ptr() + idx, input.ptr(), _input->info()->element_size());
+        },
+        input);
 }
 } // namespace arm_compute
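
A small self-contained sketch of the coordinate shuffle that shift_from_axis_and_replace_coordinate performs above, written over a plain array; the final write of idx_input into the freed slot is assumed from the function's name and its callers, since that line falls outside the hunk shown:

    #include <array>
    #include <iostream>

    // Illustrative sketch: dimensions at and above `axis` shift up by one and
    // the freed slot receives the index of the tensor being stacked.
    std::array<int, 5> stack_coords(const std::array<int, 5> &id, unsigned axis, int idx_input)
    {
        std::array<int, 5> out = id;
        for (unsigned i = 4; i > axis; --i)
        {
            out[i] = id[i - 1];
        }
        out[axis] = idx_input;
        return out;
    }

    int main()
    {
        // Stacking along axis 1: input coordinate (2,3,0,0,-) maps to output
        // coordinate (2,idx,3,0,0) for tensor index idx = 7.
        for (int v : stack_coords({2, 3, 0, 0, 0}, 1, 7))
            std::cout << v << ' '; // prints: 2 7 3 0 0
    }
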
diff --git a/src/core/NEON/kernels/NEStackLayerKernel.h b/src/core/NEON/kernels/NEStackLayerKernel.h
index 9b36518..685812b 100644
--- a/src/core/NEON/kernels/NEStackLayerKernel.h
+++ b/src/core/NEON/kernels/NEStackLayerKernel.h
@@ -26,6 +26,7 @@
 #define ARM_COMPUTE_NESTACKLAYERKERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
@@ -64,7 +65,8 @@
      * @param[out] output      Output tensor. Data types supported: Same as @p input.
      *
      */
-    void configure(const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
+    void configure(
+        const ITensor *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, ITensor *output);
     /**  Static function to check if given info will lead to a valid configuration of @ref NEStackLayerKernel
      *
      * @note Supported input tensor rank: up to 4
@@ -78,7 +80,11 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, unsigned int axis, unsigned int idx_input, unsigned int num_tensors, const ITensorInfo *output);
+    static Status validate(const ITensorInfo *input,
+                           unsigned int       axis,
+                           unsigned int       idx_input,
+                           unsigned int       num_tensors,
+                           const ITensorInfo *output);
 
     // Inherited methods overridden
     void run(const Window &window, const ThreadInfo &info) override;
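
The shape relationship that the validate() overload above enforces is simply the input shape with num_tensors inserted at axis; a minimal sketch using a plain vector rather than TensorShape:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    // Illustrative sketch of the stacked output shape: the input shape with
    // `num_tensors` inserted at position `axis`.
    std::vector<std::size_t> stacked_shape(std::vector<std::size_t> shape, unsigned axis, std::size_t num_tensors)
    {
        shape.insert(shape.begin() + axis, num_tensors);
        return shape;
    }

    int main()
    {
        // Stacking five 4x3 tensors along axis 0 yields a 5x4x3 tensor.
        for (std::size_t d : stacked_shape({4, 3}, 0, 5))
            std::cout << d << ' '; // prints: 5 4 3
    }
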
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.cpp b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
index 2b406a8..efff51b 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.cpp
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.cpp
@@ -26,9 +26,10 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Window.h"
+
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -38,9 +39,14 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output,
-                          const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                          int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *output,
+                          const Coordinates &starts,
+                          const Coordinates &ends,
+                          const BiStrides   &strides,
+                          int32_t            begin_mask,
+                          int32_t            end_mask,
+                          int32_t            shrink_axis_mask)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -49,19 +55,16 @@
     ARM_COMPUTE_RETURN_ERROR_ON(starts.num_dimensions() > input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(ends.num_dimensions() > input->num_dimensions());
     ARM_COMPUTE_RETURN_ERROR_ON(strides.num_dimensions() > input->num_dimensions());
-    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i)
-    {
-        return i == 0;
-    }));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        std::any_of(strides.cbegin(), strides.cbegin() + strides.num_dimensions(), [](int i) { return i == 0; }));
 
     // Get expected output shape
-    const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
-                                                                                                          starts, ends, strides,
-                                                                                                          begin_mask, end_mask, shrink_axis_mask);
+    const TensorShape exp_output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+        *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
     ARM_COMPUTE_RETURN_ERROR_ON(exp_output_shape.total_size() == 0);
 
     // Checks output if configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const TensorInfo exp_output_info = output->clone()->set_tensor_shape(exp_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &exp_output_info);
@@ -71,14 +74,18 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input, ITensorInfo *output,
-                                                        const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                                        int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *input,
+                                                        ITensorInfo       *output,
+                                                        const Coordinates &starts,
+                                                        const Coordinates &ends,
+                                                        const BiStrides   &strides,
+                                                        int32_t            begin_mask,
+                                                        int32_t            end_mask,
+                                                        int32_t            shrink_axis_mask)
 {
     // Output tensor auto initialization if not yet initialized
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(*input,
-                                                                                                      starts, ends, strides,
-                                                                                                      begin_mask, end_mask, shrink_axis_mask);
+    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_strided_slice_shape(
+        *input, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
     auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
 
     // Create window
@@ -88,38 +95,49 @@
 }
 } // namespace
 
-NEStridedSliceKernel::NEStridedSliceKernel()
-    : _starts_abs(), _final_strides(), _shrink_mask()
+NEStridedSliceKernel::NEStridedSliceKernel() : _starts_abs(), _final_strides(), _shrink_mask()
 {
 }
 
-void NEStridedSliceKernel::configure(const ITensorInfo *input, ITensorInfo *output,
-                                     const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                     int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void NEStridedSliceKernel::configure(const ITensorInfo *input,
+                                     ITensorInfo       *output,
+                                     const Coordinates &starts,
+                                     const Coordinates &ends,
+                                     const BiStrides   &strides,
+                                     int32_t            begin_mask,
+                                     int32_t            end_mask,
+                                     int32_t            shrink_axis_mask)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
     _shrink_mask                   = shrink_axis_mask;
     const TensorShape &input_shape = input->tensor_shape();
     Coordinates        ends_abs;
-    std::tie(_starts_abs, ends_abs, _final_strides) = arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(
-                                                          input_shape,
-                                                          starts, ends, strides,
-                                                          begin_mask, end_mask, shrink_axis_mask);
+    std::tie(_starts_abs, ends_abs, _final_strides) =
+        arm_compute::helpers::tensor_transform::calculate_strided_slice_coords(input_shape, starts, ends, strides,
+                                                                               begin_mask, end_mask, shrink_axis_mask);
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    auto win_config =
+        validate_and_configure_window(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     INEKernel::configure(win_config.second);
 }
 
-Status NEStridedSliceKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                      const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                      int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status NEStridedSliceKernel::validate(const ITensorInfo *input,
+                                      const ITensorInfo *output,
+                                      const Coordinates &starts,
+                                      const Coordinates &ends,
+                                      const BiStrides   &strides,
+                                      int32_t            begin_mask,
+                                      int32_t            end_mask,
+                                      int32_t            shrink_axis_mask)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(),
-                                                              starts, ends, strides, begin_mask, end_mask, shrink_axis_mask)
-                                .first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), starts, ends,
+                                                              strides, begin_mask, end_mask, shrink_axis_mask)
+                                    .first);
 
     return Status{};
 }
@@ -156,7 +174,7 @@
 
     size_t length_x = win.shape()[0];
 
-    if(_final_strides[0] == 1 && !is_shrink_x)
+    if (_final_strides[0] == 1 && !is_shrink_x)
     {
         win.set(Window::DimX, Window::Dimension(0, 1, 1));
         width_size = width_size * length_x;
@@ -183,16 +201,17 @@
     uint8_t *cur_ptr;
 
     execute_window_loop(
-        win, [&](const Coordinates & id)
-    {
-        cur_ptr = input_base;
-        cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0;
-        cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1;
-        cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2;
-        cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3;
+        win,
+        [&](const Coordinates &id)
+        {
+            cur_ptr = input_base;
+            cur_ptr += (start_0 + (id[idx_x] * shrinked_stride_0)) * byte_increment_0;
+            cur_ptr += (start_1 + (id[idx_y] * shrinked_stride_1)) * byte_increment_1;
+            cur_ptr += (start_2 + (id[idx_z] * shrinked_stride_2)) * byte_increment_2;
+            cur_ptr += (start_3 + (id[idx_w] * shrinked_stride_3)) * byte_increment_3;
 
-        std::copy_n(cur_ptr, width_size, output_it.ptr());
-    },
-    output_it);
+            std::copy_n(cur_ptr, width_size, output_it.ptr());
+        },
+        output_it);
 }
 } // namespace arm_compute
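
For orientation, a sketch of the usual begin/end mask semantics for one dimension of a strided slice, consistent with the mask documentation in the header below. This is an assumed restatement of the semantics, not the library's tensor_transform helpers:

    #include <cstdint>
    #include <iostream>

    // Illustrative sketch: number of elements selected in one dimension, where
    // a set mask bit means "use the fullest possible range" and negative
    // indices count from the end.
    int64_t slice_length(int64_t dim, int64_t start, int64_t end, int64_t stride,
                         bool begin_masked, bool end_masked)
    {
        if (begin_masked)
            start = (stride > 0) ? 0 : dim - 1;
        else if (start < 0)
            start += dim;
        if (end_masked)
            end = (stride > 0) ? dim : -1;
        else if (end < 0)
            end += dim;
        const int64_t span = (stride > 0) ? end - start : start - end;
        const int64_t step = (stride > 0) ? stride : -stride;
        return span <= 0 ? 0 : (span + step - 1) / step;
    }

    int main()
    {
        // dim = 10, start = 2, stride = 3, end masked: selects indices 2, 5, 8.
        std::cout << slice_length(10, 2, 0, 3, false, true) << '\n'; // prints 3
    }
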
diff --git a/src/core/NEON/kernels/NEStridedSliceKernel.h b/src/core/NEON/kernels/NEStridedSliceKernel.h
index 9ce5174..a475f09 100644
--- a/src/core/NEON/kernels/NEStridedSliceKernel.h
+++ b/src/core/NEON/kernels/NEStridedSliceKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_NE_STRIDED_SLICE_KERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 #include <cstdint>
@@ -68,9 +69,14 @@
      * @param[in]  shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
      *                              A slice of size 1 starting from starts[i] in the dimension must be preserved.
      */
-    void configure(const ITensorInfo *input, ITensorInfo *output,
-                   const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                   int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+    void configure(const ITensorInfo *input,
+                   ITensorInfo       *output,
+                   const Coordinates &starts,
+                   const Coordinates &ends,
+                   const BiStrides   &strides,
+                   int32_t            begin_mask,
+                   int32_t            end_mask,
+                   int32_t            shrink_axis_mask);
 
     /** Static function to check if given info will lead to a valid configuration of @ref NEStridedSliceKernel
      *
@@ -86,9 +92,14 @@
      * @param[in] shrink_axis_mask If the ith bit of shrink_axis_mask is set, it implies that the ith specification shrinks the dimensionality by 1.
      *                             A slice of size 1 starting from starts[i] in the dimension must be preserved.
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output,
-                           const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                           int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask);
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           const Coordinates &starts,
+                           const Coordinates &ends,
+                           const BiStrides   &strides,
+                           int32_t            begin_mask,
+                           int32_t            end_mask,
+                           int32_t            shrink_axis_mask);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
diff --git a/src/core/NEON/kernels/NETileKernel.cpp b/src/core/NEON/kernels/NETileKernel.cpp
index 94256dc..577ce5b 100644
--- a/src/core/NEON/kernels/NETileKernel.cpp
+++ b/src/core/NEON/kernels/NETileKernel.cpp
@@ -27,9 +27,10 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -43,15 +44,13 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON(multiples.size() > 4);
     ARM_COMPUTE_RETURN_ERROR_ON(multiples.empty());
-    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e)
-    {
-        return e == 0;
-    }));
+    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(multiples.begin(), multiples.end(), [](uint32_t e) { return e == 0; }));
 
     // Validate output if initialized
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            misc::shape_calculator::compute_tiled_shape(input->tensor_shape(), multiples), output->tensor_shape());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
@@ -59,8 +58,7 @@
 }
 } // namespace
 
-NETileKernel::NETileKernel()
-    : _input(nullptr), _output(nullptr)
+NETileKernel::NETileKernel() : _input(nullptr), _output(nullptr)
 {
 }
 
@@ -95,8 +93,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
 
-    Window output_window{ window };
-    output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(), _input->info()->dimension(0)));
+    Window output_window{window};
+    output_window.set(Window::DimX, Window::Dimension(output_window.x().start(), output_window.x().end(),
+                                                      _input->info()->dimension(0)));
     Window out_slice = output_window.first_slice_window_1D();
 
     const auto src_shape = _input->info()->tensor_shape();
@@ -104,17 +103,19 @@
     {
         Iterator output_it(_output, out_slice);
 
-        execute_window_loop(out_slice, [&](const Coordinates & id)
-        {
-            const size_t x = id.x();
-            const size_t y = id.y();
-            const size_t z = id.z();
-            const size_t w = id[3];
-            Coordinates  input_coords{ x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3] };
-            memcpy(output_it.ptr(), _input->ptr_to_element(input_coords), _input->info()->dimension(0) * _input->info()->element_size());
-        },
-        output_it);
-    }
-    while(output_window.slide_window_slice_1D(out_slice));
+        execute_window_loop(
+            out_slice,
+            [&](const Coordinates &id)
+            {
+                const size_t x = id.x();
+                const size_t y = id.y();
+                const size_t z = id.z();
+                const size_t w = id[3];
+                Coordinates  input_coords{x % src_shape[0], y % src_shape[1], z % src_shape[2], w % src_shape[3]};
+                memcpy(output_it.ptr(), _input->ptr_to_element(input_coords),
+                       _input->info()->dimension(0) * _input->info()->element_size());
+            },
+            output_it);
+    } while (output_window.slide_window_slice_1D(out_slice));
 }
 } // namespace arm_compute
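
The tile loop above reduces to a modulo read: every output coordinate maps to the input coordinate wrapped by the source shape. A trivial sketch:

    #include <cstddef>
    #include <iostream>

    // Illustrative sketch: the wrap-around read performed per dimension by the
    // tile kernel loop above.
    std::size_t tiled_source_index(std::size_t out_coord, std::size_t src_dim)
    {
        return out_coord % src_dim;
    }

    int main()
    {
        // Tiling a width-3 row by 2: output x = 0..5 reads input x = 0 1 2 0 1 2.
        for (std::size_t x = 0; x < 6; ++x)
            std::cout << tiled_source_index(x, 3) << ' ';
    }
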
diff --git a/src/core/NEON/kernels/assembly/depthwise.hpp b/src/core/NEON/kernels/assembly/depthwise.hpp
index dbd47cc..13c2d31 100644
--- a/src/core/NEON/kernels/assembly/depthwise.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise.hpp
@@ -38,9 +38,8 @@
     DepthwiseMethod method = DepthwiseMethod::DEFAULT;
     std::string     filter = "";
 
-    DepthwiseConfig(DepthwiseMethod method)
-        : method(method) {};
-    DepthwiseConfig() {};
+    DepthwiseConfig(DepthwiseMethod method) : method(method){};
+    DepthwiseConfig(){};
 };
 
 struct DepthwiseArgs
@@ -63,18 +62,24 @@
 
     bool fast_mode = false;
 
-    DepthwiseArgs(
-        const CPUInfo *cpu_info,
-        unsigned int kernel_rows, unsigned int kernel_cols,
-        unsigned int stride_rows, unsigned int stride_cols,
-        unsigned int dilation_rows, unsigned int dilation_cols,
-        unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
-        unsigned int input_channels,
-        unsigned int output_rows, unsigned int output_cols,
-        unsigned int  channel_multiplier,
-        PaddingValues padding, arm_gemm::Activation activation,
+    DepthwiseArgs(const CPUInfo       *cpu_info,
+                  unsigned int         kernel_rows,
+                  unsigned int         kernel_cols,
+                  unsigned int         stride_rows,
+                  unsigned int         stride_cols,
+                  unsigned int         dilation_rows,
+                  unsigned int         dilation_cols,
+                  unsigned int         n_batches,
+                  unsigned int         input_rows,
+                  unsigned int         input_cols,
+                  unsigned int         input_channels,
+                  unsigned int         output_rows,
+                  unsigned int         output_cols,
+                  unsigned int         channel_multiplier,
+                  PaddingValues        padding,
+                  arm_gemm::Activation activation,
 
-        const DepthwiseConfig *config)
+                  const DepthwiseConfig *config)
         : cpu_info(cpu_info),
           kernel_rows(kernel_rows),
           kernel_cols(kernel_cols),
@@ -95,20 +100,38 @@
     {
     }
 
-    DepthwiseArgs(
-        const CPUInfo *cpu_info,
-        unsigned int kernel_rows, unsigned int kernel_cols,
-        unsigned int stride_rows, unsigned int stride_cols,
-        unsigned int n_batches, unsigned int input_rows, unsigned int input_cols,
-        unsigned int input_channels,
-        unsigned int output_rows, unsigned int output_cols,
-        unsigned int  channel_multiplier,
-        PaddingValues padding, arm_gemm::Activation activation,
-        const DepthwiseConfig *config)
-        : DepthwiseArgs(cpu_info, kernel_rows, kernel_cols, stride_rows,
-                        stride_cols, 1, 1, n_batches, input_rows, input_cols,
-                        input_channels, output_rows, output_cols,
-                        channel_multiplier, padding, activation, config)
+    DepthwiseArgs(const CPUInfo         *cpu_info,
+                  unsigned int           kernel_rows,
+                  unsigned int           kernel_cols,
+                  unsigned int           stride_rows,
+                  unsigned int           stride_cols,
+                  unsigned int           n_batches,
+                  unsigned int           input_rows,
+                  unsigned int           input_cols,
+                  unsigned int           input_channels,
+                  unsigned int           output_rows,
+                  unsigned int           output_cols,
+                  unsigned int           channel_multiplier,
+                  PaddingValues          padding,
+                  arm_gemm::Activation   activation,
+                  const DepthwiseConfig *config)
+        : DepthwiseArgs(cpu_info,
+                        kernel_rows,
+                        kernel_cols,
+                        stride_rows,
+                        stride_cols,
+                        1,
+                        1,
+                        n_batches,
+                        input_rows,
+                        input_cols,
+                        input_channels,
+                        output_rows,
+                        output_cols,
+                        channel_multiplier,
+                        padding,
+                        activation,
+                        config)
     {
     }
 };
@@ -127,17 +150,18 @@
     {
     }
 
-    Tile()
-        : Tile(nullptr, 0, 0, 0)
+    Tile() : Tile(nullptr, 0, 0, 0)
     {
     }
 
-    void load_from(
-        const TInput      *input,
-        const unsigned int ld_row, const unsigned int ld_col,
-        const unsigned int n_rows, const unsigned int n_cols,
-        const int input_i, const int input_j,
-        const unsigned int channel_multiplier) const
+    void load_from(const TInput      *input,
+                   const unsigned int ld_row,
+                   const unsigned int ld_col,
+                   const unsigned int n_rows,
+                   const unsigned int n_cols,
+                   const int          input_i,
+                   const int          input_j,
+                   const unsigned int channel_multiplier) const
     {
         const auto pad_top  = input_i < 0 ? -input_i : 0;
         const auto pad_left = input_j < 0 ? -input_j : 0;
@@ -145,18 +169,15 @@
         const auto padded_rows = std::min(n_rows - input_i, tile_rows) - pad_top;
         const auto padded_cols = std::min(n_cols - input_j, tile_cols) - pad_left;
 
-        if(padded_rows < tile_rows || padded_cols < tile_cols)
+        if (padded_rows < tile_rows || padded_cols < tile_cols)
         {
             memset(array, 0, tile_rows * tile_cols * tile_channels * sizeof(TInput));
         }
 
-        do_premultiply<TInput>(
-            (TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col,
-            ld_row, ld_col,
-            array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
-            tile_cols * tile_channels, tile_channels,
-            padded_rows, padded_cols, tile_channels / channel_multiplier,
-            channel_multiplier);
+        do_premultiply<TInput>((TInput *)input + std::max(input_i, 0) * ld_row + std::max(input_j, 0) * ld_col, ld_row,
+                               ld_col, array + pad_top * tile_cols * tile_channels + pad_left * tile_channels,
+                               tile_cols * tile_channels, tile_channels, padded_rows, padded_cols,
+                               tile_channels / channel_multiplier, channel_multiplier);
     }
 };
 
@@ -168,9 +189,8 @@
     std::string         m_name{};
 
 public:
-    DepthwiseCommon(const DepthwiseArgs &args)
-        : m_args(args) {};
-    DepthwiseCommon(DepthwiseCommon &) = delete;
+    DepthwiseCommon(const DepthwiseArgs &args) : m_args(args){};
+    DepthwiseCommon(DepthwiseCommon &)            = delete;
     DepthwiseCommon &operator=(DepthwiseCommon &) = delete;
 
     std::string name() const override
@@ -181,19 +201,18 @@
     void set_name(std::string name)
     {
         // Only allow the name to be set once
-        if(m_name.empty())
+        if (m_name.empty())
         {
             m_name = name;
         }
     }
 
-    void execute(
-        const void *const  input,
-        const void *const  parameters,
-        void *const        output,
-        void *const        working_space,
-        const unsigned int thread_id,
-        const unsigned int n_threads) const override final
+    void execute(const void *const  input,
+                 const void *const  parameters,
+                 void *const        output,
+                 void *const        working_space,
+                 const unsigned int thread_id,
+                 const unsigned int n_threads) const override final
     {
         const size_t ld_input_col    = m_args.input_channels;
         const size_t ld_input_row    = ld_input_col * m_args.input_cols;
@@ -202,56 +221,47 @@
         const size_t ld_output_row   = ld_output_col * m_args.output_cols;
         const size_t ld_output_batch = ld_output_row * m_args.output_rows;
 
-        execute(
-            input, ld_input_col, ld_input_row, ld_input_batch,
-            parameters, output, ld_output_col, ld_output_row, ld_output_batch,
-            working_space, thread_id, n_threads);
+        execute(input, ld_input_col, ld_input_row, ld_input_batch, parameters, output, ld_output_col, ld_output_row,
+                ld_output_batch, working_space, thread_id, n_threads);
     }
 
-    void execute(
-        const void *const  input,
-        size_t             ld_input_col,
-        size_t             ld_input_row,
-        size_t             ld_input_batch,
-        const void *const  parameters,
-        void *const        output,
-        size_t             ld_output_col,
-        size_t             ld_output_row,
-        size_t             ld_output_batch,
-        void *const        working_space,
-        const unsigned int thread_id,
-        const unsigned int n_threads) const override final
+    void execute(const void *const  input,
+                 size_t             ld_input_col,
+                 size_t             ld_input_row,
+                 size_t             ld_input_batch,
+                 const void *const  parameters,
+                 void *const        output,
+                 size_t             ld_output_col,
+                 size_t             ld_output_row,
+                 size_t             ld_output_batch,
+                 void *const        working_space,
+                 const unsigned int thread_id,
+                 const unsigned int n_threads) const override final
     {
-        execute(
-            m_args.n_batches, m_args.input_rows, m_args.input_cols,
-            m_args.input_channels, m_args.padding,
-            input, ld_input_col, ld_input_row, ld_input_batch,
-            parameters,
-            m_args.output_rows, m_args.output_cols,
-            output, ld_output_col, ld_output_row, ld_output_batch,
-            working_space, thread_id, n_threads);
+        execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.input_channels, m_args.padding, input,
+                ld_input_col, ld_input_row, ld_input_batch, parameters, m_args.output_rows, m_args.output_cols, output,
+                ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, n_threads);
     }
 
-    void execute(
-        unsigned int         batches,
-        unsigned int         input_height,
-        unsigned int         input_width,
-        unsigned int         channels,
-        const PaddingValues &padding,
-        const void          *input,
-        size_t               ld_input_col,
-        size_t               ld_input_row,
-        size_t               ld_input_batch,
-        const void          *parameters,
-        unsigned int         output_height,
-        unsigned int         output_width,
-        void                *output,
-        size_t               ld_output_col,
-        size_t               ld_output_row,
-        size_t               ld_output_batch,
-        void                *working_space,
-        unsigned int         thread_id,
-        unsigned int         n_threads) const override final
+    void execute(unsigned int         batches,
+                 unsigned int         input_height,
+                 unsigned int         input_width,
+                 unsigned int         channels,
+                 const PaddingValues &padding,
+                 const void          *input,
+                 size_t               ld_input_col,
+                 size_t               ld_input_row,
+                 size_t               ld_input_batch,
+                 const void          *parameters,
+                 unsigned int         output_height,
+                 unsigned int         output_width,
+                 void                *output,
+                 size_t               ld_output_col,
+                 size_t               ld_output_row,
+                 size_t               ld_output_batch,
+                 void                *working_space,
+                 unsigned int         thread_id,
+                 unsigned int         n_threads) const override final
     {
         // Construct a new set of arguments to reflect that we might have been
         // passed different input/output tensors. Dilation is handled at this
@@ -271,38 +281,33 @@
         auto ld_output_col_d = ld_output_col * m_args.dilation_cols;
         auto ld_output_row_d = ld_output_row * m_args.dilation_rows;
 
-        for(size_t drow = 0; drow < m_args.dilation_rows; drow++)
+        for (size_t drow = 0; drow < m_args.dilation_rows; drow++)
         {
             size_t start_i;
-            std::tie(args.output_rows, args.input_rows, start_i,
-                     args.padding.top, args.padding.bottom) =
-                         get_reduced_view_for_dilation(
-                             output_height, input_height, drow, m_args.dilation_rows,
-                             m_args.kernel_rows, m_args.stride_rows, padding.top);
+            std::tie(args.output_rows, args.input_rows, start_i, args.padding.top, args.padding.bottom) =
+                get_reduced_view_for_dilation(output_height, input_height, drow, m_args.dilation_rows,
+                                              m_args.kernel_rows, m_args.stride_rows, padding.top);
 
             auto input_row  = static_cast<const TInput *>(input) + start_i * ld_input_row;
             auto output_row = static_cast<TOutput *>(output) + drow * ld_output_row;
 
-            if(args.output_rows)
+            if (args.output_rows)
             {
-                for(size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
+                for (size_t dcol = 0; dcol < m_args.dilation_cols; dcol++)
                 {
                     size_t start_j;
-                    std::tie(args.output_cols, args.input_cols, start_j,
-                             args.padding.left, args.padding.right) =
-                                 get_reduced_view_for_dilation(
-                                     output_width, input_width, dcol, m_args.dilation_cols,
-                                     m_args.kernel_cols, m_args.stride_cols, padding.left);
+                    std::tie(args.output_cols, args.input_cols, start_j, args.padding.left, args.padding.right) =
+                        get_reduced_view_for_dilation(output_width, input_width, dcol, m_args.dilation_cols,
+                                                      m_args.kernel_cols, m_args.stride_cols, padding.left);
 
                     const TInput *input_col  = input_row + start_j * ld_input_col;
                     TOutput      *output_col = output_row + dcol * ld_output_col;
 
-                    if(args.output_cols)
+                    if (args.output_cols)
                     {
-                        this->execute_internal(
-                            args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch, parameters,
-                            output_col, ld_output_col_d, ld_output_row_d, ld_output_batch,
-                            working_space, thread_id, n_threads);
+                        this->execute_internal(args, input_col, ld_input_col_d, ld_input_row_d, ld_input_batch,
+                                               parameters, output_col, ld_output_col_d, ld_output_row_d,
+                                               ld_output_batch, working_space, thread_id, n_threads);
                     }
                 }
             }
@@ -310,20 +315,19 @@
     }
 
 protected:
-    virtual void execute_internal(
-        const DepthwiseArgs &instance_args,
-        const void          *input,
-        size_t               ld_input_col,
-        size_t               ld_input_row,
-        size_t               ld_input_batch,
-        const void          *parameters,
-        void                *output,
-        size_t               ld_output_col,
-        size_t               ld_output_row,
-        size_t               ld_output_batch,
-        void                *working_space,
-        unsigned int         thread_id,
-        unsigned int         n_threads) const = 0;
+    virtual void execute_internal(const DepthwiseArgs &instance_args,
+                                  const void          *input,
+                                  size_t               ld_input_col,
+                                  size_t               ld_input_row,
+                                  size_t               ld_input_batch,
+                                  const void          *parameters,
+                                  void                *output,
+                                  size_t               ld_output_col,
+                                  size_t               ld_output_row,
+                                  size_t               ld_output_batch,
+                                  void                *working_space,
+                                  unsigned int         thread_id,
+                                  unsigned int         n_threads) const = 0;
 
     virtual bool uses_premultiply() const
     {
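For context on the dilation handling reformatted above: the outer drow/dcol loops split a dilated depthwise convolution into one dense sub-problem per dilation phase, each executed with its leading dimensions scaled by the dilation factor. A minimal standalone sketch of the per-phase output count (illustrative only; the library's get_reduced_view_for_dilation() additionally returns input extents, the start offset and the per-view padding):

    #include <cstddef>
    #include <iostream>

    int main()
    {
        // Toy sizes, not taken from the library.
        const size_t out_size = 10, dilation = 3;
        for (size_t d = 0; d < dilation; d++)
        {
            // Output pixels d, d + dilation, d + 2 * dilation, ... belong to phase d.
            const size_t n = (out_size > d) ? (out_size - d + dilation - 1) / dilation : 0;
            std::cout << "phase " << d << ": " << n << " output pixels\n";
        }
        return 0;
    }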
diff --git a/src/core/NEON/kernels/assembly/depthwise_common.hpp b/src/core/NEON/kernels/assembly/depthwise_common.hpp
index a5db793..5ff848e 100644
--- a/src/core/NEON/kernels/assembly/depthwise_common.hpp
+++ b/src/core/NEON/kernels/assembly/depthwise_common.hpp
@@ -49,11 +49,7 @@
     bool            is_default     = false;
     uint64_t        cycle_estimate = 0;
 
-    KernelDescription(
-        DepthwiseMethod method,
-        std::string     name,
-        bool            is_default,
-        uint64_t        cycle_estimate)
+    KernelDescription(DepthwiseMethod method, std::string name, bool is_default, uint64_t cycle_estimate)
         : method(method), name(name), is_default(is_default), cycle_estimate(cycle_estimate)
     {
     }
@@ -78,58 +74,51 @@
     // pointer to the bias vector (which may be nullptr in the case of no bias) and
     // a pointer to the array of weights (stored in HWIO order).
     virtual void pack_parameters(
-        void       *buffer,
-        const void *biases,
-        const void *weights,
-        size_t      ld_weight_col = 0,
-        size_t      ld_weight_row = 0) = 0;
+        void *buffer, const void *biases, const void *weights, size_t ld_weight_col = 0, size_t ld_weight_row = 0) = 0;
 
     // Determine the amount of working space required
     virtual size_t get_working_size(unsigned int n_threads) const = 0;
 
     // Execute the convolution over the specified area of memory.
-    virtual void execute(
-        const void *input,       // Pointer to input tensor
-        const void *parameters,  // Packed parameters buffer
-        void        *output,
-        void        *working_space,
-        unsigned int thread_id,
-        unsigned int n_threads) const = 0;
+    virtual void execute(const void  *input,      // Pointer to input tensor
+                         const void  *parameters, // Packed parameters buffer
+                         void        *output,
+                         void        *working_space,
+                         unsigned int thread_id,
+                         unsigned int n_threads) const = 0;
 
-    virtual void execute(
-        const void *input,
-        size_t       ld_input_col,
-        size_t       ld_input_row,
-        size_t       ld_input_batch,
-        const void *parameters,
-        void        *output,
-        size_t       ld_output_col,
-        size_t       ld_output_row,
-        size_t       ld_output_batch,
-        void        *working_space,
-        unsigned int thread_id,
-        unsigned int n_threads) const = 0;
+    virtual void execute(const void  *input,
+                         size_t       ld_input_col,
+                         size_t       ld_input_row,
+                         size_t       ld_input_batch,
+                         const void  *parameters,
+                         void        *output,
+                         size_t       ld_output_col,
+                         size_t       ld_output_row,
+                         size_t       ld_output_batch,
+                         void        *working_space,
+                         unsigned int thread_id,
+                         unsigned int n_threads) const = 0;
 
-    virtual void execute(
-        unsigned int batches,
-        unsigned int input_height,
-        unsigned int input_width,
-        unsigned int channels,
-        const PaddingValues &,
-        const void *input,
-        size_t       ld_input_col,
-        size_t       ld_input_row,
-        size_t       ld_input_batch,
-        const void *parameters,
-        unsigned int output_height,
-        unsigned int output_width,
-        void        *output,
-        size_t       ld_output_col,
-        size_t       ld_output_row,
-        size_t       ld_output_batch,
-        void        *working_space,
-        unsigned int thread_id,
-        unsigned int n_threads) const = 0;
+    virtual void execute(unsigned int batches,
+                         unsigned int input_height,
+                         unsigned int input_width,
+                         unsigned int channels,
+                         const PaddingValues &,
+                         const void  *input,
+                         size_t       ld_input_col,
+                         size_t       ld_input_row,
+                         size_t       ld_input_batch,
+                         const void  *parameters,
+                         unsigned int output_height,
+                         unsigned int output_width,
+                         void        *output,
+                         size_t       ld_output_col,
+                         size_t       ld_output_row,
+                         size_t       ld_output_batch,
+                         void        *working_space,
+                         unsigned int thread_id,
+                         unsigned int n_threads) const = 0;
 };
 
 // To handle a dilation factor of D execute the kernel once for each d in
@@ -145,12 +134,13 @@
 // - Number of valid input pixels corresponding to `d`
 // - Offset of the first pixel corresponding to `d`
 // - Amount of padding in the view for `d`
-std::tuple<size_t, size_t, size_t, size_t, size_t>
-get_reduced_view_for_dilation(
-    size_t out_size, size_t in_size,
-    size_t d, size_t dilation_factor,
-    size_t kernel_size, size_t stride,
-    size_t pad_before);
+std::tuple<size_t, size_t, size_t, size_t, size_t> get_reduced_view_for_dilation(size_t out_size,
+                                                                                 size_t in_size,
+                                                                                 size_t d,
+                                                                                 size_t dilation_factor,
+                                                                                 size_t kernel_size,
+                                                                                 size_t stride,
+                                                                                 size_t pad_before);
 
 } // namespace depthwise
 } // namespace arm_conv
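On the HWIO weight ordering that pack_parameters() above expects, a hedged sketch of the linear offset, assuming the conventional row-major reading of HWIO (height, width, input channels, output channels; this interpretation is an assumption, not stated in the header):

    #include <cstddef>

    // Hypothetical helper, not part of the library: linear offset of weight
    // element (h, w, i, o) in an HWIO buffer of shape [H, W, I, O], with O
    // the fastest-moving dimension.
    inline std::size_t hwio_offset(std::size_t h, std::size_t w, std::size_t i, std::size_t o,
                                   std::size_t W, std::size_t I, std::size_t O)
    {
        return ((h * W + w) * I + i) * O + o;
    }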
diff --git a/src/core/NEON/kernels/assembly/pool_common.hpp b/src/core/NEON/kernels/assembly/pool_common.hpp
index f1f70cf..045f9f9 100644
--- a/src/core/NEON/kernels/assembly/pool_common.hpp
+++ b/src/core/NEON/kernels/assembly/pool_common.hpp
@@ -68,45 +68,42 @@
     virtual size_t get_working_size(unsigned int num_threads) const = 0;
 
     // Execute pooling over the specified area of memory.
-    virtual void execute(
-        const void *const input,
-        void *const       output,
-        void             *working_space,
-        unsigned int      thread_id,
-        unsigned int      num_threads) const = 0;
+    virtual void execute(const void *const input,
+                         void *const       output,
+                         void             *working_space,
+                         unsigned int      thread_id,
+                         unsigned int      num_threads) const = 0;
 
-    virtual void execute(
-        const void *const input,
-        size_t            ld_input_col,
-        size_t            ld_input_row,
-        size_t            ld_input_batch,
-        void *const       output,
-        size_t            ld_output_col,
-        size_t            ld_output_row,
-        size_t            ld_output_batch,
-        void             *working_space,
-        unsigned int      thread_id,
-        unsigned int      num_threads) const = 0;
+    virtual void execute(const void *const input,
+                         size_t            ld_input_col,
+                         size_t            ld_input_row,
+                         size_t            ld_input_batch,
+                         void *const       output,
+                         size_t            ld_output_col,
+                         size_t            ld_output_row,
+                         size_t            ld_output_batch,
+                         void             *working_space,
+                         unsigned int      thread_id,
+                         unsigned int      num_threads) const = 0;
 
-    virtual void execute(
-        unsigned int      batches,
-        unsigned int      height,
-        unsigned int      width,
-        unsigned int      channels,
-        const void *const input,
-        size_t            ld_input_col,
-        size_t            ld_input_row,
-        size_t            ld_input_batch,
-        const PaddingValues &,
-        unsigned int output_height,
-        unsigned int output_width,
-        void *const  output,
-        size_t       ld_output_col,
-        size_t       ld_output_row,
-        size_t       ld_output_batch,
-        void        *working_space,
-        unsigned int thread_id,
-        unsigned int num_threads) const = 0;
+    virtual void execute(unsigned int      batches,
+                         unsigned int      height,
+                         unsigned int      width,
+                         unsigned int      channels,
+                         const void *const input,
+                         size_t            ld_input_col,
+                         size_t            ld_input_row,
+                         size_t            ld_input_batch,
+                         const PaddingValues &,
+                         unsigned int output_height,
+                         unsigned int output_width,
+                         void *const  output,
+                         size_t       ld_output_col,
+                         size_t       ld_output_row,
+                         size_t       ld_output_batch,
+                         void        *working_space,
+                         unsigned int thread_id,
+                         unsigned int num_threads) const = 0;
 };
 
 } // namespace pooling
diff --git a/src/core/NEON/kernels/assembly/pooling.hpp b/src/core/NEON/kernels/assembly/pooling.hpp
index e8db35c..89d5942 100644
--- a/src/core/NEON/kernels/assembly/pooling.hpp
+++ b/src/core/NEON/kernels/assembly/pooling.hpp
@@ -36,9 +36,8 @@
     PoolingMethod method = PoolingMethod::DEFAULT;
     std::string   filter = "";
 
-    PoolingConfig(PoolingMethod method)
-        : method(method) {};
-    PoolingConfig() {};
+    PoolingConfig(PoolingMethod method) : method(method){};
+    PoolingConfig(){};
 };
 
 struct PoolingArgs
@@ -57,30 +56,40 @@
 
     const PoolingConfig *config;
 
-    PoolingArgs(
-        const CPUInfo       *cpu_info,
-        PoolingType          pool_type,
-        const PoolingWindow &window,
-        const PoolingStride &stride,
-        bool                 exclude_padding,
-        unsigned int         n_batches,
-        unsigned int         input_rows,
-        unsigned int         input_cols,
-        unsigned int         n_channels,
-        unsigned int         output_rows,
-        unsigned int         output_cols,
-        const PaddingValues &padding,
-        const PoolingConfig *cfg)
-        : cpu_info(cpu_info), pool_type(pool_type), pool_window(window), pool_stride(stride), exclude_padding(exclude_padding), n_batches(n_batches), input_rows(input_rows), input_cols(input_cols),
-          n_channels(n_channels), output_rows(output_rows), output_cols(output_cols), padding(padding), config(cfg)
+    PoolingArgs(const CPUInfo       *cpu_info,
+                PoolingType          pool_type,
+                const PoolingWindow &window,
+                const PoolingStride &stride,
+                bool                 exclude_padding,
+                unsigned int         n_batches,
+                unsigned int         input_rows,
+                unsigned int         input_cols,
+                unsigned int         n_channels,
+                unsigned int         output_rows,
+                unsigned int         output_cols,
+                const PaddingValues &padding,
+                const PoolingConfig *cfg)
+        : cpu_info(cpu_info),
+          pool_type(pool_type),
+          pool_window(window),
+          pool_stride(stride),
+          exclude_padding(exclude_padding),
+          n_batches(n_batches),
+          input_rows(input_rows),
+          input_cols(input_cols),
+          n_channels(n_channels),
+          output_rows(output_rows),
+          output_cols(output_cols),
+          padding(padding),
+          config(cfg)
     {
         // If either of the pooling window dimensions is set to zero, meaning
         // "pool everything", then replace with the corresponding input dimension.
-        if(pool_window.rows == 0)
+        if (pool_window.rows == 0)
         {
             pool_window.rows = input_rows;
         }
-        if(pool_window.cols == 0)
+        if (pool_window.cols == 0)
         {
             pool_window.cols = input_cols;
         }
@@ -100,10 +109,16 @@
     int32_t per_layer_right_shift = 0;
     int32_t per_layer_mul         = 0;
 
-    Requantize32(int32_t input_offset, int32_t output_offset,
-                 int32_t per_layer_left_shift, int32_t per_layer_right_shift,
+    Requantize32(int32_t input_offset,
+                 int32_t output_offset,
+                 int32_t per_layer_left_shift,
+                 int32_t per_layer_right_shift,
                  int32_t per_layer_mul)
-        : input_offset(input_offset), output_offset(output_offset), per_layer_left_shift(per_layer_left_shift), per_layer_right_shift(per_layer_right_shift), per_layer_mul(per_layer_mul)
+        : input_offset(input_offset),
+          output_offset(output_offset),
+          per_layer_left_shift(per_layer_left_shift),
+          per_layer_right_shift(per_layer_right_shift),
+          per_layer_mul(per_layer_mul)
     {
     }
 };
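A hedged reading of the five fields above, assuming the conventional Q0.31 fixed-point requantization used elsewhere in arm_gemm (the authoritative semantics live in the requantize implementation, not in this header):

    y \approx \mathrm{sat}\!\left(\frac{(x \cdot 2^{s_l})\, M}{2^{31+s_r}} + z_{\mathrm{out}}\right)

where M is per_layer_mul, s_l and s_r are per_layer_left_shift and per_layer_right_shift, z_out is output_offset, and input_offset is assumed to have been subtracted from the accumulator x beforehand.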
@@ -115,105 +130,88 @@
     const PoolingArgs m_args;
 
 public:
-    PoolingCommon(const PoolingArgs &args)
-        : m_args(args)
+    PoolingCommon(const PoolingArgs &args) : m_args(args)
     {
     }
-    PoolingCommon(PoolingCommon &) = delete;
+    PoolingCommon(PoolingCommon &)            = delete;
     PoolingCommon &operator=(PoolingCommon &) = delete;
 
     size_t get_working_size(unsigned int) const override = 0;
 
     // Execute pooling over the specified area of memory.
-    void execute(
-        const void *const input,
-        void *const       output,
-        void             *working_space,
-        unsigned int      thread_id,
-        unsigned int      num_threads) const override
+    void execute(const void *const input,
+                 void *const       output,
+                 void             *working_space,
+                 unsigned int      thread_id,
+                 unsigned int      num_threads) const override
     {
-        this->execute(
-            input,
-            m_args.n_channels,
-            m_args.n_channels * m_args.input_cols,
-            m_args.n_channels * m_args.input_cols * m_args.input_rows,
-            output,
-            m_args.n_channels,
-            m_args.n_channels * m_args.output_cols,
-            m_args.n_channels * m_args.output_cols * m_args.output_rows,
-            working_space,
-            thread_id, num_threads);
+        this->execute(input, m_args.n_channels, m_args.n_channels * m_args.input_cols,
+                      m_args.n_channels * m_args.input_cols * m_args.input_rows, output, m_args.n_channels,
+                      m_args.n_channels * m_args.output_cols,
+                      m_args.n_channels * m_args.output_cols * m_args.output_rows, working_space, thread_id,
+                      num_threads);
     }
 
-    void execute(
-        const void *const input,
-        size_t            ld_input_col,
-        size_t            ld_input_row,
-        size_t            ld_input_batch,
-        void *const       output,
-        size_t            ld_output_col,
-        size_t            ld_output_row,
-        size_t            ld_output_batch,
-        void             *working_space,
-        unsigned int      thread_id,
-        unsigned int      num_threads) const override
+    void execute(const void *const input,
+                 size_t            ld_input_col,
+                 size_t            ld_input_row,
+                 size_t            ld_input_batch,
+                 void *const       output,
+                 size_t            ld_output_col,
+                 size_t            ld_output_row,
+                 size_t            ld_output_batch,
+                 void             *working_space,
+                 unsigned int      thread_id,
+                 unsigned int      num_threads) const override
     {
-        this->execute(
-            m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels,
-            input, ld_input_col, ld_input_row, ld_input_batch,
-            m_args.padding, m_args.output_rows, m_args.output_cols,
-            output, ld_output_col, ld_output_row, ld_output_batch,
-            working_space, thread_id, num_threads);
+        this->execute(m_args.n_batches, m_args.input_rows, m_args.input_cols, m_args.n_channels, input, ld_input_col,
+                      ld_input_row, ld_input_batch, m_args.padding, m_args.output_rows, m_args.output_cols, output,
+                      ld_output_col, ld_output_row, ld_output_batch, working_space, thread_id, num_threads);
     }
 
-    void execute(
-        unsigned int         batches,
-        unsigned int         height,
-        unsigned int         width,
-        unsigned int         channels,
-        const void *const    input,
-        size_t               ld_input_col,
-        size_t               ld_input_row,
-        size_t               ld_input_batch,
-        const PaddingValues &padding,
-        unsigned int         output_height,
-        unsigned int         output_width,
-        void *const          output,
-        size_t               ld_output_col,
-        size_t               ld_output_row,
-        size_t               ld_output_batch,
-        void                *working_space,
-        unsigned int         thread_id,
-        unsigned int         num_threads) const override
+    void execute(unsigned int         batches,
+                 unsigned int         height,
+                 unsigned int         width,
+                 unsigned int         channels,
+                 const void *const    input,
+                 size_t               ld_input_col,
+                 size_t               ld_input_row,
+                 size_t               ld_input_batch,
+                 const PaddingValues &padding,
+                 unsigned int         output_height,
+                 unsigned int         output_width,
+                 void *const          output,
+                 size_t               ld_output_col,
+                 size_t               ld_output_row,
+                 size_t               ld_output_batch,
+                 void                *working_space,
+                 unsigned int         thread_id,
+                 unsigned int         num_threads) const override
     {
-        this->execute_internal(
-            batches, height, width, channels, padding,
-            input, ld_input_col, ld_input_row, ld_input_batch,
-            output_height, output_width,
-            output, ld_output_col, ld_output_row, ld_output_batch,
-            working_space, thread_id, num_threads);
+        this->execute_internal(batches, height, width, channels, padding, input, ld_input_col, ld_input_row,
+                               ld_input_batch, output_height, output_width, output, ld_output_col, ld_output_row,
+                               ld_output_batch, working_space, thread_id, num_threads);
     }
 
 protected:
-    virtual void execute_internal(
-        unsigned int batches,
-        unsigned int height,
-        unsigned int width,
-        unsigned int channels,
-        const PaddingValues &,
-        const void *const input,
-        size_t            ld_input_col,
-        size_t            ld_input_row,
-        size_t            ld_input_batch,
-        unsigned int      output_height,
-        unsigned int      output_width,
-        void *const       output,
-        size_t            ld_output_col,
-        size_t            ld_output_row,
-        size_t            ld_output_batch,
-        void             *working_space,
-        unsigned int      thread_id,
-        unsigned int      num_threads) const = 0;
+    virtual void execute_internal(unsigned int batches,
+                                  unsigned int height,
+                                  unsigned int width,
+                                  unsigned int channels,
+                                  const PaddingValues &,
+                                  const void *const input,
+                                  size_t            ld_input_col,
+                                  size_t            ld_input_row,
+                                  size_t            ld_input_batch,
+                                  unsigned int      output_height,
+                                  unsigned int      output_width,
+                                  void *const       output,
+                                  size_t            ld_output_col,
+                                  size_t            ld_output_row,
+                                  size_t            ld_output_batch,
+                                  void             *working_space,
+                                  unsigned int      thread_id,
+                                  unsigned int      num_threads) const = 0;
 };
 
 template <typename TInput, typename TOutput>
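The stride-free execute() overload above assumes a dense NHWC layout and derives the leading dimensions from the pooling arguments before delegating, as visible in the reformatted call. A standalone sketch of that stride arithmetic (the sizes here are illustrative, not from the library):

    #include <cstddef>
    #include <iostream>

    int main()
    {
        const size_t n_channels = 16, input_cols = 8, input_rows = 8;  // example sizes
        const size_t ld_col   = n_channels;                            // step between adjacent columns
        const size_t ld_row   = n_channels * input_cols;               // step between adjacent rows
        const size_t ld_batch = n_channels * input_cols * input_rows;  // step between batches
        std::cout << ld_col << ' ' << ld_row << ' ' << ld_batch << '\n';
        return 0;
    }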
diff --git a/src/core/NEON/kernels/assembly/premultiply.hpp b/src/core/NEON/kernels/assembly/premultiply.hpp
index 16f26de..fb97cf8 100644
--- a/src/core/NEON/kernels/assembly/premultiply.hpp
+++ b/src/core/NEON/kernels/assembly/premultiply.hpp
@@ -44,30 +44,27 @@
                     const unsigned     input_channels,
                     const unsigned int channel_multiplier)
 {
-    if(sizeof(T) == 4 && channel_multiplier == 6)
+    if (sizeof(T) == 4 && channel_multiplier == 6)
     {
-        do_premultiply_float_6(
-            (const float *)in_ptr, ld_row, ld_col,
-            (float *)out_ptr, out_ld_row, out_ld_col,
-            tile_rows, tile_cols,
-            input_channels);
+        do_premultiply_float_6((const float *)in_ptr, ld_row, ld_col, (float *)out_ptr, out_ld_row, out_ld_col,
+                               tile_rows, tile_cols, input_channels);
     }
     else
     {
-        for(unsigned int i = 0; i < tile_rows; i++)
+        for (unsigned int i = 0; i < tile_rows; i++)
         {
             const T *ip2 = in_ptr + i * ld_row;
             T       *op2 = out_ptr + i * out_ld_row;
-            for(unsigned int j = 0; j < tile_cols; j++)
+            for (unsigned int j = 0; j < tile_cols; j++)
             {
                 const T *ip = ip2;
                 T       *op = op2;
-                for(unsigned int c = 0; c < input_channels; c++)
+                for (unsigned int c = 0; c < input_channels; c++)
                 {
                     T val = *ip;
                     ip++;
 
-                    for(unsigned int r = 0; r < channel_multiplier; r++)
+                    for (unsigned int r = 0; r < channel_multiplier; r++)
                     {
                         op[r] = val;
                     }
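The scalar fallback above replicates each of the input_channels values channel_multiplier times along the output channels; the specialised do_premultiply_float_6() path handles the float, multiplier-of-6 case. A toy standalone equivalent (buffer contents and sizes are illustrative):

    #include <iostream>
    #include <vector>

    int main()
    {
        const unsigned input_channels = 3, channel_multiplier = 2; // illustrative only
        const std::vector<float> in = {1.f, 2.f, 3.f};
        std::vector<float>       out(input_channels * channel_multiplier);
        for (unsigned c = 0; c < input_channels; c++)
        {
            for (unsigned r = 0; r < channel_multiplier; r++)
            {
                out[c * channel_multiplier + r] = in[c]; // repeat channel c
            }
        }
        for (float v : out)
        {
            std::cout << v << ' '; // prints: 1 1 2 2 3 3
        }
        std::cout << '\n';
        return 0;
    }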
diff --git a/src/core/NEON/kernels/assembly/winograd.hpp b/src/core/NEON/kernels/assembly/winograd.hpp
index 5029075..dbf95d2 100644
--- a/src/core/NEON/kernels/assembly/winograd.hpp
+++ b/src/core/NEON/kernels/assembly/winograd.hpp
@@ -45,17 +45,24 @@
     Shape2D              kernel_shape;
     arm_gemm::Activation activation;
 
-    ConvolutionArgs(
-        unsigned int   n_batches,
-        const Shape2D &input_shape,
-        unsigned int   n_input_channels,
-        unsigned int pad_top, unsigned int pad_left,
-        const Shape2D              &output_shape,
-        unsigned int                n_output_channels,
-        const Shape2D               kernel_shape,
-        const arm_gemm::Activation &activation = {})
-        : n_batches(n_batches), input_shape(input_shape), n_input_channels(n_input_channels), pad_top(pad_top), pad_left(pad_left), output_shape(output_shape), n_output_channels(n_output_channels),
-          kernel_shape(kernel_shape), activation(activation)
+    ConvolutionArgs(unsigned int                n_batches,
+                    const Shape2D              &input_shape,
+                    unsigned int                n_input_channels,
+                    unsigned int                pad_top,
+                    unsigned int                pad_left,
+                    const Shape2D              &output_shape,
+                    unsigned int                n_output_channels,
+                    const Shape2D               kernel_shape,
+                    const arm_gemm::Activation &activation = {})
+        : n_batches(n_batches),
+          input_shape(input_shape),
+          n_input_channels(n_input_channels),
+          pad_top(pad_top),
+          pad_left(pad_left),
+          output_shape(output_shape),
+          n_output_channels(n_output_channels),
+          kernel_shape(kernel_shape),
+          activation(activation)
     {
     }
 };
@@ -105,23 +112,30 @@
     virtual unsigned int get_transformed_tile_rows(void) const = 0;
     virtual unsigned int get_transformed_tile_cols(void) const = 0;
 
-    void execute(
-        const ConvolutionArgs &args,
-        const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
-        void *outptr, const WinogradDomainSpec &wds,
-        unsigned int thread_id, unsigned int n_threads) const
+    void execute(const ConvolutionArgs    &args,
+                 const void               *inptr,
+                 size_t                    ld_in_row,
+                 size_t                    ld_in_col,
+                 size_t                    ld_input_channel,
+                 void                     *outptr,
+                 const WinogradDomainSpec &wds,
+                 unsigned int              thread_id,
+                 unsigned int              n_threads) const
     {
-        this->execute(
-            args, inptr, ld_in_row, ld_in_col, ld_input_channel,
-            outptr, wds.weight_ld_matrix, wds.weight_ld_row,
-            thread_id, n_threads);
+        this->execute(args, inptr, ld_in_row, ld_in_col, ld_input_channel, outptr, wds.weight_ld_matrix,
+                      wds.weight_ld_row, thread_id, n_threads);
     }
 
-    virtual void execute(
-        const ConvolutionArgs &args,
-        const void *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_input_channel,
-        void *outptr, size_t ld_out_matrix, size_t ld_out_row,
-        unsigned int thread_id, unsigned int n_threads) const = 0;
+    virtual void execute(const ConvolutionArgs &args,
+                         const void            *inptr,
+                         size_t                 ld_in_row,
+                         size_t                 ld_in_col,
+                         size_t                 ld_input_channel,
+                         void                  *outptr,
+                         size_t                 ld_out_matrix,
+                         size_t                 ld_out_row,
+                         unsigned int           thread_id,
+                         unsigned int           n_threads) const = 0;
 };
 
 } // namespace weight_transform
@@ -136,27 +150,35 @@
     virtual unsigned int get_input_rows(void) const = 0;
     virtual unsigned int get_input_cols(void) const = 0;
 
-    virtual size_t get_working_space_size(
-        const ConvolutionArgs &args,
-        unsigned int           n_threads) const = 0;
+    virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
 
-    void execute(
-        const ConvolutionArgs &args,
-        const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
-        void *outptr, const WinogradDomainSpec &wds,
-        void *working_space, unsigned int thread_id, unsigned int n_threads) const
+    void execute(const ConvolutionArgs    &args,
+                 const void               *inptr,
+                 size_t                    ld_in_batch,
+                 size_t                    ld_in_row,
+                 size_t                    ld_in_col,
+                 void                     *outptr,
+                 const WinogradDomainSpec &wds,
+                 void                     *working_space,
+                 unsigned int              thread_id,
+                 unsigned int              n_threads) const
     {
-        this->execute(
-            args, inptr, ld_in_batch, ld_in_row, ld_in_col,
-            outptr, wds.input_ld_batch, wds.input_ld_matrix, wds.input_ld_row,
-            working_space, thread_id, n_threads);
+        this->execute(args, inptr, ld_in_batch, ld_in_row, ld_in_col, outptr, wds.input_ld_batch, wds.input_ld_matrix,
+                      wds.input_ld_row, working_space, thread_id, n_threads);
     }
 
-    virtual void execute(
-        const ConvolutionArgs &args,
-        const void *inptr, size_t ld_in_batch, size_t ld_in_row, size_t ld_in_col,
-        void *outptr, size_t ld_out_batch, size_t ld_out_matrix, size_t ld_out_row,
-        void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
+    virtual void execute(const ConvolutionArgs &args,
+                         const void            *inptr,
+                         size_t                 ld_in_batch,
+                         size_t                 ld_in_row,
+                         size_t                 ld_in_col,
+                         void                  *outptr,
+                         size_t                 ld_out_batch,
+                         size_t                 ld_out_matrix,
+                         size_t                 ld_out_row,
+                         void                  *working_space,
+                         unsigned int           thread_id,
+                         unsigned int           n_threads) const = 0;
 };
 
 } // namespace input_transform
@@ -177,31 +199,37 @@
     virtual unsigned int get_kernel_rows(void) const = 0;
     virtual unsigned int get_kernel_cols(void) const = 0;
 
-    virtual size_t get_working_space_size(
-        const ConvolutionArgs &args,
-        unsigned int           n_threads) const = 0;
+    virtual size_t get_working_space_size(const ConvolutionArgs &args, unsigned int n_threads) const = 0;
 
-    void execute(
-        const ConvolutionArgs &args,
-        const void *inptr, const WinogradDomainSpec &wds,
-        const void *bias,
-        void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
-        void *working_space, unsigned int thread_id, unsigned int n_threads) const
+    void execute(const ConvolutionArgs    &args,
+                 const void               *inptr,
+                 const WinogradDomainSpec &wds,
+                 const void               *bias,
+                 void                     *outptr,
+                 size_t                    ld_out_batch,
+                 size_t                    ld_out_row,
+                 size_t                    ld_out_col,
+                 void                     *working_space,
+                 unsigned int              thread_id,
+                 unsigned int              n_threads) const
     {
-        this->execute(
-            args,
-            inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row,
-            bias,
-            outptr, ld_out_batch, ld_out_row, ld_out_col,
-            working_space, thread_id, n_threads);
+        this->execute(args, inptr, wds.output_ld_batch, wds.output_ld_matrix, wds.output_ld_row, bias, outptr,
+                      ld_out_batch, ld_out_row, ld_out_col, working_space, thread_id, n_threads);
     }
 
-    virtual void execute(
-        const ConvolutionArgs &args,
-        const void *inptr, size_t ld_in_batch, size_t ld_in_matrix, size_t ld_in_row,
-        const void *bias,
-        void *outptr, size_t ld_out_batch, size_t ld_out_row, size_t ld_out_col,
-        void *working_space, unsigned int thread_id, unsigned int n_threads) const = 0;
+    virtual void execute(const ConvolutionArgs &args,
+                         const void            *inptr,
+                         size_t                 ld_in_batch,
+                         size_t                 ld_in_matrix,
+                         size_t                 ld_in_row,
+                         const void            *bias,
+                         void                  *outptr,
+                         size_t                 ld_out_batch,
+                         size_t                 ld_out_row,
+                         size_t                 ld_out_col,
+                         void                  *working_space,
+                         unsigned int           thread_id,
+                         unsigned int           n_threads) const = 0;
 };
 
 } // namespace output_transform
@@ -210,7 +238,7 @@
 {
     const output_transform::ITransform *output_transform = nullptr;
     const weight_transform::ITransform *weight_transform = nullptr;
-    const input_transform::ITransform *input_transform  = nullptr;
+    const input_transform::ITransform  *input_transform  = nullptr;
     std::unique_ptr<arm_gemm::GemmArgs> gemm_args;
     WinogradDomainSpec                  winograd_spec;
 };
@@ -220,15 +248,18 @@
  * Assigns to the pointers in the `dest` struct and returns true or false to
  * indicate whether the given problem can be executed or not.
  */
-template <typename TIn, typename TWeight = TIn, typename TOut = TIn, typename TWinogradIn = TIn, typename TWinogradOut = TOut>
-bool get_implementation(
-    WinogradImpl &dest, // Destination for the selected implementation
-    const CPUInfo *,
-    const ConvolutionArgs &,
-    int  max_threads,
-    bool fast_mode,
-    const WinogradConfig *,
-    const arm_gemm::GemmConfig *);
+template <typename TIn,
+          typename TWeight      = TIn,
+          typename TOut         = TIn,
+          typename TWinogradIn  = TIn,
+          typename TWinogradOut = TOut>
+bool get_implementation(WinogradImpl &dest, // Destination for the selected implementation
+                        const CPUInfo *,
+                        const ConvolutionArgs &,
+                        int  max_threads,
+                        bool fast_mode,
+                        const WinogradConfig *,
+                        const arm_gemm::GemmConfig *);
 
 } // namespace winograd
 } // namespace arm_conv
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
index ed5254a..e3d9b67 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp16.cpp
@@ -24,8 +24,9 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
+
 #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
@@ -37,12 +38,26 @@
 {
 namespace
 {
-using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
-                                      float epsilon, ActivationLayerInfo &act_info, const Window &window);
+using BatchNomalizationPtr = void (*)(ITensor             *src,
+                                      ITensor             *dst,
+                                      const ITensor       *mean,
+                                      const ITensor       *var,
+                                      const ITensor       *beta,
+                                      const ITensor       *gamma,
+                                      float                epsilon,
+                                      ActivationLayerInfo &act_info,
+                                      const Window        &window);
 
 template <typename T>
-void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
-                         float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void batch_normalization(ITensor             *src,
+                         ITensor             *dst,
+                         const ITensor       *mean,
+                         const ITensor       *var,
+                         const ITensor       *beta,
+                         const ITensor       *gamma,
+                         float                epsilon,
+                         ActivationLayerInfo &act_info,
+                         const Window        &window)
 {
     /** SIMD vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
@@ -57,86 +72,99 @@
     Iterator input(src, win_collapsed);
     Iterator output(dst, win_collapsed);
 
-    const auto input_mean  = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var  = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma =
+        (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta =
+        (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
 
     T activation_functor(act_info);
 
     const auto epsilon_vec = wrapper::vdup_n(static_cast<float16_t>(epsilon), ExactTagType{});
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
-        // Perform core calculations using vector operations
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            // Conctruct vectors
-            const auto mean_vec  = wrapper::vloadq(input_mean + x);
-            const auto var_vec   = wrapper::vloadq(input_var + x);
-            const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
-            const auto beta_vec  = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
+            const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
 
-            // Calculate denominator
-            const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
-            // Calculate x bar
-            const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
-            const auto x_bar     = wrapper::vmul(numerator, denominator);
-            auto       res       = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
-            // Perform fused activation
-            if(act_info.enabled())
+            // Perform core calculations using vector operations
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                activation_functor(res);
+                // Construct vectors
+                const auto mean_vec  = wrapper::vloadq(input_mean + x);
+                const auto var_vec   = wrapper::vloadq(input_var + x);
+                const auto gamma_vec = (input_gamma != nullptr)
+                                           ? wrapper::vloadq(input_gamma + x)
+                                           : wrapper::vdup_n(static_cast<float16_t>(1.f), ExactTagType{});
+                const auto beta_vec  = (input_beta != nullptr)
+                                           ? wrapper::vloadq(input_beta + x)
+                                           : wrapper::vdup_n(static_cast<float16_t>(0.f), ExactTagType{});
+
+                // Calculate denominator
+                const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+                // Calculate x bar
+                const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+                const auto x_bar     = wrapper::vmul(numerator, denominator);
+                auto       res       = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+                // Perform fused activation
+                if (act_info.enabled())
+                {
+                    activation_functor(res);
+                }
+
+                // Store results
+                wrapper::vstore(output_ptr + x, res);
             }
 
-            // Store results
-            wrapper::vstore(output_ptr + x, res);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            // Conctruct vectors
-            const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
-            const float16_t beta  = (input_beta != nullptr) ? input_beta[x] : 0.f;
-
-            const float16_t denominator = sqrt(input_var[x] + epsilon);
-            const float16_t numerator   = input_ptr[x] - input_mean[x];
-            const float16_t x_bar       = numerator / denominator;
-            float16_t       res         = beta + x_bar * gamma;
-
-            // Perform fused activation
-            if(act_info.enabled())
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
             {
-                activation_functor(res);
-            }
+                // Construct scalar values
+                const float16_t gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
+                const float16_t beta  = (input_beta != nullptr) ? input_beta[x] : 0.f;
 
-            // Store results
-            *reinterpret_cast<float16_t *>(output_ptr + x) = res;
-        }
-    },
-    input, output);
+                const float16_t denominator = sqrt(input_var[x] + epsilon);
+                const float16_t numerator   = input_ptr[x] - input_mean[x];
+                const float16_t x_bar       = numerator / denominator;
+                float16_t       res         = beta + x_bar * gamma;
+
+                // Perform fused activation
+                if (act_info.enabled())
+                {
+                    activation_functor(res);
+                }
+
+                // Store results
+                *reinterpret_cast<float16_t *>(output_ptr + x) = res;
+            }
+        },
+        input, output);
 }
 
 // Fused Batch Normalization with activation functions
-static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map =
-{
-    { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float16_t, 8>> },
-    { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float16_t, 8>> },
-    { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float16_t, 8>> }
-};
-}
+static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = {
+    {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float16_t, 8>>},
+    {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float16_t, 8>>},
+    {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float16_t, 8>>}};
+} // namespace
 namespace cpu
 {
-void fp16_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
-                                   float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp16_neon_batch_normalization(ITensor             *src,
+                                   ITensor             *dst,
+                                   const ITensor       *mean,
+                                   const ITensor       *var,
+                                   const ITensor       *beta,
+                                   const ITensor       *gamma,
+                                   float                epsilon,
+                                   ActivationLayerInfo &act_info,
+                                   const Window        &window)
 {
-    if(act_info.enabled())
+    if (act_info.enabled())
     {
         fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window);
     }
diff --git a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
index d6e22e1..4e1654e 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/NEON/fp32.cpp
@@ -24,8 +24,9 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEMath.h"
+
 #include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
+#include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
@@ -36,12 +37,26 @@
 {
 namespace
 {
-using BatchNomalizationPtr = void (*)(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
-                                      float epsilon, ActivationLayerInfo &act_info, const Window &window);
+using BatchNomalizationPtr = void (*)(ITensor             *src,
+                                      ITensor             *dst,
+                                      const ITensor       *mean,
+                                      const ITensor       *var,
+                                      const ITensor       *beta,
+                                      const ITensor       *gamma,
+                                      float                epsilon,
+                                      ActivationLayerInfo &act_info,
+                                      const Window        &window);
 
 template <typename T>
-void batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
-                         float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void batch_normalization(ITensor             *src,
+                         ITensor             *dst,
+                         const ITensor       *mean,
+                         const ITensor       *var,
+                         const ITensor       *beta,
+                         const ITensor       *gamma,
+                         float                epsilon,
+                         ActivationLayerInfo &act_info,
+                         const Window        &window)
 {
     /** SIMD vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
@@ -56,86 +71,99 @@
     Iterator input(src, win_collapsed);
     Iterator output(dst, win_collapsed);
 
-    const auto input_mean  = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var  = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma =
+        (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta =
+        (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
 
     T activation_functor(act_info);
 
     const auto epsilon_vec = wrapper::vdup_n(static_cast<float>(epsilon), ExactTagType{});
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
-        // Perform core calculations using vector operations
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            // Conctruct vectors
-            const auto mean_vec  = wrapper::vloadq(input_mean + x);
-            const auto var_vec   = wrapper::vloadq(input_var + x);
-            const auto gamma_vec = (input_gamma != nullptr) ? wrapper::vloadq(input_gamma + x) : wrapper::vdup_n(static_cast<float>(1.f), ExactTagType{});
-            const auto beta_vec  = (input_beta != nullptr) ? wrapper::vloadq(input_beta + x) : wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
+            const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<float *>(output.ptr());
 
-            // Calculate denominator
-            const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-
-            // Calculate x bar
-            const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
-            const auto x_bar     = wrapper::vmul(numerator, denominator);
-            auto       res       = wrapper::vmla(beta_vec, x_bar, gamma_vec);
-
-            // Perform fused activation
-            if(act_info.enabled())
+            // Perform core calculations using vector operations
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                activation_functor(res);
+                // Construct vectors
+                const auto mean_vec  = wrapper::vloadq(input_mean + x);
+                const auto var_vec   = wrapper::vloadq(input_var + x);
+                const auto gamma_vec = (input_gamma != nullptr)
+                                           ? wrapper::vloadq(input_gamma + x)
+                                           : wrapper::vdup_n(static_cast<float>(1.f), ExactTagType{});
+                const auto beta_vec  = (input_beta != nullptr)
+                                           ? wrapper::vloadq(input_beta + x)
+                                           : wrapper::vdup_n(static_cast<float>(0.f), ExactTagType{});
+
+                // Calculate denominator
+                const auto denominator = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+
+                // Calculate x bar
+                const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
+                const auto x_bar     = wrapper::vmul(numerator, denominator);
+                auto       res       = wrapper::vmla(beta_vec, x_bar, gamma_vec);
+
+                // Perform fused activation
+                if (act_info.enabled())
+                {
+                    activation_functor(res);
+                }
+
+                // Store results
+                wrapper::vstore(output_ptr + x, res);
             }
 
-            // Store results
-            wrapper::vstore(output_ptr + x, res);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            // Conctruct vectors
-            const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
-            const float beta  = (input_beta != nullptr) ? input_beta[x] : 0.f;
-
-            const float denominator = sqrt(input_var[x] + epsilon);
-            const float numerator   = input_ptr[x] - input_mean[x];
-            const float x_bar       = numerator / denominator;
-            float       res         = beta + x_bar * gamma;
-
-            // Perform fused activation
-            if(act_info.enabled())
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
             {
-                activation_functor(res);
-            }
+                // Construct scalar values
+                const float gamma = (input_gamma != nullptr) ? input_gamma[x] : 1.f;
+                const float beta  = (input_beta != nullptr) ? input_beta[x] : 0.f;
 
-            // Store results
-            *reinterpret_cast<float *>(output_ptr + x) = res;
-        }
-    },
-    input, output);
+                const float denominator = sqrt(input_var[x] + epsilon);
+                const float numerator   = input_ptr[x] - input_mean[x];
+                const float x_bar       = numerator / denominator;
+                float       res         = beta + x_bar * gamma;
+
+                // Perform fused activation
+                if (act_info.enabled())
+                {
+                    activation_functor(res);
+                }
+
+                // Store results
+                *reinterpret_cast<float *>(output_ptr + x) = res;
+            }
+        },
+        input, output);
 }
 
 // Fused Batch Normalization with activation functions
-static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map =
-{
-    { ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float, 4>> },
-    { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float, 4>> },
-    { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float, 4>> }
-};
-}
+static std::map<ActivationLayerInfo::ActivationFunction, BatchNomalizationPtr> fused_map = {
+    {ActivationLayerInfo::ActivationFunction::RELU, &batch_normalization<detail::relu<float, 4>>},
+    {ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &batch_normalization<detail::brelu<float, 4>>},
+    {ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &batch_normalization<detail::lubrelu<float, 4>>}};
+} // namespace
 namespace cpu
 {
-void fp32_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
-                                   float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp32_neon_batch_normalization(ITensor             *src,
+                                   ITensor             *dst,
+                                   const ITensor       *mean,
+                                   const ITensor       *var,
+                                   const ITensor       *beta,
+                                   const ITensor       *gamma,
+                                   float                epsilon,
+                                   ActivationLayerInfo &act_info,
+                                   const Window        &window)
 {
-    if(act_info.enabled())
+    if (act_info.enabled())
     {
         fused_map[act_info.activation()](src, dst, mean, var, beta, gamma, epsilon, act_info, window);
     }
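
For reference, both the vectorised path and the scalar left-over loop in this kernel implement the same per-element transform: normalise with the channel mean and variance, then scale and shift. A minimal scalar sketch of that arithmetic (illustrative only, not part of this patch; needs <cmath>):

    // y = gamma * (x - mean) / sqrt(var + epsilon) + beta
    inline float batch_norm_ref(float x, float mean, float var, float gamma, float beta, float epsilon)
    {
        const float x_bar = (x - mean) / std::sqrt(var + epsilon); // normalise
        return beta + x_bar * gamma;                               // scale and shift
    }
    // A fused ReLU would additionally clamp the result: res = std::max(res, 0.f);
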
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
index 98cd9aa..48caaa3 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp16.cpp
@@ -25,6 +25,7 @@
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/NEON/SVEMath.h"
 
 #include <cmath>
@@ -37,8 +38,15 @@
 {
 namespace cpu
 {
-void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
-                                  float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp16_sve_batch_normalization(ITensor             *src,
+                                  ITensor             *dst,
+                                  const ITensor       *mean,
+                                  const ITensor       *var,
+                                  const ITensor       *beta,
+                                  const ITensor       *gamma,
+                                  float                epsilon,
+                                  ActivationLayerInfo &act_info,
+                                  const Window        &window)
 {
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
@@ -49,69 +57,74 @@
     Iterator input(src, win_collapsed);
     Iterator output(dst, win_collapsed);
 
-    const auto input_mean  = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_mean = reinterpret_cast<const float16_t *>(mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var  = reinterpret_cast<const float16_t *>(var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma =
+        (gamma != nullptr) ? reinterpret_cast<const float16_t *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta =
+        (beta != nullptr) ? reinterpret_cast<const float16_t *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
 
     const auto epsilon_vec = svdup_n_f16(epsilon);
     const auto const_1     = svdup_n_f16(1.f);
     const auto const_0     = svdup_n_f16(0.f);
     const auto va          = svdup_n_f16(act_info.a());
     const auto vb          = svdup_n_f16(act_info.b());
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
-        // Compute S elements per iteration
-        int      x  = window_start_x;
-        svbool_t pg = svwhilelt_b16(x, window_end_x);
-        do
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            // Conctruct vectors
-            const auto mean_vec  = svld1_f16(pg, input_mean + x);
-            const auto var_vec   = svld1_f16(pg, input_var + x);
-            const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1;
-            const auto beta_vec  = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0;
+            const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
 
-            // Calculate denominator
-            const auto tmp         = svadd_f16_z(pg, var_vec, epsilon_vec);
-            auto       denominator = svrsqrte_f16(tmp);
-            denominator            = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
-            denominator            = svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
-
-            // Calculate x bar
-            const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec);
-            const auto x_bar     = svmul_f16_z(pg, numerator, denominator);
-            auto       res       = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec);
-
-            // Perform fused activation
-            if(act_info.enabled())
+            // Compute S elements per iteration
+            int      x  = window_start_x;
+            svbool_t pg = svwhilelt_b16(x, window_end_x);
+            do
             {
-                if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
-                {
-                    res = svmax_f16_z(pg, const_0, res);
-                }
-                else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
-                {
-                    res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res));
-                }
-                else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-                {
-                    res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res));
-                }
-            }
+                // Construct vectors
+                const auto mean_vec  = svld1_f16(pg, input_mean + x);
+                const auto var_vec   = svld1_f16(pg, input_var + x);
+                const auto gamma_vec = (input_gamma != nullptr) ? svld1_f16(pg, input_gamma + x) : const_1;
+                const auto beta_vec  = (input_beta != nullptr) ? svld1_f16(pg, input_beta + x) : const_0;
 
-            // Store results
-            svst1_f16(pg, output_ptr + x, res);
+                // Calculate denominator
+                const auto tmp         = svadd_f16_z(pg, var_vec, epsilon_vec);
+                auto       denominator = svrsqrte_f16(tmp);
+                denominator =
+                    svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
+                denominator =
+                    svmul_f16_z(pg, svrsqrts_f16(svmul_f16_z(pg, tmp, denominator), denominator), denominator);
 
-            x += svcntw();
-            pg = svwhilelt_b16(x, window_end_x);
-        }
-        while(svptest_any(svptrue_b16(), pg));
-    },
-    input, output);
+                // Calculate x bar
+                const auto numerator = svsub_f16_z(pg, svld1_f16(pg, input_ptr + x), mean_vec);
+                const auto x_bar     = svmul_f16_z(pg, numerator, denominator);
+                auto       res       = svmla_f16_z(pg, beta_vec, x_bar, gamma_vec);
+
+                // Perform fused activation
+                if (act_info.enabled())
+                {
+                    if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+                    {
+                        res = svmax_f16_z(pg, const_0, res);
+                    }
+                    else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                    {
+                        res = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, res));
+                    }
+                    else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                    {
+                        res = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, res));
+                    }
+                }
+
+                // Store results
+                svst1_f16(pg, output_ptr + x, res);
+
+                x += svcntw();
+                pg = svwhilelt_b16(x, window_end_x);
+            } while (svptest_any(svptrue_b16(), pg));
+        },
+        input, output);
 }
 } // namespace cpu
 } // namespace arm_compute
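
The denominator sequence above is a reciprocal square root: svrsqrte_f16 produces an initial estimate and each following svmul/svrsqrts line refines it with one Newton-Raphson step, since svrsqrts(a, b) returns (3 - a * b) / 2. A scalar sketch of one refinement step, assuming r0 is an initial estimate of 1 / sqrt(t):

    // One Newton-Raphson step for r ~ 1/sqrt(t): r' = r * (3 - t * r * r) / 2
    inline float refine_rsqrt(float t, float r0)
    {
        return r0 * ((3.f - t * r0 * r0) * 0.5f);
    }

Two such steps take the hardware estimate to effectively full precision for these data types.
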
diff --git a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
index 952ab32..df4fbfe 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
+++ b/src/core/NEON/kernels/batchnormalization/impl/SVE/fp32.cpp
@@ -25,6 +25,7 @@
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/NEON/SVEMath.h"
 
 #include <cmath>
@@ -37,8 +38,15 @@
 {
 namespace cpu
 {
-void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma,
-                                  float epsilon, ActivationLayerInfo &act_info, const Window &window)
+void fp32_sve_batch_normalization(ITensor             *src,
+                                  ITensor             *dst,
+                                  const ITensor       *mean,
+                                  const ITensor       *var,
+                                  const ITensor       *beta,
+                                  const ITensor       *gamma,
+                                  float                epsilon,
+                                  ActivationLayerInfo &act_info,
+                                  const Window        &window)
 {
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
@@ -49,69 +57,74 @@
     Iterator input(src, win_collapsed);
     Iterator output(dst, win_collapsed);
 
-    const auto input_mean  = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
-    const auto input_var   = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_mean = reinterpret_cast<const float *>(mean->ptr_to_element(Coordinates(0, 0)));
+    const auto input_var  = reinterpret_cast<const float *>(var->ptr_to_element(Coordinates(0, 0)));
+    const auto input_gamma =
+        (gamma != nullptr) ? reinterpret_cast<const float *>(gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_beta =
+        (beta != nullptr) ? reinterpret_cast<const float *>(beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
 
     const auto epsilon_vec = svdup_n_f32(epsilon);
     const auto const_1     = svdup_n_f32(1.f);
     const auto const_0     = svdup_n_f32(0.f);
     const auto va          = svdup_n_f32(act_info.a());
     const auto vb          = svdup_n_f32(act_info.b());
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
-        // Compute S elements per iteration
-        int      x  = window_start_x;
-        svbool_t pg = svwhilelt_b32(x, window_end_x);
-        do
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            // Conctruct vectors
-            const auto mean_vec  = svld1_f32(pg, input_mean + x);
-            const auto var_vec   = svld1_f32(pg, input_var + x);
-            const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1;
-            const auto beta_vec  = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0;
+            const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<float *>(output.ptr());
 
-            // Calculate denominator
-            const auto tmp         = svadd_f32_z(pg, var_vec, epsilon_vec);
-            auto       denominator = svrsqrte_f32(tmp);
-            denominator            = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
-            denominator            = svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
-
-            // Calculate x bar
-            const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec);
-            const auto x_bar     = svmul_f32_z(pg, numerator, denominator);
-            auto       res       = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec);
-
-            // Perform fused activation
-            if(act_info.enabled())
+            // Compute S elements per iteration
+            int      x  = window_start_x;
+            svbool_t pg = svwhilelt_b32(x, window_end_x);
+            do
             {
-                if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
-                {
-                    res = svmax_f32_z(pg, const_0, res);
-                }
-                else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
-                {
-                    res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res));
-                }
-                else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-                {
-                    res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res));
-                }
-            }
+                // Construct vectors
+                const auto mean_vec  = svld1_f32(pg, input_mean + x);
+                const auto var_vec   = svld1_f32(pg, input_var + x);
+                const auto gamma_vec = (input_gamma != nullptr) ? svld1_f32(pg, input_gamma + x) : const_1;
+                const auto beta_vec  = (input_beta != nullptr) ? svld1_f32(pg, input_beta + x) : const_0;
 
-            // Store results
-            svst1_f32(pg, output_ptr + x, res);
+                // Calculate denominator
+                const auto tmp         = svadd_f32_z(pg, var_vec, epsilon_vec);
+                auto       denominator = svrsqrte_f32(tmp);
+                denominator =
+                    svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
+                denominator =
+                    svmul_f32_z(pg, svrsqrts_f32(svmul_f32_z(pg, tmp, denominator), denominator), denominator);
 
-            x += svcntw();
-            pg = svwhilelt_b32(x, window_end_x);
-        }
-        while(svptest_any(svptrue_b32(), pg));
-    },
-    input, output);
+                // Calculate x bar
+                const auto numerator = svsub_f32_z(pg, svld1_f32(pg, input_ptr + x), mean_vec);
+                const auto x_bar     = svmul_f32_z(pg, numerator, denominator);
+                auto       res       = svmla_f32_z(pg, beta_vec, x_bar, gamma_vec);
+
+                // Perform fused activation
+                if (act_info.enabled())
+                {
+                    if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+                    {
+                        res = svmax_f32_z(pg, const_0, res);
+                    }
+                    else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                    {
+                        res = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, res));
+                    }
+                    else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                    {
+                        res = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, res));
+                    }
+                }
+
+                // Store results
+                svst1_f32(pg, output_ptr + x, res);
+
+                x += svcntw();
+                pg = svwhilelt_b32(x, window_end_x);
+            } while (svptest_any(svptrue_b32(), pg));
+        },
+        input, output);
 }
 } // namespace cpu
 } // namespace arm_compute
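
Unlike the NEON kernel earlier in this patch, neither SVE kernel needs a scalar left-over loop: the svwhilelt predicate masks the tail lanes automatically, and svptest_any ends the loop once no lanes remain. The control flow is equivalent to this scalar sketch (vector_len and process are placeholders, not library symbols):

    int x = window_start_x;
    while (x < window_end_x)                                       // svwhilelt + svptest_any
    {
        const int active = std::min(vector_len, window_end_x - x); // lanes enabled by pg
        process(x, active);                                        // predicated load/compute/store
        x += vector_len;                                           // x += svcnt*()
    }

As an aside, both kernels advance x by svcntw() (the 32-bit lane count); in the fp16 kernel the predicate covers twice that many lanes, so some lanes appear to be recomputed with identical results — harmless for correctness, though the loop runs more iterations than strictly needed.
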
diff --git a/src/core/NEON/kernels/batchnormalization/impl/list.h b/src/core/NEON/kernels/batchnormalization/impl/list.h
index 8e0ea36..cbf540b 100644
--- a/src/core/NEON/kernels/batchnormalization/impl/list.h
+++ b/src/core/NEON/kernels/batchnormalization/impl/list.h
@@ -28,9 +28,9 @@
 {
 namespace cpu
 {
-#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name)                                                                              \
-    void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, \
-                   float epsilon, ActivationLayerInfo &act_info, const Window &window)
+#define DECLARE_BATCH_NORMALIZATION_KERNEL(func_name)                                                        \
+    void func_name(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, \
+                   const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
 
 DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_neon_batch_normalization);
 DECLARE_BATCH_NORMALIZATION_KERNEL(fp16_sve_batch_normalization);
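
The reflowed macro expands to the same prototype as before; for example, DECLARE_BATCH_NORMALIZATION_KERNEL(fp32_neon_batch_normalization); yields a declaration matching the reformatted definition earlier in this patch:

    void fp32_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var,
                                       const ITensor *beta, const ITensor *gamma, float epsilon,
                                       ActivationLayerInfo &act_info, const Window &window);
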
diff --git a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
index 3900ea6..95cdc8f 100644
--- a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
+++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
@@ -158,8 +159,7 @@
      *
      * @param[in] act_info Activation layer information.
      */
-    explicit logistic(ActivationLayerInfo act_info)
-        : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
+    explicit logistic(ActivationLayerInfo act_info) : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
     {
         ARM_COMPUTE_UNUSED(act_info);
     }
@@ -198,8 +198,7 @@
      *
      * @param[in] act_info Activation layer information.
      */
-    explicit relu(ActivationLayerInfo act_info)
-        : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
+    explicit relu(ActivationLayerInfo act_info) : vzero(wrapper::vdup_n(static_cast<T>(0), ExactTagType{}))
     {
         ARM_COMPUTE_UNUSED(act_info);
     }
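
These functors are the types the batch-normalization kernels take as their template parameter (e.g. detail::relu<float, 4> in fused_map earlier in this patch). A usage sketch, assuming operator() clamps the register in place as the kernels' activation_functor(res) calls suggest (needs <arm_neon.h>):

    detail::relu<float, 4> act(act_info); // functor holding a vdup'd zero register
    float32x4_t res = vdupq_n_f32(-1.f);  // example input lanes
    act(res);                             // each lane becomes max(lane, 0)
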
diff --git a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
index ac196d9..50fff04 100644
--- a/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
+++ b/src/core/NEON/kernels/detail/NEColorConvertHelper.inl
@@ -25,6 +25,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IMultiImage.h"
 #include "arm_compute/core/Utils.h"
+
 #include "src/core/NEON/NEMath.h"
 
 #include <arm_neon.h>
@@ -50,8 +51,12 @@
 constexpr float rgb2u8_green_coef = 0.7152f;
 constexpr float rgb2u8_blue_coef  = 0.0722f;
 
-inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor,
-                                                const float rcoef, const float gcoef, const float bcoef)
+inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor,
+                                                const float32x4_t &gcolor,
+                                                const float32x4_t &bcolor,
+                                                const float        rcoef,
+                                                const float        gcoef,
+                                                const float        bcoef)
 {
     float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef);
     greyscale             = vmlaq_n_f32(greyscale, gcolor, gcoef);
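
The vmul/vmla chain computes the weighted luma sum from the rgb2u8_*_coef constants declared just above. Assuming the red coefficient (not visible in this hunk) is the usual BT.709 value 0.2126, the scalar form is:

    // grey = 0.2126 * R + 0.7152 * G + 0.0722 * B
    inline float rgb_to_grey(float r, float g, float b)
    {
        return 0.2126f * r + 0.7152f * g + 0.0722f * b;
    }
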
@@ -86,8 +91,12 @@
     arm_compute::convert_float32x4x4_to_uint8x16(out_float32, out);
 }
 
-inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
-                                   float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
+inline void rgb_to_yuv_calculation(const float32x4_t &rvec,
+                                   const float32x4_t &gvec,
+                                   const float32x4_t &bvec,
+                                   float32x4_t       &yvec,
+                                   float32x4_t       &uvec,
+                                   float32x4_t       &vvec)
 {
     /*
     Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
@@ -110,8 +119,12 @@
     vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
 }
 
-inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
-                                    float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
+inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val,
+                                    float32x4_t        uvec_val,
+                                    const float32x4_t &yyvec_val,
+                                    float32x4_t        vvec_val,
+                                    unsigned char     *output_ptr,
+                                    const bool         alpha)
 {
     float32x4x3_t rgb1, rgb2;
 
@@ -126,8 +139,7 @@
     // b = 1.8556f*f_u + 0.0000f*f_v;
     const auto red   = vmulq_n_f32(vvec_val, red_coef_bt709);
     const auto blue  = vmulq_n_f32(uvec_val, blue_coef_bt709);
-    const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
-                                 vmulq_n_f32(vvec_val, green_coef2_bt709));
+    const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709), vmulq_n_f32(vvec_val, green_coef2_bt709));
 
     // Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
     // the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t
@@ -144,7 +156,7 @@
     uint8x8x3_t u8_rgb;
     arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
 
-    if(!alpha)
+    if (!alpha)
     {
         vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
         vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
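
The coefficient comments above spell out the BT.709 YUV-to-RGB transform; only the blue line is fully visible in this hunk, and the red/green coefficients below are the standard BT.709 values, assumed here. A scalar sketch, with u and v already centred on zero as in the code:

    inline void yuv_to_rgb_ref(float y, float u, float v, float &r, float &g, float &b)
    {
        r = y + 1.5748f * v;               // red_coef_bt709
        g = y - 0.1873f * u - 0.4681f * v; // green_coef_bt709, green_coef2_bt709
        b = y + 1.8556f * u;               // blue_coef_bt709
    }
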
@@ -177,7 +189,7 @@
 {
     uint8x16x3_t rgb;
 
-    if(alpha)
+    if (alpha)
     {
         const auto tmp = vld4q_u8(ptr);
         rgb.val[0]     = tmp.val[0];
@@ -206,12 +218,12 @@
     float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
     float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
 
-    for(auto i = 0; i < 4; ++i)
+    for (auto i = 0; i < 4; ++i)
     {
-        rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
-                               fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
-        rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
-                               fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
+        rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i], fyvec_top.val[i], fuvec_top.val[i],
+                               fvvec_top.val[i]);
+        rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i], fyvec_bottom.val[i],
+                               fuvec_bottom.val[i], fvvec_bottom.val[i]);
     }
 
     arm_compute::convert_float32x4x4_to_uint8x16(fyvec_top, vec_top.val[0]);
@@ -222,9 +234,14 @@
     arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]);
 }
 
-inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
-                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
-                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+inline void store_rgb_to_nv12(const uint8x16_t &rvec_top,
+                              const uint8x16_t &gvec_top,
+                              const uint8x16_t &bvec_top,
+                              const uint8x16_t &rvec_bottom,
+                              const uint8x16_t &gvec_bottom,
+                              const uint8x16_t &bvec_bottom,
+                              unsigned char *const __restrict out_y_top,
+                              unsigned char *const __restrict out_y_bottom,
                               unsigned char *const __restrict out_uv)
 {
     uint8x16x3_t vec_top, vec_bottom;
@@ -252,9 +269,14 @@
     vst2_u8(out_uv, uvvec);
 }
 
-inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
-                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
-                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top,
+                              const uint8x16_t &gvec_top,
+                              const uint8x16_t &bvec_top,
+                              const uint8x16_t &rvec_bottom,
+                              const uint8x16_t &gvec_bottom,
+                              const uint8x16_t &bvec_bottom,
+                              unsigned char *const __restrict out_y_top,
+                              unsigned char *const __restrict out_y_bottom,
                               unsigned char *const __restrict out_u,
                               unsigned char *const __restrict out_v)
 {
@@ -273,14 +295,16 @@
 
     const auto uvvec_top    = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
     const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
-    const auto uvvec        = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
-                                        vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
+    const auto uvvec =
+        vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
 
     vst1_u8(out_u, vget_low_u8(uvvec));
     vst1_u8(out_v, vget_high_u8(uvvec));
 }
 
-inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
+inline void store_rgb_to_yuv4(const uint8x16_t &rvec,
+                              const uint8x16_t &gvec,
+                              const uint8x16_t &bvec,
                               unsigned char *const __restrict out_y,
                               unsigned char *const __restrict out_u,
                               unsigned char *const __restrict out_v)
@@ -291,10 +315,9 @@
     const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec);
 
     float32x4x4_t fyvec, fuvec, fvvec;
-    for(auto i = 0; i < 4; ++i)
+    for (auto i = 0; i < 4; ++i)
     {
-        rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
-                               fyvec.val[i], fuvec.val[i], fvvec.val[i]);
+        rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], fyvec.val[i], fuvec.val[i], fvvec.val[i]);
     }
 
     uint8x16_t yvec, uvec, vvec;
@@ -307,7 +330,7 @@
     vst1q_u8(out_v, vvec);
 }
 #endif /* DOXYGEN_SKIP_THIS */
-}
+} // namespace
 
 namespace arm_compute
 {
@@ -329,17 +352,19 @@
     Iterator in(input_ptr, win);
     Iterator out(output_ptr, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto   ta1 = vld3q_u8(in.ptr());
-        uint8x16x4_t ta2;
-        ta2.val[0] = ta1.val[0];
-        ta2.val[1] = ta1.val[1];
-        ta2.val[2] = ta1.val[2];
-        ta2.val[3] = vdupq_n_u8(255);
-        vst4q_u8(out.ptr(), ta2);
-    },
-    in, out);
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto   ta1 = vld3q_u8(in.ptr());
+            uint8x16x4_t ta2;
+            ta2.val[0] = ta1.val[0];
+            ta2.val[1] = ta1.val[1];
+            ta2.val[2] = ta1.val[2];
+            ta2.val[3] = vdupq_n_u8(255);
+            vst4q_u8(out.ptr(), ta2);
+        },
+        in, out);
 }
 
 /** Convert RGB to U8.
@@ -360,14 +385,16 @@
     Iterator in(input_ptr, win);
     Iterator out(output_ptr, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta1 = vld3q_u8(in.ptr());
-        uint8x16_t ta2;
-        rgb_to_u8_conversion(ta1, ta2);
-        vst1q_u8(out.ptr(), ta2);
-    },
-    in, out);
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto ta1 = vld3q_u8(in.ptr());
+            uint8x16_t ta2;
+            rgb_to_u8_conversion(ta1, ta2);
+            vst1q_u8(out.ptr(), ta2);
+        },
+        in, out);
 }
 
 /** Convert RGBX to RGB.
@@ -388,16 +415,18 @@
     Iterator in(input_ptr, win);
     Iterator out(output_ptr, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto   ta1 = vld4q_u8(in.ptr());
-        uint8x16x3_t ta2;
-        ta2.val[0] = ta1.val[0];
-        ta2.val[1] = ta1.val[1];
-        ta2.val[2] = ta1.val[2];
-        vst3q_u8(out.ptr(), ta2);
-    },
-    in, out);
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto   ta1 = vld4q_u8(in.ptr());
+            uint8x16x3_t ta2;
+            ta2.val[0] = ta1.val[0];
+            ta2.val[1] = ta1.val[1];
+            ta2.val[2] = ta1.val[2];
+            vst3q_u8(out.ptr(), ta2);
+        },
+        in, out);
 }
 
 /** Convert YUYV to RGB.
@@ -422,26 +451,32 @@
     Iterator in(input_ptr, win);
     Iterator out(output_ptr, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta = vld4q_u8(in.ptr());
-        //ta.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta.val[1] = U0 U2 U4 U6 ...
-        //ta.val[2] = Y1 Y3 Y5 Y7 ...
-        //ta.val[3] = V0 V2 V4 V7 ...
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto ta = vld4q_u8(in.ptr());
+            //ta.val[0] = Y0 Y2 Y4 Y6 ...
+            //ta.val[1] = U0 U2 U4 U6 ...
+            //ta.val[2] = Y1 Y3 Y5 Y7 ...
+            //ta.val[3] = V0 V2 V4 V6 ...
 
-        // Convert the uint8x16x4_t to float32x4x4_t
-        const float32x4x4_t yvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
-        const float32x4x4_t uvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
-        const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
-        const float32x4x4_t vvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
+            // Convert the uint8x16x4_t to float32x4x4_t
+            const float32x4x4_t yvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]);
+            const float32x4x4_t uvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]);
+            const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]);
+            const float32x4x4_t vvec  = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]);
 
-        yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
-    },
-    in, out);
+            yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size,
+                                    alpha);
+            yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size,
+                                    alpha);
+            yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size,
+                                    alpha);
+            yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size,
+                                    alpha);
+        },
+        in, out);
 }
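
Throughout these converters the 0 + shift / 1 - shift indexing selects between the YUYV and UYVY byte orders with a single variable, once vld4q_u8 has deinterleaved each four-byte group into ta.val[0..3]:

    // YUYV (shift = 0): Y -> val[0], val[2];  U -> val[1];  V -> val[3]
    // UYVY (shift = 1): Y -> val[1], val[3];  U -> val[0];  V -> val[2]
    const int  shift = is_uyvy ? 1 : 0;   // is_uyvy is a hypothetical flag
    const auto yvals = ta.val[0 + shift]; // first set of luma bytes
    const auto uvals = ta.val[1 - shift]; // chroma U bytes
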
 
 /** Convert NV12 to RGB.
@@ -475,35 +510,45 @@
     Iterator in_uv(input_ptr->plane(1), win_uv);
     Iterator out(output_ptr, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_y_top    = vld2q_u8(in_y.ptr());
-        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
-        const auto ta_uv       = vld2q_u8(in_uv.ptr());
-        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
-        //ta_uv.val[0] = U0 U2 U4 U6 ...
-        //ta_uv.val[1] = V0 V2 V4 V6 ...
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto ta_y_top    = vld2q_u8(in_y.ptr());
+            const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+            const auto ta_uv       = vld2q_u8(in_uv.ptr());
+            //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+            //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+            //ta_uv.val[0] = U0 U2 U4 U6 ...
+            //ta_uv.val[1] = V0 V2 V4 V6 ...
 
-        // Convert the uint8x16x4_t to float32x4x4_t
-        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
-        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
-        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
-        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
-        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
-        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
+            // Convert the uint8x16x4_t to float32x4x4_t
+            float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
+            float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
+            float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
+            float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
+            float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]);
+            float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]);
 
-        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0],
+                                    out.ptr() + 0 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1],
+                                    out.ptr() + 1 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2],
+                                    out.ptr() + 2 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3],
+                                    out.ptr() + 3 * element_size, alpha);
 
-        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
-    },
-    in_y, in_uv, out);
+            yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0],
+                                    out.ptr() + out_stride + 0 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1],
+                                    out.ptr() + out_stride + 1 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2],
+                                    out.ptr() + out_stride + 2 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3],
+                                    out.ptr() + out_stride + 3 * element_size, alpha);
+        },
+        in_y, in_uv, out);
 }
 
 /** Convert IYUV to RGB.
@@ -537,59 +582,71 @@
     Iterator in_v(input_ptr->plane(2), win_uv);
     Iterator out(output_ptr, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto *y_top_ptr    = in_y.ptr();
-        const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
-        const auto *u_ptr        = in_u.ptr();
-        const auto *v_ptr        = in_v.ptr();
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto *y_top_ptr    = in_y.ptr();
+            const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
+            const auto *u_ptr        = in_u.ptr();
+            const auto *v_ptr        = in_v.ptr();
 
         // Work-around issue in gcc (>= 9) where vld2q might cause issues with register allocation
 #if defined(__arch64__)
-        const auto ta0_y_top    = vld1q_u8(y_top_ptr);
-        const auto ta1_y_top    = vld1q_u8(y_top_ptr + 16);
-        const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
-        const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
-        const auto ta_u         = vld1q_u8(u_ptr);
-        const auto ta_v         = vld1q_u8(v_ptr);
+            const auto ta0_y_top    = vld1q_u8(y_top_ptr);
+            const auto ta1_y_top    = vld1q_u8(y_top_ptr + 16);
+            const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
+            const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
+            const auto ta_u         = vld1q_u8(u_ptr);
+            const auto ta_v         = vld1q_u8(v_ptr);
 
-        // Convert the uint8x16x4_t to float32x4x4_t
-        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
-        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
-        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
-        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
-        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
-        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+            // Convert the uint8x16x4_t to float32x4x4_t
+            float32x4x4_t yvec_top  = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
+            float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
+            float32x4x4_t yvec_bottom =
+                arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
+            float32x4x4_t yyvec_bottom =
+                arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
+            float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+            float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
 #else  /* defined(__arch64__) */
-        const auto ta_y_top    = vld2q_u8(y_top_ptr);
-        const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
-        const auto ta_u        = vld1q_u8(u_ptr);
-        const auto ta_v        = vld1q_u8(v_ptr);
-        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
-        //ta_u.val[0] = U0 U2 U4 U6 ...
-        //ta_v.val[0] = V0 V2 V4 V6 ...
+            const auto ta_y_top    = vld2q_u8(y_top_ptr);
+            const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
+            const auto ta_u        = vld1q_u8(u_ptr);
+            const auto ta_v        = vld1q_u8(v_ptr);
+            //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+            //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+            //ta_u.val[0] = U0 U2 U4 U6 ...
+            //ta_v.val[0] = V0 V2 V4 V6 ...
 
-        // Convert the uint8x16x4_t to float32x4x4_t
-        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
-        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
-        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
-        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
-        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
-        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
+            // Convert the uint8x16x4_t to float32x4x4_t
+            float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
+            float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
+            float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
+            float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
+            float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
+            float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
 #endif /* defined(__arch64__) */
 
-        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0],
+                                    out.ptr() + 0 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1],
+                                    out.ptr() + 1 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2],
+                                    out.ptr() + 2 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3],
+                                    out.ptr() + 3 * element_size, alpha);
 
-        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
-        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
-    },
-    in_y, in_u, in_v, out);
+            yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0],
+                                    out.ptr() + out_stride + 0 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1],
+                                    out.ptr() + out_stride + 1 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2],
+                                    out.ptr() + out_stride + 2 * element_size, alpha);
+            yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3],
+                                    out.ptr() + out_stride + 3 * element_size, alpha);
+        },
+        in_y, in_u, in_v, out);
 }
 
 /** Convert YUYV to NV12.
@@ -621,31 +678,33 @@
     Iterator out_y(output_ptr->plane(0), win);
     Iterator out_uv(output_ptr->plane(1), win_uv);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_top    = vld4q_u8(in.ptr());
-        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
-        //ta.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta.val[1] = U0 U2 U4 U6 ...
-        //ta.val[2] = Y1 Y3 Y5 Y7 ...
-        //ta.val[3] = V0 V2 V4 V7 ...
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto ta_top    = vld4q_u8(in.ptr());
+            const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+            //ta.val[0] = Y0 Y2 Y4 Y6 ...
+            //ta.val[1] = U0 U2 U4 U6 ...
+            //ta.val[2] = Y1 Y3 Y5 Y7 ...
+            //ta.val[3] = V0 V2 V4 V6 ...
 
-        uint8x16x2_t yvec;
-        yvec.val[0] = ta_top.val[0 + shift];
-        yvec.val[1] = ta_top.val[2 + shift];
-        vst2q_u8(out_y.ptr(), yvec);
+            uint8x16x2_t yvec;
+            yvec.val[0] = ta_top.val[0 + shift];
+            yvec.val[1] = ta_top.val[2 + shift];
+            vst2q_u8(out_y.ptr(), yvec);
 
-        uint8x16x2_t yyvec;
-        yyvec.val[0] = ta_bottom.val[0 + shift];
-        yyvec.val[1] = ta_bottom.val[2 + shift];
-        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+            uint8x16x2_t yyvec;
+            yyvec.val[0] = ta_bottom.val[0 + shift];
+            yyvec.val[1] = ta_bottom.val[2 + shift];
+            vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
 
-        uint8x16x2_t uvvec;
-        uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
-        uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
-        vst2q_u8(out_uv.ptr(), uvvec);
-    },
-    in, out_y, out_uv);
+            uint8x16x2_t uvvec;
+            uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+            uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+            vst2q_u8(out_uv.ptr(), uvvec);
+        },
+        in, out_y, out_uv);
 }
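
The vhaddq_u8 calls perform the 4:2:2 to 4:2:0 chroma reduction: each output U/V byte is the halving add (truncating average) of the vertically adjacent top- and bottom-row samples, computed without intermediate overflow. Scalar equivalent:

    // vhaddq_u8(a, b) computes (a + b) >> 1 per lane
    inline uint8_t downsample_chroma(uint8_t top, uint8_t bottom)
    {
        return static_cast<uint8_t>((static_cast<uint16_t>(top) + bottom) >> 1);
    }
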
 
 /** Convert IYUV to NV12.
@@ -676,23 +735,25 @@
     Iterator out_y(output_ptr->plane(0), win);
     Iterator out_uv(output_ptr->plane(1), win_uv);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto   ta_y_top    = vld2q_u8(in_y.ptr());
-        const auto   ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
-        uint8x16x2_t ta_uv;
-        ta_uv.val[0] = vld1q_u8(in_u.ptr());
-        ta_uv.val[1] = vld1q_u8(in_v.ptr());
-        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
-        //ta_uv.val[0] = U0 U2 U4 U6 ...
-        //ta_uv.val[1] = V0 V2 V4 V6 ...
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto   ta_y_top    = vld2q_u8(in_y.ptr());
+            const auto   ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+            uint8x16x2_t ta_uv;
+            ta_uv.val[0] = vld1q_u8(in_u.ptr());
+            ta_uv.val[1] = vld1q_u8(in_v.ptr());
+            //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+            //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+            //ta_uv.val[0] = U0 U2 U4 U6 ...
+            //ta_uv.val[1] = V0 V2 V4 V6 ...
 
-        vst2q_u8(out_y.ptr(), ta_y_top);
-        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-        vst2q_u8(out_uv.ptr(), ta_uv);
-    },
-    in_y, in_u, in_v, out_y, out_uv);
+            vst2q_u8(out_y.ptr(), ta_y_top);
+            vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+            vst2q_u8(out_uv.ptr(), ta_uv);
+        },
+        in_y, in_u, in_v, out_y, out_uv);
 }
 
 /** Convert NV12 to IYUV.
@@ -726,22 +787,24 @@
     Iterator out_u(output_ptr->plane(1), win_uv);
     Iterator out_v(output_ptr->plane(2), win_uv);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_y_top    = vld2q_u8(in_y.ptr());
-        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
-        const auto ta_uv       = vld2q_u8(in_uv.ptr());
-        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
-        //ta_uv.val[0] = U0 U2 U4 U6 ...
-        //ta_uv.val[1] = V0 V2 V4 V6 ...
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto ta_y_top    = vld2q_u8(in_y.ptr());
+            const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+            const auto ta_uv       = vld2q_u8(in_uv.ptr());
+            //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+            //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+            //ta_uv.val[0] = U0 U2 U4 U6 ...
+            //ta_uv.val[1] = V0 V2 V4 V6 ...
 
-        vst2q_u8(out_y.ptr(), ta_y_top);
-        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
-        vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
-        vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
-    },
-    in_y, in_uv, out_y, out_u, out_v);
+            vst2q_u8(out_y.ptr(), ta_y_top);
+            vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+            vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]);
+            vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]);
+        },
+        in_y, in_uv, out_y, out_u, out_v);
 }
 
 /** Convert YUYV to IYUV.
@@ -774,34 +837,36 @@
     Iterator out_u(output_ptr->plane(1), win_uv);
     Iterator out_v(output_ptr->plane(2), win_uv);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_top    = vld4q_u8(in.ptr());
-        const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
-        //ta.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta.val[1] = U0 U2 U4 U6 ...
-        //ta.val[2] = Y1 Y3 Y5 Y7 ...
-        //ta.val[3] = V0 V2 V4 V7 ...
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto ta_top    = vld4q_u8(in.ptr());
+            const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y());
+            //ta.val[0] = Y0 Y2 Y4 Y6 ...
+            //ta.val[1] = U0 U2 U4 U6 ...
+            //ta.val[2] = Y1 Y3 Y5 Y7 ...
+            //ta.val[3] = V0 V2 V4 V6 ...
 
-        uint8x16x2_t yvec;
-        yvec.val[0] = ta_top.val[0 + shift];
-        yvec.val[1] = ta_top.val[2 + shift];
-        vst2q_u8(out_y.ptr(), yvec);
+            uint8x16x2_t yvec;
+            yvec.val[0] = ta_top.val[0 + shift];
+            yvec.val[1] = ta_top.val[2 + shift];
+            vst2q_u8(out_y.ptr(), yvec);
 
-        uint8x16x2_t yyvec;
-        yyvec.val[0] = ta_bottom.val[0 + shift];
-        yyvec.val[1] = ta_bottom.val[2 + shift];
-        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
+            uint8x16x2_t yyvec;
+            yyvec.val[0] = ta_bottom.val[0 + shift];
+            yyvec.val[1] = ta_bottom.val[2 + shift];
+            vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec);
 
-        uint8x16_t uvec;
-        uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
-        vst1q_u8(out_u.ptr(), uvec);
+            uint8x16_t uvec;
+            uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]);
+            vst1q_u8(out_u.ptr(), uvec);
 
-        uint8x16_t vvec;
-        vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
-        vst1q_u8(out_v.ptr(), vvec);
-    },
-    in, out_y, out_u, out_v);
+            uint8x16_t vvec;
+            vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]);
+            vst1q_u8(out_v.ptr(), vvec);
+        },
+        in, out_y, out_u, out_v);
 }
 
 /** Convert NV12 to YUV4.
@@ -835,32 +900,34 @@
     Iterator out_u(output_ptr->plane(1), win);
     Iterator out_v(output_ptr->plane(2), win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_y_top    = vld2q_u8(in_y.ptr());
-        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
-        const auto ta_uv       = vld2q_u8(in_uv.ptr());
-        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
-        //ta_uv.val[0] = U0 U2 U4 U6 ...
-        //ta_uv.val[1] = V0 V2 V4 V6 ...
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto ta_y_top    = vld2q_u8(in_y.ptr());
+            const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+            const auto ta_uv       = vld2q_u8(in_uv.ptr());
+            //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+            //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+            //ta_uv.val[0] = U0 U2 U4 U6 ...
+            //ta_uv.val[1] = V0 V2 V4 V6 ...
 
-        vst2q_u8(out_y.ptr(), ta_y_top);
-        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+            vst2q_u8(out_y.ptr(), ta_y_top);
+            vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
 
-        uint8x16x2_t uvec;
-        uvec.val[0] = ta_uv.val[0 + shift];
-        uvec.val[1] = ta_uv.val[0 + shift];
-        vst2q_u8(out_u.ptr(), uvec);
-        vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+            uint8x16x2_t uvec;
+            uvec.val[0] = ta_uv.val[0 + shift];
+            uvec.val[1] = ta_uv.val[0 + shift];
+            vst2q_u8(out_u.ptr(), uvec);
+            vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
 
-        uint8x16x2_t vvec;
-        vvec.val[0] = ta_uv.val[1 - shift];
-        vvec.val[1] = ta_uv.val[1 - shift];
-        vst2q_u8(out_v.ptr(), vvec);
-        vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
-    },
-    in_y, in_uv, out_y, out_u, out_v);
+            uint8x16x2_t vvec;
+            vvec.val[0] = ta_uv.val[1 - shift];
+            vvec.val[1] = ta_uv.val[1 - shift];
+            vst2q_u8(out_v.ptr(), vvec);
+            vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+        },
+        in_y, in_uv, out_y, out_u, out_v);
 }
 
 /** Convert IYUV to YUV4.
@@ -892,33 +959,35 @@
     Iterator out_u(output_ptr->plane(1), win);
     Iterator out_v(output_ptr->plane(2), win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_y_top    = vld2q_u8(in_y.ptr());
-        const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
-        const auto ta_u        = vld1q_u8(in_u.ptr());
-        const auto ta_v        = vld1q_u8(in_v.ptr());
-        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
-        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
-        //ta_u = U0 U2 U4 U6 ...
-        //ta_v = V0 V2 V4 V6 ...
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto ta_y_top    = vld2q_u8(in_y.ptr());
+            const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y());
+            const auto ta_u        = vld1q_u8(in_u.ptr());
+            const auto ta_v        = vld1q_u8(in_v.ptr());
+            //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
+            //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
+            //ta_u = U0 U2 U4 U6 ...
+            //ta_v = V0 V2 V4 V6 ...
 
-        vst2q_u8(out_y.ptr(), ta_y_top);
-        vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
+            vst2q_u8(out_y.ptr(), ta_y_top);
+            vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom);
 
-        uint8x16x2_t uvec;
-        uvec.val[0] = ta_u;
-        uvec.val[1] = ta_u;
-        vst2q_u8(out_u.ptr(), uvec);
-        vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
+            uint8x16x2_t uvec;
+            uvec.val[0] = ta_u;
+            uvec.val[1] = ta_u;
+            vst2q_u8(out_u.ptr(), uvec);
+            vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec);
 
-        uint8x16x2_t vvec;
-        vvec.val[0] = ta_v;
-        vvec.val[1] = ta_v;
-        vst2q_u8(out_v.ptr(), vvec);
-        vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
-    },
-    in_y, in_u, in_v, out_y, out_u, out_v);
+            uint8x16x2_t vvec;
+            vvec.val[0] = ta_v;
+            vvec.val[1] = ta_v;
+            vst2q_u8(out_v.ptr(), vvec);
+            vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec);
+        },
+        in_y, in_u, in_v, out_y, out_u, out_v);
 }
 
 /** Convert RGB to NV12.
@@ -948,20 +1017,21 @@
     Iterator out_y(output_ptr->plane(0), win);
     Iterator out_uv(output_ptr->plane(1), win_uv);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
-        const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
-        //ta_rgb.val[0] = R0 R1 R2 R3 ...
-        //ta_rgb.val[1] = G0 G1 G2 G3 ...
-        //ta_rgb.val[2] = B0 B1 B2 B3 ...
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
+            const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+            //ta_rgb.val[0] = R0 R1 R2 R3 ...
+            //ta_rgb.val[1] = G0 G1 G2 G3 ...
+            //ta_rgb.val[2] = B0 B1 B2 B3 ...
 
-        store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
-                          ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
-                          out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
-                          out_uv.ptr());
-    },
-    in, out_y, out_uv);
+            store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0],
+                              ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(),
+                              out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_uv.ptr());
+        },
+        in, out_y, out_uv);
 }
 
 /** Convert RGB to IYUV.
@@ -992,20 +1062,22 @@
     Iterator out_u(output_ptr->plane(1), win_uv);
     Iterator out_v(output_ptr->plane(2), win_uv);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
-        const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
-        //ta_rgb.val[0] = R0 R1 R2 R3 ...
-        //ta_rgb.val[1] = G0 G1 G2 G3 ...
-        //ta_rgb.val[2] = B0 B1 B2 B3 ...
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto ta_rgb_top    = load_rgb(in.ptr(), alpha);
+            const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha);
+            //ta_rgb.val[0] = R0 R1 R2 R3 ...
+            //ta_rgb.val[1] = G0 G1 G2 G3 ...
+            //ta_rgb.val[2] = B0 B1 B2 B3 ...
 
-        store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2],
-                          ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2],
-                          out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(),
-                          out_u.ptr(), out_v.ptr());
-    },
-    in, out_y, out_u, out_v);
+            store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], ta_rgb_bottom.val[0],
+                              ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], out_y.ptr(),
+                              out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), out_u.ptr(),
+                              out_v.ptr());
+        },
+        in, out_y, out_u, out_v);
 }
 
 /** Convert RGB to YUV4.
@@ -1030,16 +1102,17 @@
     Iterator out_u(output_ptr->plane(1), win);
     Iterator out_v(output_ptr->plane(2), win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto ta_rgb = load_rgb(in.ptr(), alpha);
-        //ta_rgb.val[0] = R0 R1 R2 R3 ...
-        //ta_rgb.val[1] = G0 G1 G2 G3 ...
-        //ta_rgb.val[2] = B0 B1 B2 B3 ...
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto ta_rgb = load_rgb(in.ptr(), alpha);
+            //ta_rgb.val[0] = R0 R1 R2 R3 ...
+            //ta_rgb.val[1] = G0 G1 G2 G3 ...
+            //ta_rgb.val[2] = B0 B1 B2 B3 ...
 
-        store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2],
-                          out_y.ptr(), out_u.ptr(), out_v.ptr());
-    },
-    in, out_y, out_u, out_v);
+            store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], out_y.ptr(), out_u.ptr(), out_v.ptr());
+        },
+        in, out_y, out_u, out_v);
 }
 } // namespace arm_compute
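
Note on the hunks above: the dominant change is the new layout for lambdas passed as call arguments. The window, the lambda, and the trailing iterator arguments each start on their own line, indented one level under the call, with the lambda's opening brace wrapped onto its own line. A minimal sketch of the pattern, using a hypothetical for_each_element helper rather than the library's execute_window_loop (attributing this to the BeforeLambdaBody brace-wrapping option is an assumption, since the revised configuration is not part of this delivery):

// Hypothetical iteration helper standing in for execute_window_loop;
// illustration only, not part of the library.
template <typename F>
void for_each_element(int count, F &&fn)
{
    for (int i = 0; i < count; ++i)
    {
        fn(i);
    }
}

int main()
{
    // Post-format layout: each argument starts on its own line and the
    // lambda body brace is wrapped onto its own line.
    for_each_element(
        16,
        [](int i)
        {
            (void)i; // per-element work would go here
        });
    return 0;
}
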
diff --git a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
index 96defbc..4b1eb07 100644
--- a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
+++ b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h
@@ -33,56 +33,32 @@
 {
 inline float32x4x3_t load_matrix_row(const float *ptr)
 {
-    const float32x4x3_t r =
-    {
-        {
-            vld1q_dup_f32(ptr),
-            vld1q_dup_f32(1 + ptr),
-            vld1q_dup_f32(2 + ptr)
-        }
-    };
+    const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}};
     return r;
 }
 
 template <unsigned int stridex>
-float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2);
+float32x4x2_t convolve_3x3(const float         *in_top,
+                           const float         *in_mid,
+                           const float         *in_low,
+                           const float32x4x3_t &m0,
+                           const float32x4x3_t &m1,
+                           const float32x4x3_t &m2);
 
 template <>
-inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<1>(const float         *in_top,
+                                     const float         *in_mid,
+                                     const float         *in_low,
+                                     const float32x4x3_t &m0,
+                                     const float32x4x3_t &m1,
+                                     const float32x4x3_t &m2)
 {
-    const float32x4x3_t vtop =
-    {
-        {
-            vld1q_f32(in_top),
-            vld1q_f32(in_top + 4),
-            vld1q_f32(in_top + 8)
-        }
-    };
-    const float32x4x3_t vmid =
-    {
-        {
-            vld1q_f32(in_mid),
-            vld1q_f32(in_mid + 4),
-            vld1q_f32(in_mid + 8)
-        }
-    };
-    const float32x4x3_t vlow =
-    {
-        {
-            vld1q_f32(in_low),
-            vld1q_f32(in_low + 4),
-            vld1q_f32(in_low + 8)
-        }
-    };
-    float32x4x2_t out =
-    {
-        {
-            vmulq_f32(vtop.val[0], m0.val[0]),
-            vmulq_f32(vtop.val[1], m0.val[0])
-        }
-    };
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
-    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
+    const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}};
+    const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}};
+    const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}};
+    float32x4x2_t       out  = {{vmulq_f32(vtop.val[0], m0.val[0]), vmulq_f32(vtop.val[1], m0.val[0])}};
+    out.val[0]               = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
+    out.val[0]               = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
 
     out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
     out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
@@ -106,7 +82,12 @@
 }
 
 template <>
-inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<2>(const float         *in_top,
+                                     const float         *in_mid,
+                                     const float         *in_low,
+                                     const float32x4x3_t &m0,
+                                     const float32x4x3_t &m1,
+                                     const float32x4x3_t &m2)
 {
     float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
@@ -116,7 +97,12 @@
 }
 
 template <>
-inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
+inline float32x4x2_t convolve_3x3<3>(const float         *in_top,
+                                     const float         *in_mid,
+                                     const float         *in_low,
+                                     const float32x4x3_t &m0,
+                                     const float32x4x3_t &m1,
+                                     const float32x4x3_t &m2)
 {
     float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
     out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
@@ -165,6 +151,6 @@
 {
     return num_elems_written_per_iteration * 3;
 }
-}
+} // namespace detail
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */
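
Note on the header above: signatures that exceed the column limit are rewrapped with one parameter per line, and the parameter names are padded into a common column alongside their pointer or reference qualifiers. A compilable sketch of the same layout on a standalone declaration (convolve_3x3_sketch is our own name; the exact alignment option belongs to the undisclosed configuration):

#include <arm_neon.h>

// One parameter per line, names padded into a column to match the
// pointer/reference alignment used across the reformatted headers.
float32x4x2_t convolve_3x3_sketch(const float         *in_top,
                                  const float         *in_mid,
                                  const float         *in_low,
                                  const float32x4x3_t &m0,
                                  const float32x4x3_t &m1,
                                  const float32x4x3_t &m2);
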
diff --git a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
index 7ba52a1..fd1ee54 100644
--- a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
+++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
@@ -45,14 +45,7 @@
 inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0)
 {
     ARM_COMPUTE_UNUSED(weights_offset);
-    const float32x4x3_t r =
-    {
-        {
-            vld1q_dup_f32(ptr),
-            vld1q_dup_f32(1 + ptr),
-            vld1q_dup_f32(2 + ptr)
-        }
-    };
+    const float32x4x3_t r = {{vld1q_dup_f32(ptr), vld1q_dup_f32(1 + ptr), vld1q_dup_f32(2 + ptr)}};
     return r;
 }
 
@@ -63,21 +56,16 @@
  *
  * @return The loaded matrix.
  */
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
 inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0)
 {
     const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
 
     /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
        r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
-    int32x4x3_t r =
-    {
-        {
-            vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
-            vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
-            vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))
-        }
-    };
+    int32x4x3_t r = {{vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
+                      vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
+                      vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))}};
     return r;
 }
 
@@ -245,36 +233,23 @@
  * @param[in] input_offset (Optional) Input quantization offset.
  *
  */
-inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
-                                                const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
-                                                const size_t dilation_x, int input_offset)
+inline float32x4_t single_convolve_3x3_dilation(const float         *in_top,
+                                                const float         *in_mid,
+                                                const float         *in_low,
+                                                const float32x4x3_t &m0,
+                                                const float32x4x3_t &m1,
+                                                const float32x4x3_t &m2,
+                                                const size_t         dilation_x,
+                                                int                  input_offset)
 {
     ARM_COMPUTE_UNUSED(input_offset);
 
-    const float32x4x3_t vtop =
-    {
-        {
-            vld1q_f32(in_top),
-            vld1q_f32(in_top + dilation_x),
-            vld1q_f32(in_top + 2 * dilation_x)
-        }
-    };
-    const float32x4x3_t vmid =
-    {
-        {
-            vld1q_f32(in_mid),
-            vld1q_f32(in_mid + dilation_x),
-            vld1q_f32(in_mid + 2 * dilation_x)
-        }
-    };
-    const float32x4x3_t vlow =
-    {
-        {
-            vld1q_f32(in_low),
-            vld1q_f32(in_low + dilation_x),
-            vld1q_f32(in_low + 2 * dilation_x)
-        }
-    };
+    const float32x4x3_t vtop = {
+        {vld1q_f32(in_top), vld1q_f32(in_top + dilation_x), vld1q_f32(in_top + 2 * dilation_x)}};
+    const float32x4x3_t vmid = {
+        {vld1q_f32(in_mid), vld1q_f32(in_mid + dilation_x), vld1q_f32(in_mid + 2 * dilation_x)}};
+    const float32x4x3_t vlow = {
+        {vld1q_f32(in_low), vld1q_f32(in_low + dilation_x), vld1q_f32(in_low + 2 * dilation_x)}};
     float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
     out             = vmlaq_f32(out, vtop.val[1], m0.val[1]);
     out             = vmlaq_f32(out, vtop.val[2], m0.val[2]);
@@ -303,26 +278,28 @@
  * @param[in] input_offset (Optional) Input quantization offset.
  *
  */
-inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
-                                           const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
-                                           const size_t dilation_x, unsigned int stridex, int input_offset = 0)
+inline float32x4x2_t convolve_3x3_dilation(const float         *in_top,
+                                           const float         *in_mid,
+                                           const float         *in_low,
+                                           const float32x4x3_t &m0,
+                                           const float32x4x3_t &m1,
+                                           const float32x4x3_t &m2,
+                                           const size_t         dilation_x,
+                                           unsigned int         stridex,
+                                           int                  input_offset = 0)
 {
     ARM_COMPUTE_ERROR_ON(stridex > 3);
-    float32x4x2_t out =
-    {
-        {
-            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
-            single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
-        }
-    };
+    float32x4x2_t out = {
+        {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+         single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}};
 
-    if(stridex == 2)
+    if (stridex == 2)
     {
         out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
         out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
         out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
     }
-    else if(stridex == 3)
+    else if (stridex == 3)
     {
         out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
     }
@@ -344,26 +321,32 @@
  *
  */
 template <bool accumulate>
-void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
-                  const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
-                  unsigned int stridex, int input_offset = 0);
+void convolve_3x3(const float         *in_top,
+                  const float         *in_mid,
+                  const float         *in_low,
+                  float               *out_ptr,
+                  const float32x4x3_t &m0,
+                  const float32x4x3_t &m1,
+                  const float32x4x3_t &m2,
+                  unsigned int         stridex,
+                  int                  input_offset = 0);
 
 template <bool accumulate>
-inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr,
-                         const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
-                         unsigned int stridex, int input_offset)
+inline void convolve_3x3(const float         *in_top,
+                         const float         *in_mid,
+                         const float         *in_low,
+                         float               *out_ptr,
+                         const float32x4x3_t &m0,
+                         const float32x4x3_t &m1,
+                         const float32x4x3_t &m2,
+                         unsigned int         stridex,
+                         int                  input_offset)
 {
     ARM_COMPUTE_UNUSED(input_offset);
     ARM_COMPUTE_ERROR_ON(stridex > 3);
 
-    float32x4x2_t out =
-    {
-        {
-            vdupq_n_f32(0.f),
-            vdupq_n_f32(0.f)
-        }
-    };
-    if(stridex == 2)
+    float32x4x2_t out = {{vdupq_n_f32(0.f), vdupq_n_f32(0.f)}};
+    if (stridex == 2)
     {
         const float32x4x2_t vtop     = vld2q_f32(in_top);
         const float32x4x2_t vmid     = vld2q_f32(in_mid);
@@ -389,32 +372,11 @@
     }
     else
     {
-        const float32x4x3_t vtop =
-        {
-            {
-                vld1q_f32(in_top),
-                vld1q_f32(in_top + 4),
-                vld1q_f32(in_top + 8)
-            }
-        };
-        const float32x4x3_t vmid =
-        {
-            {
-                vld1q_f32(in_mid),
-                vld1q_f32(in_mid + 4),
-                vld1q_f32(in_mid + 8)
-            }
-        };
-        const float32x4x3_t vlow =
-        {
-            {
-                vld1q_f32(in_low),
-                vld1q_f32(in_low + 4),
-                vld1q_f32(in_low + 8)
-            }
-        };
-        out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
-        out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
+        const float32x4x3_t vtop = {{vld1q_f32(in_top), vld1q_f32(in_top + 4), vld1q_f32(in_top + 8)}};
+        const float32x4x3_t vmid = {{vld1q_f32(in_mid), vld1q_f32(in_mid + 4), vld1q_f32(in_mid + 8)}};
+        const float32x4x3_t vlow = {{vld1q_f32(in_low), vld1q_f32(in_low + 4), vld1q_f32(in_low + 8)}};
+        out.val[0]               = vmulq_f32(vtop.val[0], m0.val[0]);
+        out.val[1]               = vmulq_f32(vtop.val[1], m0.val[0]);
 
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
         out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
@@ -438,7 +400,7 @@
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
         out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
 
-        if(stridex == 3)
+        if (stridex == 3)
         {
             out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
             accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
@@ -462,65 +424,43 @@
  * @param[in] input_offset Input quantization offset.
  *
  */
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low,
-                                              const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
-                                              size_t dilation_x, int32_t input_offset)
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
+inline int32x4_t single_convolve_3x3_dilation(const T           *in_top,
+                                              const T           *in_mid,
+                                              const T           *in_low,
+                                              const int32x4x3_t &m0,
+                                              const int32x4x3_t &m1,
+                                              const int32x4x3_t &m2,
+                                              size_t             dilation_x,
+                                              int32_t            input_offset)
 {
     using VectorType    = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type;
     using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
 
     const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
 
-    const VectorType vtop =
-    {
-        {
-            wrapper::vload(in_top),
-            wrapper::vload(in_top + dilation_x),
-            wrapper::vload(in_top + 2 * dilation_x)
-        }
-    };
-    const VectorType vmid =
-    {
-        {
-            wrapper::vload(in_mid),
-            wrapper::vload(in_mid + dilation_x),
-            wrapper::vload(in_mid + 2 * dilation_x)
-        }
-    };
-    const VectorType vlow =
-    {
-        {
-            wrapper::vload(in_low),
-            wrapper::vload(in_low + dilation_x),
-            wrapper::vload(in_low + 2 * dilation_x)
-        }
-    };
+    const VectorType vtop = {
+        {wrapper::vload(in_top), wrapper::vload(in_top + dilation_x), wrapper::vload(in_top + 2 * dilation_x)}};
+    const VectorType vmid = {
+        {wrapper::vload(in_mid), wrapper::vload(in_mid + dilation_x), wrapper::vload(in_mid + 2 * dilation_x)}};
+    const VectorType vlow = {
+        {wrapper::vload(in_low), wrapper::vload(in_low + dilation_x), wrapper::vload(in_low + 2 * dilation_x)}};
 
-    const int32x4x3_t vtop_s32 =
-    {
-        {
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
-        }
-    };
-    const int32x4x3_t vmid_s32 =
-    {
-        {
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
-        }
-    };
-    const int32x4x3_t vlow_s32 =
-    {
-        {
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
-        }
-    };
+    const int32x4x3_t vtop_s32 = {{
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
+    }};
+    const int32x4x3_t vmid_s32 = {{
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
+    }};
+    const int32x4x3_t vlow_s32 = {{
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
+    }};
 
     int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]);
     out           = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]);
@@ -550,26 +490,29 @@
  * @param[in] input_offset Input quantization offset.
  *
  */
-template < typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) >
-inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
-                                         const size_t dilation_x, unsigned int stridex, int input_offset)
+template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value)>
+inline int32x4x2_t convolve_3x3_dilation(const T           *in_top,
+                                         const T           *in_mid,
+                                         const T           *in_low,
+                                         const int32x4x3_t &m0,
+                                         const int32x4x3_t &m1,
+                                         const int32x4x3_t &m2,
+                                         const size_t       dilation_x,
+                                         unsigned int       stridex,
+                                         int                input_offset)
 {
     ARM_COMPUTE_ERROR_ON(stridex > 3);
-    int32x4x2_t out =
-    {
-        {
-            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
-            single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
-        }
-    };
+    int32x4x2_t out = {
+        {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+         single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)}};
 
-    if(stridex == 2)
+    if (stridex == 2)
     {
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
     }
-    else if(stridex == 3)
+    else if (stridex == 3)
     {
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
     }
@@ -589,10 +532,19 @@
  * @param[in]  input_offset Input quantization offset.
  *
  */
-template < bool accumulate, typename T1, typename T2, ARM_COMPUTE_REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value) >
-void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr,
-                  const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2,
-                  unsigned int stridex, int32_t input_offset)
+template <bool accumulate,
+          typename T1,
+          typename T2,
+          ARM_COMPUTE_REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value)>
+void convolve_3x3(const T1          *in_top,
+                  const T1          *in_mid,
+                  const T1          *in_low,
+                  T2                *out_ptr,
+                  const int32x4x3_t &m0,
+                  const int32x4x3_t &m1,
+                  const int32x4x3_t &m2,
+                  unsigned int       stridex,
+                  int32_t            input_offset)
 {
     ARM_COMPUTE_ERROR_ON(stridex > 3);
     using VectorType    = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
@@ -600,60 +552,30 @@
 
     const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
 
-    const VectorType vtop =
-    {
-        {
-            wrapper::vload(in_top),
-            wrapper::vload(in_top + 8)
-        }
-    };
-    const VectorType vmid =
-    {
-        {
-            wrapper::vload(in_mid),
-            wrapper::vload(in_mid + 8)
-        }
-    };
-    const VectorType vlow =
-    {
-        {
-            wrapper::vload(in_low),
-            wrapper::vload(in_low + 8)
-        }
-    };
+    const VectorType vtop = {{wrapper::vload(in_top), wrapper::vload(in_top + 8)}};
+    const VectorType vmid = {{wrapper::vload(in_mid), wrapper::vload(in_mid + 8)}};
+    const VectorType vlow = {{wrapper::vload(in_low), wrapper::vload(in_low + 8)}};
 
-    const int32x4x3_t vtop_s32 =
-    {
-        {
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
-        }
-    };
-    const int32x4x3_t vmid_s32 =
-    {
-        {
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
-        }
-    };
-    const int32x4x3_t vlow_s32 =
-    {
-        {
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
-            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
-        }
-    };
+    const int32x4x3_t vtop_s32 = {{
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
+    }};
+    const int32x4x3_t vmid_s32 = {{
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
+    }};
+    const int32x4x3_t vlow_s32 = {{
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
+        wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
+    }};
 
-    int32x4x2_t out
-    {
-        {
-            wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
-            wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
-        }
-    };
+    int32x4x2_t out{{
+        wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
+        wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
+    }};
 
     // 0
     out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);
@@ -681,11 +603,11 @@
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);
     out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);
 
-    if(stridex == 1)
+    if (stridex == 1)
     {
         accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
     }
-    else if(stridex == 2)
+    else if (stridex == 2)
     {
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
@@ -693,7 +615,7 @@
 
         accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
     }
-    else if(stridex == 3)
+    else if (stridex == 3)
     {
         out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
         accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
@@ -712,14 +634,7 @@
     ARM_COMPUTE_UNUSED(weights_offset);
     /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
        r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
-    const float16x8x3_t r =
-    {
-        {
-            vld1q_dup_f16(ptr),
-            vld1q_dup_f16(1 + ptr),
-            vld1q_dup_f16(2 + ptr)
-        }
-    };
+    const float16x8x3_t r = {{vld1q_dup_f16(ptr), vld1q_dup_f16(1 + ptr), vld1q_dup_f16(2 + ptr)}};
     return r;
 }
 
@@ -735,35 +650,22 @@
  * @param[in] input_offset (Optional) Input quantization offset.
  *
  */
-inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
-                                                const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                                                const size_t dilation_x, int input_offset = 0)
+inline float16x8_t single_convolve_3x3_dilation(const float16_t     *in_top,
+                                                const float16_t     *in_mid,
+                                                const float16_t     *in_low,
+                                                const float16x8x3_t &m0,
+                                                const float16x8x3_t &m1,
+                                                const float16x8x3_t &m2,
+                                                const size_t         dilation_x,
+                                                int                  input_offset = 0)
 {
     ARM_COMPUTE_UNUSED(input_offset);
-    const float16x8x3_t vtop =
-    {
-        {
-            vld1q_f16(in_top),
-            vld1q_f16(in_top + dilation_x),
-            vld1q_f16(in_top + 2 * dilation_x)
-        }
-    };
-    const float16x8x3_t vmid =
-    {
-        {
-            vld1q_f16(in_mid),
-            vld1q_f16(in_mid + dilation_x),
-            vld1q_f16(in_mid + 2 * dilation_x)
-        }
-    };
-    const float16x8x3_t vlow =
-    {
-        {
-            vld1q_f16(in_low),
-            vld1q_f16(in_low + dilation_x),
-            vld1q_f16(in_low + 2 * dilation_x)
-        }
-    };
+    const float16x8x3_t vtop = {
+        {vld1q_f16(in_top), vld1q_f16(in_top + dilation_x), vld1q_f16(in_top + 2 * dilation_x)}};
+    const float16x8x3_t vmid = {
+        {vld1q_f16(in_mid), vld1q_f16(in_mid + dilation_x), vld1q_f16(in_mid + 2 * dilation_x)}};
+    const float16x8x3_t vlow = {
+        {vld1q_f16(in_low), vld1q_f16(in_low + dilation_x), vld1q_f16(in_low + 2 * dilation_x)}};
     float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]);
     out             = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1]));
     out             = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2]));
@@ -792,19 +694,21 @@
  * @param[in] input_offset (Optional) Input quantization offset.
  *
  */
-inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low,
-                                           const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                                           const size_t dilation_x, unsigned int stridex, int input_offset = 0)
+inline float16x8x2_t convolve_3x3_dilation(const float16_t     *in_top,
+                                           const float16_t     *in_mid,
+                                           const float16_t     *in_low,
+                                           const float16x8x3_t &m0,
+                                           const float16x8x3_t &m1,
+                                           const float16x8x3_t &m2,
+                                           const size_t         dilation_x,
+                                           unsigned int         stridex,
+                                           int                  input_offset = 0)
 {
-    float16x8x2_t out =
-    {
-        {
-            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
-            single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)
-        }
-    };
+    float16x8x2_t out = {
+        {single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
+         single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset)}};
 
-    if(stridex == 2)
+    if (stridex == 2)
     {
         out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
         out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2);
@@ -814,7 +718,7 @@
         out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6);
         out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7);
     }
-    else if(stridex == 3)
+    else if (stridex == 3)
     {
         out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
         out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
@@ -838,20 +742,20 @@
  *
  */
 template <bool accumulate>
-inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, float16_t *out_ptr,
-                         const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2,
-                         unsigned int stridex, int input_offset = 0)
+inline void convolve_3x3(const float16_t     *in_top,
+                         const float16_t     *in_mid,
+                         const float16_t     *in_low,
+                         float16_t           *out_ptr,
+                         const float16x8x3_t &m0,
+                         const float16x8x3_t &m1,
+                         const float16x8x3_t &m2,
+                         unsigned int         stridex,
+                         int                  input_offset = 0)
 {
     ARM_COMPUTE_UNUSED(input_offset);
 
-    float16x8x2_t out =
-    {
-        {
-            vdupq_n_f16(0),
-            vdupq_n_f16(0)
-        }
-    };
-    if(stridex == 2)
+    float16x8x2_t out = {{vdupq_n_f16(0), vdupq_n_f16(0)}};
+    if (stridex == 2)
     {
         const float16x8x2_t vtop     = vld2q_f16(in_top);
         const float16x8x2_t vmid     = vld2q_f16(in_mid);
@@ -877,32 +781,11 @@
     }
     else
     {
-        const float16x8x3_t vtop =
-        {
-            {
-                vld1q_f16(in_top),
-                vld1q_f16(in_top + 8),
-                vld1q_f16(in_top + 16)
-            }
-        };
-        const float16x8x3_t vmid =
-        {
-            {
-                vld1q_f16(in_mid),
-                vld1q_f16(in_mid + 8),
-                vld1q_f16(in_mid + 16)
-            }
-        };
-        const float16x8x3_t vlow =
-        {
-            {
-                vld1q_f16(in_low),
-                vld1q_f16(in_low + 8),
-                vld1q_f16(in_low + 16)
-            }
-        };
-        out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]);
-        out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]);
+        const float16x8x3_t vtop = {{vld1q_f16(in_top), vld1q_f16(in_top + 8), vld1q_f16(in_top + 16)}};
+        const float16x8x3_t vmid = {{vld1q_f16(in_mid), vld1q_f16(in_mid + 8), vld1q_f16(in_mid + 16)}};
+        const float16x8x3_t vlow = {{vld1q_f16(in_low), vld1q_f16(in_low + 8), vld1q_f16(in_low + 16)}};
+        out.val[0]               = vmulq_f16(vtop.val[0], m0.val[0]);
+        out.val[1]               = vmulq_f16(vtop.val[1], m0.val[0]);
 
         out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1]));
         out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2]));
@@ -921,7 +804,7 @@
         out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1]));
         out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 2), m2.val[2]));
 
-        if(stridex == 3)
+        if (stridex == 3)
         {
             out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
             out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
@@ -946,7 +829,7 @@
  */
 inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex)
 {
-    switch(stridex)
+    switch (stridex)
     {
         case 1:
             return num_elems_written_per_iteration;
@@ -959,6 +842,6 @@
             return 0;
     }
 }
-}
+} // namespace detail
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */
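
Two rules recur throughout the header above: NEON multi-vector initializers collapse onto a single line whenever they fit under the column limit, and control keywords now take a space before the parenthesis. A small self-contained sketch combining both (pick_half is a made-up function for illustration; builds on a NEON-enabled target):

#include <arm_neon.h>

// Collapsed {{...}} initializer plus `if (...)` spacing, as applied
// throughout the reformatted convolution helpers.
inline float32x4_t pick_half(const float *ptr, unsigned int stridex)
{
    const float32x4x2_t v = {{vld1q_f32(ptr), vld1q_f32(ptr + 4)}};
    if (stridex == 2)
    {
        return v.val[1];
    }
    return v.val[0];
}
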
diff --git a/src/core/NEON/wrapper/intrinsics/cvt.h b/src/core/NEON/wrapper/intrinsics/cvt.h
index 1c77a9e..381de22 100644
--- a/src/core/NEON/wrapper/intrinsics/cvt.h
+++ b/src/core/NEON/wrapper/intrinsics/cvt.h
@@ -30,12 +30,11 @@
 {
 namespace wrapper
 {
-#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2)                   \
-    template <typename T>                                                            \
-    inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type \
-    vcvt(const vtype &a)                                                             \
-    {                                                                                \
-        return prefix##_##postfix1##_##postfix2(a);                                  \
+#define VCVT_TO_F32_IMPL(ptype, vtype, prefix, postfix1, postfix2)                                        \
+    template <typename T>                                                                                 \
+    inline typename std::enable_if<std::is_same<T, float>::value, float32x4_t>::type vcvt(const vtype &a) \
+    {                                                                                                     \
+        return prefix##_##postfix1##_##postfix2(a);                                                       \
     }
 
 VCVT_TO_F32_IMPL(float32x4_t, uint32x4_t, vcvtq, f32, u32)
@@ -46,12 +45,11 @@
 #undef VCVT_TO_F32_IMPL
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2)                       \
-    template <typename T>                                                                \
-    inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type \
-    vcvt(const vtype &a)                                                                 \
-    {                                                                                    \
-        return prefix##_##postfix1##_##postfix2(a);                                      \
+#define VCVT_TO_F16_IMPL(ptype, vtype, prefix, postfix1, postfix2)                                            \
+    template <typename T>                                                                                     \
+    inline typename std::enable_if<std::is_same<T, float16_t>::value, float16x4_t>::type vcvt(const vtype &a) \
+    {                                                                                                         \
+        return prefix##_##postfix1##_##postfix2(a);                                                           \
     }
 
 VCVT_TO_F16_IMPL(float16x4_t, float32x4_t, vcvt, f16, f32)
@@ -59,14 +57,14 @@
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 template <typename T>
-inline typename std::enable_if < std::is_same<T, uint8_t>::value || std::is_same<T, uint32_t>::value, uint32x4_t >::type
+inline typename std::enable_if<std::is_same<T, uint8_t>::value || std::is_same<T, uint32_t>::value, uint32x4_t>::type
 vcvt(const float32x4_t &a)
 {
     return vcvtq_u32_f32(a);
 }
 
 template <typename T>
-inline typename std::enable_if < std::is_same<T, int8_t>::value || std::is_same<T, int32_t>::value, int32x4_t >::type
+inline typename std::enable_if<std::is_same<T, int8_t>::value || std::is_same<T, int32_t>::value, int32x4_t>::type
 vcvt(const float32x4_t &a)
 {
     return vcvtq_s32_f32(a);
@@ -74,15 +72,13 @@
 
 #ifdef __aarch64__
 template <typename T>
-inline typename std::enable_if<std::is_same<T, uint32_t>::value, uint32x4_t>::type
-vcvta(const float32x4_t &a)
+inline typename std::enable_if<std::is_same<T, uint32_t>::value, uint32x4_t>::type vcvta(const float32x4_t &a)
 {
     return vcvtaq_u32_f32(a);
 }
 
 template <typename T>
-inline typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type
-vcvta(const float32x4_t &a)
+inline typename std::enable_if<std::is_same<T, int32_t>::value, int32x4_t>::type vcvta(const float32x4_t &a)
 {
     return vcvtaq_s32_f32(a);
 }
@@ -96,14 +92,13 @@
  */
 inline void vcvt_bf16_f32(const float *inptr, uint16_t *outptr)
 {
-    __asm __volatile(
-        "ldp    q0, q1, [%[inptr]]\n"
-        ".inst  0xea16800\n"  // BFCVTN v0, v0
-        ".inst  0x4ea16820\n" // BFCVTN2 v0, v1
-        "str    q0, [%[outptr]]\n"
-        : [inptr] "+r"(inptr)
-        : [outptr] "r"(outptr)
-        : "v0", "v1", "memory");
+    __asm __volatile("ldp    q0, q1, [%[inptr]]\n"
+                     ".inst  0xea16800\n"  // BFCVTN v0, v0
+                     ".inst  0x4ea16820\n" // BFCVTN2 v0, v1
+                     "str    q0, [%[outptr]]\n"
+                     : [inptr] "+r"(inptr)
+                     : [outptr] "r"(outptr)
+                     : "v0", "v1", "memory");
 }
 #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
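
In cvt.h the macro bodies are rewrapped so that the continuation backslashes pad out to a common column, and the multi-line inline-assembly statement is re-indented so its string literals align under the first one. A minimal sketch of the backslash alignment (DEFINE_SQUARE is invented for illustration):

// After reflowing the body, every continuation backslash is padded
// to the same column.
#define DEFINE_SQUARE(type, name)   \
    inline type name(const type &a) \
    {                               \
        return a * a;               \
    }

DEFINE_SQUARE(int, square_i32)
DEFINE_SQUARE(float, square_f32)
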
 
diff --git a/src/core/NEON/wrapper/intrinsics/div.h b/src/core/NEON/wrapper/intrinsics/div.h
index 265f30d..ece991a 100644
--- a/src/core/NEON/wrapper/intrinsics/div.h
+++ b/src/core/NEON/wrapper/intrinsics/div.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_WRAPPER_DIV_H
 
 #include "src/core/NEON/NEMath.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
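
The same one-line change repeats across the wrapper math headers here and below (div.h, erf.h, exp.h, inv.h, invsqrt.h, log.h, pow.h, round.h): a blank line is inserted after the project header so that quoted project includes and angle-bracket system includes sit in separate groups, presumably via include regrouping in the revised configuration (not part of this delivery). The include block in each of these headers now reads:

#include "src/core/NEON/NEMath.h"

#include <arm_neon.h>
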
diff --git a/src/core/NEON/wrapper/intrinsics/erf.h b/src/core/NEON/wrapper/intrinsics/erf.h
index e220764..0e34462 100644
--- a/src/core/NEON/wrapper/intrinsics/erf.h
+++ b/src/core/NEON/wrapper/intrinsics/erf.h
@@ -26,6 +26,7 @@
 #define ARM_COMPUTE_WRAPPER_ERF_H
 
 #include "src/core/NEON/NEMath.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/exp.h b/src/core/NEON/wrapper/intrinsics/exp.h
index c2a6970..f44577b 100644
--- a/src/core/NEON/wrapper/intrinsics/exp.h
+++ b/src/core/NEON/wrapper/intrinsics/exp.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_WRAPPER_EXP_H
 
 #include "src/core/NEON/NEMath.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/getlane.h b/src/core/NEON/wrapper/intrinsics/getlane.h
index 2052751..ae813bb 100644
--- a/src/core/NEON/wrapper/intrinsics/getlane.h
+++ b/src/core/NEON/wrapper/intrinsics/getlane.h
@@ -33,7 +33,7 @@
 #define VGETLANE_IMPL_8(stype, vtype, postfix)                         \
     inline stype vgetlane(const vtype vector, const unsigned int lane) \
     {                                                                  \
-        switch(lane)                                                   \
+        switch (lane)                                                  \
         {                                                              \
             case 0:                                                    \
                 return vget_lane_##postfix(vector, 0);                 \
@@ -59,7 +59,7 @@
 #define VGETLANE_IMPL_4(stype, vtype, postfix)                         \
     inline stype vgetlane(const vtype vector, const unsigned int lane) \
     {                                                                  \
-        switch(lane)                                                   \
+        switch (lane)                                                  \
         {                                                              \
             case 0:                                                    \
                 return vget_lane_##postfix(vector, 0);                 \
@@ -77,7 +77,7 @@
 #define VGETLANE_IMPL_2(stype, vtype, postfix)                         \
     inline stype vgetlane(const vtype vector, const unsigned int lane) \
     {                                                                  \
-        switch(lane)                                                   \
+        switch (lane)                                                  \
         {                                                              \
             case 0:                                                    \
                 return vget_lane_##postfix(vector, 0);                 \
@@ -102,7 +102,7 @@
 #define VGETQLANE_IMPL_16(stype, vtype, postfix)                       \
     inline stype vgetlane(const vtype vector, const unsigned int lane) \
     {                                                                  \
-        switch(lane)                                                   \
+        switch (lane)                                                  \
         {                                                              \
             case 0:                                                    \
                 return vgetq_lane_##postfix(vector, 0);                \
@@ -144,7 +144,7 @@
 #define VGETQLANE_IMPL_8(stype, vtype, postfix)                        \
     inline stype vgetlane(const vtype vector, const unsigned int lane) \
     {                                                                  \
-        switch(lane)                                                   \
+        switch (lane)                                                  \
         {                                                              \
             case 0:                                                    \
                 return vgetq_lane_##postfix(vector, 0);                \
@@ -170,7 +170,7 @@
 #define VGETQLANE_IMPL_4(stype, vtype, postfix)                        \
     inline stype vgetlane(const vtype vector, const unsigned int lane) \
     {                                                                  \
-        switch(lane)                                                   \
+        switch (lane)                                                  \
         {                                                              \
             case 0:                                                    \
                 return vgetq_lane_##postfix(vector, 0);                \
@@ -188,7 +188,7 @@
 #define VGETQLANE_IMPL_2(stype, vtype, postfix)                        \
     inline stype vgetlane(const vtype vector, const unsigned int lane) \
     {                                                                  \
-        switch(lane)                                                   \
+        switch (lane)                                                  \
         {                                                              \
             case 0:                                                    \
                 return vgetq_lane_##postfix(vector, 0);                \
diff --git a/src/core/NEON/wrapper/intrinsics/inv.h b/src/core/NEON/wrapper/intrinsics/inv.h
index de398b0..e443be6 100644
--- a/src/core/NEON/wrapper/intrinsics/inv.h
+++ b/src/core/NEON/wrapper/intrinsics/inv.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_WRAPPER_INV_H
 
 #include "src/core/NEON/NEMath.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/invsqrt.h b/src/core/NEON/wrapper/intrinsics/invsqrt.h
index 2343efa..257b445 100644
--- a/src/core/NEON/wrapper/intrinsics/invsqrt.h
+++ b/src/core/NEON/wrapper/intrinsics/invsqrt.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_WRAPPER_INVSQRT_H
 
 #include "src/core/NEON/NEMath.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/log.h b/src/core/NEON/wrapper/intrinsics/log.h
index 357a77c..d091407 100644
--- a/src/core/NEON/wrapper/intrinsics/log.h
+++ b/src/core/NEON/wrapper/intrinsics/log.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_WRAPPER_LOG_H
 
 #include "src/core/NEON/NEMath.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/pow.h b/src/core/NEON/wrapper/intrinsics/pow.h
index 61f834e..dfd6ccc 100644
--- a/src/core/NEON/wrapper/intrinsics/pow.h
+++ b/src/core/NEON/wrapper/intrinsics/pow.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_WRAPPER_POW_H
 
 #include "src/core/NEON/NEMath.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/qmov.h b/src/core/NEON/wrapper/intrinsics/qmov.h
index 167f3cf..9a0a23a 100644
--- a/src/core/NEON/wrapper/intrinsics/qmov.h
+++ b/src/core/NEON/wrapper/intrinsics/qmov.h
@@ -31,15 +31,13 @@
 namespace wrapper
 {
 template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type
-vqmov(const int16x8_t &a)
+inline typename std::enable_if<std::is_same<T, uint8_t>::value, uint8x8_t>::type vqmov(const int16x8_t &a)
 {
     return vqmovun_s16(a);
 }
 
 template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type
-vqmov(const int16x8_t &a)
+inline typename std::enable_if<std::is_same<T, int8_t>::value, int8x8_t>::type vqmov(const int16x8_t &a)
 {
     return vqmovn_s16(a);
 }
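
In qmov.h the break between the enable_if return type and the function name disappears: under the revised style a declaration stays on one line whenever it fits the column limit. An illustrative, compilable equivalent using scalar types only:

    #include <cstdint>
    #include <type_traits>

    // Fits the column limit, so the return type and the name share a line.
    template <typename T>
    inline typename std::enable_if<std::is_same<T, std::uint8_t>::value, int>::type lane_width(const T &)
    {
        return 8;
    }

    int main()
    {
        return lane_width<std::uint8_t>(0) == 8 ? 0 : 1;
    }
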
diff --git a/src/core/NEON/wrapper/intrinsics/reinterpret.h b/src/core/NEON/wrapper/intrinsics/reinterpret.h
index cf00a4a..c2c4f72 100644
--- a/src/core/NEON/wrapper/intrinsics/reinterpret.h
+++ b/src/core/NEON/wrapper/intrinsics/reinterpret.h
@@ -35,7 +35,7 @@
     {                                                               \
         return prefix##_##postfix1##_##postfix2(a);                 \
     }                                                               \
-    \
+                                                                    \
     inline ptype vreinterpret(const ptype &a)                       \
     {                                                               \
         return a;                                                   \
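
The reinterpret.h hunk (and the later shr.h and svwhilelt.h ones) only move the continuation backslash of an otherwise empty macro line out to the shared column: escaped newlines are column-aligned across the macro body (AlignEscapedNewlines, presumably Left; assumed, since the config is not included). A small self-contained macro in that style:

    #include <cstdio>

    #define DEFINE_TWICE(type)        \
        inline type twice(type a)     \
        {                             \
            return type(a + a);       \
        }

    DEFINE_TWICE(int)

    int main()
    {
        std::printf("%d\n", twice(21)); // prints 42
        return 0;
    }
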
diff --git a/src/core/NEON/wrapper/intrinsics/round.h b/src/core/NEON/wrapper/intrinsics/round.h
index d23feb6..7789aab 100644
--- a/src/core/NEON/wrapper/intrinsics/round.h
+++ b/src/core/NEON/wrapper/intrinsics/round.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_WRAPPER_ROUND_H
 
 #include "src/core/NEON/NEMath.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
diff --git a/src/core/NEON/wrapper/intrinsics/setlane.h b/src/core/NEON/wrapper/intrinsics/setlane.h
index 197eeda..259b8ea 100644
--- a/src/core/NEON/wrapper/intrinsics/setlane.h
+++ b/src/core/NEON/wrapper/intrinsics/setlane.h
@@ -33,7 +33,7 @@
 #define VSETLANE_IMPL_8(stype, atype, vtype, postfix)                                     \
     inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
     {                                                                                     \
-        switch(lane)                                                                      \
+        switch (lane)                                                                     \
         {                                                                                 \
             case 0:                                                                       \
                 return vset_lane_##postfix(value, vector, 0);                             \
@@ -59,7 +59,7 @@
 #define VSETLANE_IMPL_4(stype, atype, vtype, postfix)                                     \
     inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
     {                                                                                     \
-        switch(lane)                                                                      \
+        switch (lane)                                                                     \
         {                                                                                 \
             case 0:                                                                       \
                 return vset_lane_##postfix(value, vector, 0);                             \
@@ -77,7 +77,7 @@
 #define VSETLANE_IMPL_2(stype, atype, vtype, postfix)                                     \
     inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
     {                                                                                     \
-        switch(lane)                                                                      \
+        switch (lane)                                                                     \
         {                                                                                 \
             case 0:                                                                       \
                 return vset_lane_##postfix(value, vector, 0);                             \
@@ -102,7 +102,7 @@
 #define VSETQLANE_IMPL_16(stype, atype, vtype, postfix)                                   \
     inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
     {                                                                                     \
-        switch(lane)                                                                      \
+        switch (lane)                                                                     \
         {                                                                                 \
             case 0:                                                                       \
                 return vsetq_lane_##postfix(value, vector, 0);                            \
@@ -144,7 +144,7 @@
 #define VSETQLANE_IMPL_8(stype, atype, vtype, postfix)                                    \
     inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
     {                                                                                     \
-        switch(lane)                                                                      \
+        switch (lane)                                                                     \
         {                                                                                 \
             case 0:                                                                       \
                 return vsetq_lane_##postfix(value, vector, 0);                            \
@@ -170,7 +170,7 @@
 #define VSETQLANE_IMPL_4(stype, atype, vtype, postfix)                                    \
     inline stype vsetlane(const atype value, const vtype vector, const unsigned int lane) \
     {                                                                                     \
-        switch(lane)                                                                      \
+        switch (lane)                                                                     \
         {                                                                                 \
             case 0:                                                                       \
                 return vsetq_lane_##postfix(value, vector, 0);                            \
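
Beyond the spacing fix, the vgetlane/vsetlane macros above exist because the underlying NEON lane intrinsics take the lane as an immediate: a runtime index has to be fanned out through a switch over constant lanes. A scalar sketch of the same dispatch, with std::get playing the role of the constant-lane intrinsic (the real macros error out on an out-of-range lane):

    #include <array>
    #include <cstdio>

    static int get_lane(const std::array<int, 4> &v, unsigned int lane)
    {
        switch (lane)
        {
            case 0:
                return std::get<0>(v); // index must be a compile-time constant
            case 1:
                return std::get<1>(v);
            case 2:
                return std::get<2>(v);
            case 3:
                return std::get<3>(v);
            default:
                return 0; // the wrapper raises an error here instead
        }
    }

    int main()
    {
        std::printf("%d\n", get_lane({{10, 20, 30, 40}}, 2)); // prints 30
        return 0;
    }
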
diff --git a/src/core/NEON/wrapper/intrinsics/shr.h b/src/core/NEON/wrapper/intrinsics/shr.h
index 73ca9c5..6ccb9cd 100644
--- a/src/core/NEON/wrapper/intrinsics/shr.h
+++ b/src/core/NEON/wrapper/intrinsics/shr.h
@@ -75,7 +75,7 @@
     {                                                                                                            \
         return prefix_signed##_##postfix(a, b);                                                                  \
     }                                                                                                            \
-    \
+                                                                                                                 \
     template <int b, typename T>                                                                                 \
     inline typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, u##half_vtype>::type \
     vqrshrn_ex(const vtype &a)                                                                                   \
@@ -128,7 +128,7 @@
     {                                                                                                            \
         return prefix_signed##_##postfix(a, b);                                                                  \
     }                                                                                                            \
-    \
+                                                                                                                 \
     template <int b, typename T>                                                                                 \
     inline typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, u##half_vtype>::type \
     vqrshrn_ex(const vtype &a)                                                                                   \
diff --git a/src/core/NEON/wrapper/intrinsics/sin.h b/src/core/NEON/wrapper/intrinsics/sin.h
index 03c2813..d24fdfa 100644
--- a/src/core/NEON/wrapper/intrinsics/sin.h
+++ b/src/core/NEON/wrapper/intrinsics/sin.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_WRAPPER_SIN_H
 
 #include "src/core/NEON/NEMath.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
@@ -54,4 +55,4 @@
 #undef vsub_IMPL
 } // namespace wrapper
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_WRAPPER_SUB_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_WRAPPER_SUB_H */
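
Several of these headers (sin.h here, and the SVE wrappers below) previously lacked a final newline; clang-format's output always ends with one, so each of those files picks up a one-line fix and the "\ No newline at end of file" marker disappears. A throwaway checker for the property, not part of the library:

    #include <fstream>
    #include <iostream>

    int main(int argc, char **argv)
    {
        if (argc < 2)
        {
            std::cerr << "usage: newline_check <file>\n";
            return 2;
        }
        std::ifstream f(argv[1], std::ios::binary);
        f.seekg(-1, std::ios::end); // read the last byte of the file
        char last = '\0';
        f.get(last);
        std::cout << (last == '\n' ? "ok\n" : "missing final newline\n");
        return last == '\n' ? 0 : 1;
    }
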
diff --git a/src/core/NEON/wrapper/intrinsics/svcnt.h b/src/core/NEON/wrapper/intrinsics/svcnt.h
index e530e7c..c465250 100644
--- a/src/core/NEON/wrapper/intrinsics/svcnt.h
+++ b/src/core/NEON/wrapper/intrinsics/svcnt.h
@@ -30,7 +30,7 @@
 namespace wrapper
 {
 template <size_t element_size>
-inline uint64_t  svcnt_size();
+inline uint64_t svcnt_size();
 
 template <>
 inline uint64_t svcnt_size<64>()
@@ -65,4 +65,4 @@
 } // namespace arm_compute
 
 #endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */
\ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCNT_H */
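
The svcnt.h change collapses an accidental double space in the svcnt_size forward declaration (svptrue.h gets the identical fix below): stray runs of spaces survive only where they are alignment padding inside a declaration group. A compilable sketch of the forward-declare-then-specialise shape with normalised spacing:

    #include <cstddef>
    #include <cstdint>

    template <std::size_t element_size>
    inline std::uint64_t lane_count(); // exactly one space after the return type

    template <>
    inline std::uint64_t lane_count<64>()
    {
        return 2; // e.g. two 64-bit lanes in a 128-bit register
    }

    int main()
    {
        return lane_count<64>() == 2 ? 0 : 1;
    }
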
diff --git a/src/core/NEON/wrapper/intrinsics/svcvt.h b/src/core/NEON/wrapper/intrinsics/svcvt.h
index 746b004..00ef7b7 100644
--- a/src/core/NEON/wrapper/intrinsics/svcvt.h
+++ b/src/core/NEON/wrapper/intrinsics/svcvt.h
@@ -29,11 +29,12 @@
 {
 namespace wrapper
 {
-#define SVCVT_Z_TO_F32_IMPL(vtype)                                                                                        \
-    template <typename T>                                                                                                 \
-    inline typename std::enable_if<std::is_same<T, float>::value, svfloat32_t>::type svcvt_z(svbool_t pg, const vtype &a) \
-    {                                                                                                                     \
-        return svcvt_f32_z(pg, a);                                                                                        \
+#define SVCVT_Z_TO_F32_IMPL(vtype)                                                                            \
+    template <typename T>                                                                                     \
+    inline typename std::enable_if<std::is_same<T, float>::value, svfloat32_t>::type svcvt_z(svbool_t     pg, \
+                                                                                             const vtype &a)  \
+    {                                                                                                         \
+        return svcvt_f32_z(pg, a);                                                                            \
     }
 
 SVCVT_Z_TO_F32_IMPL(svuint32_t)
@@ -42,11 +43,12 @@
 
 #undef SVCVT_Z_TO_F32_IMPL
 
-#define SVCVT_Z_TO_F16_IMPL(vtype)                                                                                            \
-    template <typename T>                                                                                                     \
-    inline typename std::enable_if<std::is_same<T, float16_t>::value, svfloat16_t>::type svcvt_z(svbool_t pg, const vtype &a) \
-    {                                                                                                                         \
-        return svcvt_f16_z(pg, a);                                                                                            \
+#define SVCVT_Z_TO_F16_IMPL(vtype)                                                                                \
+    template <typename T>                                                                                         \
+    inline typename std::enable_if<std::is_same<T, float16_t>::value, svfloat16_t>::type svcvt_z(svbool_t     pg, \
+                                                                                                 const vtype &a)  \
+    {                                                                                                             \
+        return svcvt_f16_z(pg, a);                                                                                \
     }
 
 SVCVT_Z_TO_F16_IMPL(svuint32_t)
@@ -55,11 +57,12 @@
 
 #undef SVCVT_Z_TO_F16_IMPL
 
-#define SVCVT_Z_TO_S32_IMPL(vtype)                                                                                        \
-    template <typename T>                                                                                                 \
-    inline typename std::enable_if<std::is_same<T, int32_t>::value, svint32_t>::type svcvt_z(svbool_t pg, const vtype &a) \
-    {                                                                                                                     \
-        return svcvt_s32_z(pg, a);                                                                                        \
+#define SVCVT_Z_TO_S32_IMPL(vtype)                                                                            \
+    template <typename T>                                                                                     \
+    inline typename std::enable_if<std::is_same<T, int32_t>::value, svint32_t>::type svcvt_z(svbool_t     pg, \
+                                                                                             const vtype &a)  \
+    {                                                                                                         \
+        return svcvt_s32_z(pg, a);                                                                            \
     }
 
 SVCVT_Z_TO_S32_IMPL(svfloat16_t)
@@ -71,4 +74,4 @@
 } // namespace arm_compute
 
 #endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */
\ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVCVT_H */
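
The svcvt.h macros are re-wrapped rather than just re-spaced: the enable_if signature blows past the column limit, so the parameters break one per line with their names column-aligned, which is where the padded "svbool_t     pg" comes from (AlignConsecutiveDeclarations is an assumption; the shipped config is not included). The same effect on plain types:

    #include <type_traits>

    template <typename T>
    inline typename std::enable_if<std::is_same<T, float>::value, float>::type select_z(bool         predicate,
                                                                                        const float &a)
    {
        return predicate ? a : 0.0f; // zeroing select, loosely mirroring svcvt_z
    }

    int main()
    {
        return select_z<float>(true, 1.5f) == 1.5f ? 0 : 1;
    }
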
diff --git a/src/core/NEON/wrapper/intrinsics/svexp.h b/src/core/NEON/wrapper/intrinsics/svexp.h
index d6ce9a7..1e8bce3 100644
--- a/src/core/NEON/wrapper/intrinsics/svexp.h
+++ b/src/core/NEON/wrapper/intrinsics/svexp.h
@@ -26,6 +26,7 @@
 
 #if defined(__ARM_FEATURE_SVE)
 #include "src/core/NEON/SVEMath.h"
+
 #include <arm_sve.h>
 
 namespace arm_compute
@@ -46,4 +47,4 @@
 } // namespace arm_compute
 
 #endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */
\ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVEXP_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svlog.h b/src/core/NEON/wrapper/intrinsics/svlog.h
index 5b505ae..b4630e2 100644
--- a/src/core/NEON/wrapper/intrinsics/svlog.h
+++ b/src/core/NEON/wrapper/intrinsics/svlog.h
@@ -25,6 +25,7 @@
 #define SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H
 #if defined(__ARM_FEATURE_SVE)
 #include "src/core/NEON/SVEMath.h"
+
 #include <arm_sve.h>
 
 namespace arm_compute
@@ -44,4 +45,4 @@
 } // namespace wrapper
 } // namespace arm_compute
 #endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */
\ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVLOG_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svptrue.h b/src/core/NEON/wrapper/intrinsics/svptrue.h
index 53407e5..6ed00bc 100644
--- a/src/core/NEON/wrapper/intrinsics/svptrue.h
+++ b/src/core/NEON/wrapper/intrinsics/svptrue.h
@@ -30,7 +30,7 @@
 namespace wrapper
 {
 template <size_t element_size>
-inline svbool_t  svptrue_size();
+inline svbool_t svptrue_size();
 
 template <>
 inline svbool_t svptrue_size<64>()
@@ -65,4 +65,4 @@
 } // namespace arm_compute
 
 #endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */
\ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVPTRUE_H */
diff --git a/src/core/NEON/wrapper/intrinsics/svwhilelt.h b/src/core/NEON/wrapper/intrinsics/svwhilelt.h
index ef58217..f0f84a9 100644
--- a/src/core/NEON/wrapper/intrinsics/svwhilelt.h
+++ b/src/core/NEON/wrapper/intrinsics/svwhilelt.h
@@ -32,7 +32,7 @@
 #define SVWHILELT_IMPL(type)                           \
     template <size_t element_size>                     \
     inline svbool_t svwhilelt_size(type a, type b);    \
-    \
+                                                       \
     template <>                                        \
     inline svbool_t svwhilelt_size<64>(type a, type b) \
     {                                                  \
@@ -70,4 +70,4 @@
 } // namespace arm_compute
 
 #endif /* defined(__ARM_FEATURE_SVE) */
-#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */
\ No newline at end of file
+#endif /* SRC_CORE_NEON_WRAPPER_INTRINSICS_SVWHILELT_H */
diff --git a/src/core/NEON/wrapper/intrinsics/tanh.h b/src/core/NEON/wrapper/intrinsics/tanh.h
index daeaf19..e74f0e8 100644
--- a/src/core/NEON/wrapper/intrinsics/tanh.h
+++ b/src/core/NEON/wrapper/intrinsics/tanh.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_WRAPPER_TANH_H
 
 #include "src/core/NEON/NEMath.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
diff --git a/src/core/NEON/wrapper/scalar/add.h b/src/core/NEON/wrapper/scalar/add.h
index 642d926..2ec8886 100644
--- a/src/core/NEON/wrapper/scalar/add.h
+++ b/src/core/NEON/wrapper/scalar/add.h
@@ -32,22 +32,22 @@
 {
 inline uint8_t add_sat(const uint8_t &a, const uint8_t &b)
 {
-    const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
-    const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
+    const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0};
+    const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0};
     return vget_lane_u8(vqadd_u8(va, vb), 0);
 }
 
 inline int16_t add_sat(const int16_t &a, const int16_t &b)
 {
-    const int16x4_t va = { a, 0, 0, 0 };
-    const int16x4_t vb = { b, 0, 0, 0 };
+    const int16x4_t va = {a, 0, 0, 0};
+    const int16x4_t vb = {b, 0, 0, 0};
     return vget_lane_s16(vqadd_s16(va, vb), 0);
 }
 
 inline int32_t add_sat(const int32_t &a, const int32_t &b)
 {
-    const int32x2_t va = { a, 0 };
-    const int32x2_t vb = { b, 0 };
+    const int32x2_t va = {a, 0};
+    const int32x2_t vb = {b, 0};
     return vget_lane_s32(vqadd_s32(va, vb), 0);
 }
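
In add.h and sub.h the only churn is braced-list spacing: initialiser lists lose their inner padding, {a, 0} instead of { a, 0 } (Cpp11BracedListStyle behaviour). The code itself is the usual trick of saturating a scalar operation by placing the operands in lane 0 of a vector and using the saturating vector instruction; a portable, NEON-free sketch of the same semantics via widening:

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    static std::int16_t add_sat_portable(std::int16_t a, std::int16_t b)
    {
        const std::int32_t sum = std::int32_t{a} + std::int32_t{b}; // widen: no overflow
        const std::int32_t lo  = std::numeric_limits<std::int16_t>::min();
        const std::int32_t hi  = std::numeric_limits<std::int16_t>::max();
        return static_cast<std::int16_t>(sum < lo ? lo : (sum > hi ? hi : sum));
    }

    int main()
    {
        std::printf("%d\n", add_sat_portable(32000, 1000)); // prints 32767 (clamped)
        return 0;
    }
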
 
diff --git a/src/core/NEON/wrapper/scalar/sub.h b/src/core/NEON/wrapper/scalar/sub.h
index 1fe51d7..00de7d8 100644
--- a/src/core/NEON/wrapper/scalar/sub.h
+++ b/src/core/NEON/wrapper/scalar/sub.h
@@ -32,22 +32,22 @@
 {
 inline uint8_t sub_sat(const uint8_t &a, const uint8_t &b)
 {
-    const uint8x8_t va = { a, 0, 0, 0, 0, 0, 0, 0 };
-    const uint8x8_t vb = { b, 0, 0, 0, 0, 0, 0, 0 };
+    const uint8x8_t va = {a, 0, 0, 0, 0, 0, 0, 0};
+    const uint8x8_t vb = {b, 0, 0, 0, 0, 0, 0, 0};
     return vget_lane_u8(vqsub_u8(va, vb), 0);
 }
 
 inline int16_t sub_sat(const int16_t &a, const int16_t &b)
 {
-    const int16x4_t va = { a, 0, 0, 0 };
-    const int16x4_t vb = { b, 0, 0, 0 };
+    const int16x4_t va = {a, 0, 0, 0};
+    const int16x4_t vb = {b, 0, 0, 0};
     return vget_lane_s16(vqsub_s16(va, vb), 0);
 }
 
 inline int32_t sub_sat(const int32_t &a, const int32_t &b)
 {
-    const int32x2_t va = { a, 0 };
-    const int32x2_t vb = { b, 0 };
+    const int32x2_t va = {a, 0};
+    const int32x2_t vb = {b, 0};
     return vget_lane_s32(vqsub_s32(va, vb), 0);
 }
 
diff --git a/src/core/NEON/wrapper/svtraits.h b/src/core/NEON/wrapper/svtraits.h
index 5ccd0ba..330d272 100644
--- a/src/core/NEON/wrapper/svtraits.h
+++ b/src/core/NEON/wrapper/svtraits.h
@@ -25,6 +25,7 @@
 #define SRC_CORE_NEON_WRAPPER_SVTRAITS_H
 #if defined(ARM_COMPUTE_ENABLE_SVE)
 #include "src/core/NEON/SVEMath.h"
+
 #include <arm_sve.h>
 
 namespace arm_compute
diff --git a/src/core/Rounding.cpp b/src/core/Rounding.cpp
index 99858e2..62ce335 100644
--- a/src/core/Rounding.cpp
+++ b/src/core/Rounding.cpp
@@ -25,6 +25,7 @@
 #include "arm_compute/core/Rounding.h"
 
 #include "arm_compute/core/Error.h"
+
 #include "support/ToolchainSupport.h"
 
 #include <cmath>
@@ -36,7 +37,7 @@
 {
     using namespace std;
     int rounded = 0;
-    switch(rounding_policy)
+    switch (rounding_policy)
     {
         case RoundingPolicy::TO_ZERO:
         {
@@ -51,9 +52,7 @@
         case RoundingPolicy::TO_NEAREST_EVEN:
         {
 #ifdef __aarch64__
-            asm("fcvtns %x[res], %s[value]"
-                : [res] "=r"(rounded)
-                : [value] "w"(x));
+            asm("fcvtns %x[res], %s[value]" : [res] "=r"(rounded) : [value] "w"(x));
 #else  // __aarch64__
             ARM_COMPUTE_ERROR("TO_NEAREST_EVEN rounding policy is not supported.");
 #endif // __aarch64__
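
Rounding.cpp shows the new treatment of short asm statements: the output/input operand sections are no longer broken before each ':' when the whole statement fits on one line. A hedged sketch of that fcvtns round-to-nearest-even pattern (AArch64 only; a plain fallback keeps the sketch compiling elsewhere, though it truncates rather than rounds):

    #include <cstdio>

    static int round_nearest_even(float x)
    {
        int rounded = 0;
    #ifdef __aarch64__
        asm("fcvtns %x[res], %s[value]" : [res] "=r"(rounded) : [value] "w"(x));
    #else
        rounded = static_cast<int>(x); // placeholder only: truncation, not rounding
    #endif
        return rounded;
    }

    int main()
    {
        std::printf("%d\n", round_nearest_even(2.5f)); // 2 on AArch64 (ties-to-even)
        return 0;
    }
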
diff --git a/src/core/Size2D.cpp b/src/core/Size2D.cpp
index 6eb46e5..69b2651 100644
--- a/src/core/Size2D.cpp
+++ b/src/core/Size2D.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Size2D.h"
+
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -30,4 +31,4 @@
 {
     return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height);
 }
-}
+} // namespace arm_compute
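
Size2D.cpp picks up a trailing namespace comment: a bare '}' closing a namespace is now annotated (FixNamespaceComments, presumably enabled), matching the "} // namespace ..." endings already present in the wrapper headers. Illustration on a hypothetical namespace:

    namespace example_ns // hypothetical
    {
    int answer()
    {
        return 42;
    }
    } // namespace example_ns

    int main()
    {
        return example_ns::answer() == 42 ? 0 : 1;
    }
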
diff --git a/src/core/Size3D.cpp b/src/core/Size3D.cpp
index 3ee9fb8..b56a99a 100644
--- a/src/core/Size3D.cpp
+++ b/src/core/Size3D.cpp
@@ -22,12 +22,14 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Size3D.h"
+
 #include "support/StringSupport.h"
 
 namespace arm_compute
 {
 std::string Size3D::to_string() const
 {
-    return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height) + std::string("x") + support::cpp11::to_string(depth);
+    return support::cpp11::to_string(width) + std::string("x") + support::cpp11::to_string(height) + std::string("x") +
+           support::cpp11::to_string(depth);
 }
-}
\ No newline at end of file
+} // namespace arm_compute
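
Size3D::to_string shows the plain column-limit behaviour on expressions: the concatenation chain breaks with the '+' kept at the end of the line and the continuation aligned under the first operand. The same wrapping on standard types:

    #include <iostream>
    #include <string>

    static std::string dims_to_string(unsigned int w, unsigned int h, unsigned int d)
    {
        return std::to_string(w) + std::string("x") + std::to_string(h) + std::string("x") +
               std::to_string(d);
    }

    int main()
    {
        std::cout << dims_to_string(2, 3, 4) << '\n'; // prints 2x3x4
        return 0;
    }
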
diff --git a/src/core/SubTensorInfo.cpp b/src/core/SubTensorInfo.cpp
index 723b6bc..8012c3d 100644
--- a/src/core/SubTensorInfo.cpp
+++ b/src/core/SubTensorInfo.cpp
@@ -42,10 +42,10 @@
 TensorShape extend_parent_shape(TensorShape parent_shape, TensorShape shape, Coordinates coords)
 {
     // Extend shape
-    for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+    for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
     {
         int dimension_extend = coords[i] + static_cast<int>(shape[i]);
-        if((dimension_extend > static_cast<int>(parent_shape[i])) && (dimension_extend > 0))
+        if ((dimension_extend > static_cast<int>(parent_shape[i])) && (dimension_extend > 0))
         {
             parent_shape.set(i, static_cast<size_t>(dimension_extend));
         }
@@ -56,23 +56,35 @@
 } // namespace
 
 SubTensorInfo::SubTensorInfo()
-    : _parent(nullptr), _tensor_shape(), _dims_state(), _coords(), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(false), _lock_paddings(false)
+    : _parent(nullptr),
+      _tensor_shape(),
+      _dims_state(),
+      _coords(),
+      _valid_region{Coordinates(), _tensor_shape},
+      _extend_parent(false),
+      _lock_paddings(false)
 {
 }
 
 SubTensorInfo::SubTensorInfo(ITensorInfo *parent, TensorShape tensor_shape, Coordinates coords, bool extend_parent)
-    : _parent(parent), _tensor_shape(tensor_shape), _dims_state(), _coords(coords), _valid_region{ Coordinates(), _tensor_shape }, _extend_parent(extend_parent), _lock_paddings(false)
+    : _parent(parent),
+      _tensor_shape(tensor_shape),
+      _dims_state(),
+      _coords(coords),
+      _valid_region{Coordinates(), _tensor_shape},
+      _extend_parent(extend_parent),
+      _lock_paddings(false)
 {
     ARM_COMPUTE_ERROR_ON(parent == nullptr);
 
     // Check if subtensor is valid if parent is configured
-    if(parent->tensor_shape().total_size() != 0 && !_extend_parent)
+    if (parent->tensor_shape().total_size() != 0 && !_extend_parent)
     {
         ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(parent->tensor_shape(), coords, tensor_shape);
     }
 
     // Initialize valid region
-    _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+    _valid_region = ValidRegion{Coordinates(), _tensor_shape};
 }
 
 std::unique_ptr<ITensorInfo> SubTensorInfo::clone() const
@@ -91,17 +103,17 @@
     ARM_COMPUTE_ERROR_ON(_parent == nullptr);
 
     // Check if subtensor is valid if parent is configured
-    if(_parent->tensor_shape().total_size() != 0 && !_extend_parent)
+    if (_parent->tensor_shape().total_size() != 0 && !_extend_parent)
     {
         ARM_COMPUTE_ERROR_ON_INVALID_SUBTENSOR(_parent->tensor_shape(), _coords, shape);
-        _valid_region = ValidRegion{ _coords, shape };
+        _valid_region = ValidRegion{_coords, shape};
     }
-    else if(_extend_parent) // Extend parent shape, configure if specified
+    else if (_extend_parent) // Extend parent shape, configure if specified
     {
         ARM_COMPUTE_ERROR_ON((_parent->data_type() == DataType::UNKNOWN) && (_parent->format() == Format::UNKNOWN));
         TensorShape parent_extended_shape = extend_parent_shape(_parent->tensor_shape(), shape, _coords);
         _parent->set_tensor_shape(parent_extended_shape);
-        _parent->set_valid_region(ValidRegion{ Coordinates(), parent_extended_shape });
+        _parent->set_valid_region(ValidRegion{Coordinates(), parent_extended_shape});
     }
     _tensor_shape = shape;
     return *this;
@@ -133,11 +145,11 @@
     ARM_COMPUTE_ERROR_ON(_parent->total_size() == 0);
 
     // Check that you do not extend padding on sub-tensors unless XY shape matches parent tensor
-    if(!_extend_parent && (padding.left || padding.right))
+    if (!_extend_parent && (padding.left || padding.right))
     {
         ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().x() != tensor_shape().x());
     }
-    if(!_extend_parent && (padding.top || padding.bottom))
+    if (!_extend_parent && (padding.top || padding.bottom))
     {
         ARM_COMPUTE_ERROR_ON(_parent->tensor_shape().y() != tensor_shape().y());
     }
@@ -153,7 +165,7 @@
     int32_t        offset  = offset_first_element_in_bytes();
     const Strides &strides = strides_in_bytes();
 
-    for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+    for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
     {
         offset += pos[i] * strides[i];
     }
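
The SubTensorInfo constructors above (and the TensorInfo ones below) demonstrate the initializer-list policy: once the list no longer fits on one line, every member initializer moves to its own line under the ':' (PackConstructorInitializers set to Never or equivalent is a plausible reading of the unpublished config). A compact sketch:

    #include <string>

    class Example
    {
    public:
        Example()
            : _width(0),
              _height(0),
              _name("unset"),
              _valid(false)
        {
        }

    private:
        int         _width;
        int         _height;
        std::string _name;
        bool        _valid;
    };

    int main()
    {
        Example e;
        (void)e; // silence unused-variable warnings
        return 0;
    }
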
diff --git a/src/core/TensorInfo.cpp b/src/core/TensorInfo.cpp
index 5905ba5..31bddbd 100644
--- a/src/core/TensorInfo.cpp
+++ b/src/core/TensorInfo.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/Utils.h"
 
 #include <memory>
@@ -34,13 +35,26 @@
 namespace arm_compute
 {
 TensorInfo::TensorInfo()
-    : _total_size(0), _offset_first_element_in_bytes(0), _strides_in_bytes(), _num_channels(0), _tensor_shape(), _dims_state(), _data_type(DataType::UNKNOWN), _format(Format::UNKNOWN), _is_resizable{ true },
-      _valid_region{ Coordinates(), _tensor_shape }, _padding{ 0 }, _quantization_info(), _data_layout(DataLayout::NCHW), _are_values_constant(true), _id(invalid_tensor_id), _lock_paddings(false)
+    : _total_size(0),
+      _offset_first_element_in_bytes(0),
+      _strides_in_bytes(),
+      _num_channels(0),
+      _tensor_shape(),
+      _dims_state(),
+      _data_type(DataType::UNKNOWN),
+      _format(Format::UNKNOWN),
+      _is_resizable{true},
+      _valid_region{Coordinates(), _tensor_shape},
+      _padding{0},
+      _quantization_info(),
+      _data_layout(DataLayout::NCHW),
+      _are_values_constant(true),
+      _id(invalid_tensor_id),
+      _lock_paddings(false)
 {
 }
 
-TensorInfo::TensorInfo(const ITensorInfo &info)
-    : TensorInfo()
+TensorInfo::TensorInfo(const ITensorInfo &info) : TensorInfo()
 {
     _total_size                    = info.total_size();
     _offset_first_element_in_bytes = info.offset_first_element_in_bytes();
@@ -60,8 +74,7 @@
     _lock_paddings                 = info.lock_paddings();
 }
 
-TensorInfo::TensorInfo(const TensorInfo &info)
-    : TensorInfo()
+TensorInfo::TensorInfo(const TensorInfo &info) : TensorInfo()
 {
     _total_size                    = info.total_size();
     _offset_first_element_in_bytes = info.offset_first_element_in_bytes();
@@ -80,8 +93,7 @@
     _id                            = info.id();
     _lock_paddings                 = false;
 }
-TensorInfo::TensorInfo(Format format)
-    : TensorInfo(TensorShape(), format)
+TensorInfo::TensorInfo(Format format) : TensorInfo(TensorShape(), format)
 {
 }
 
@@ -90,25 +102,25 @@
 {
 }
 
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format)
-    : TensorInfo()
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, Format format) : TensorInfo()
 {
     init(tensor_shape, format);
 }
 
-TensorInfo::TensorInfo(size_t num_channels, DataType data_type)
-    : TensorInfo()
+TensorInfo::TensorInfo(size_t num_channels, DataType data_type) : TensorInfo()
 {
     init(TensorShape(), num_channels, data_type);
 }
 
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type)
-    : TensorInfo()
+TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type) : TensorInfo()
 {
     init(tensor_shape, num_channels, data_type);
 }
 
-TensorInfo::TensorInfo(const TensorShape &tensor_shape, size_t num_channels, DataType data_type, QuantizationInfo quantization_info)
+TensorInfo::TensorInfo(const TensorShape &tensor_shape,
+                       size_t             num_channels,
+                       DataType           data_type,
+                       QuantizationInfo   quantization_info)
     : TensorInfo()
 {
     init(tensor_shape, num_channels, data_type);
@@ -137,9 +149,11 @@
     _format = format;
 }
 
-void TensorInfo::init(const TensorShape &tensor_shape, Format format,
-                      const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
-                      size_t total_size_in_bytes)
+void TensorInfo::init(const TensorShape &tensor_shape,
+                      Format             format,
+                      const Strides     &strides_in_bytes,
+                      size_t             offset_first_element_in_bytes,
+                      size_t             total_size_in_bytes)
 {
     size_t         num_channels = num_channels_from_format(format);
     const DataType type         = data_type_from_format(format);
@@ -165,9 +179,12 @@
     set_tensor_shape(tensor_shape);
 }
 
-void TensorInfo::init(const TensorShape &tensor_shape, size_t num_channels, DataType data_type,
-                      const Strides &strides_in_bytes, size_t offset_first_element_in_bytes,
-                      size_t total_size_in_bytes)
+void TensorInfo::init(const TensorShape &tensor_shape,
+                      size_t             num_channels,
+                      DataType           data_type,
+                      const Strides     &strides_in_bytes,
+                      size_t             offset_first_element_in_bytes,
+                      size_t             total_size_in_bytes)
 {
     ARM_COMPUTE_ERROR_ON(num_channels == 0);
 
@@ -179,7 +196,7 @@
     _strides_in_bytes              = strides_in_bytes;
     _total_size                    = total_size_in_bytes;
 
-    _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+    _valid_region = ValidRegion{Coordinates(), _tensor_shape};
 }
 
 size_t TensorInfo::init_auto_padding(const TensorShape &tensor_shape, Format format)
@@ -202,7 +219,7 @@
     _format       = Format::UNKNOWN;
     _tensor_shape = tensor_shape;
 
-    _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+    _valid_region = ValidRegion{Coordinates(), _tensor_shape};
 
     auto_padding();
 
@@ -233,11 +250,11 @@
     size_t       required_total_size           = 0;
     const size_t required_offset_first_element = padding.left * stride_x + padding.top * stride_y;
 
-    switch(_tensor_shape.num_dimensions())
+    switch (_tensor_shape.num_dimensions())
     {
         case 0:
         {
-            if(_tensor_shape.total_size() > 0)
+            if (_tensor_shape.total_size() > 0)
             {
                 required_strides    = Strides(stride_x, stride_x);
                 required_total_size = stride_z;
@@ -258,7 +275,8 @@
 
             const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
 
-            required_total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension];
+            required_total_size =
+                static_cast<size_t>(_tensor_shape[idx_last_dimension]) * required_strides[idx_last_dimension];
             break;
         }
     }
@@ -284,25 +302,25 @@
 
     bool updated = false;
 
-    if(padding.top > _padding.top)
+    if (padding.top > _padding.top)
     {
         _padding.top = padding.top;
         updated      = true;
     }
 
-    if(padding.right > _padding.right)
+    if (padding.right > _padding.right)
     {
         _padding.right = padding.right;
         updated        = true;
     }
 
-    if(padding.bottom > _padding.bottom)
+    if (padding.bottom > _padding.bottom)
     {
         _padding.bottom = padding.bottom;
         updated         = true;
     }
 
-    if(padding.left > _padding.left)
+    if (padding.left > _padding.left)
     {
         _padding.left = padding.left;
         updated       = true;
@@ -336,7 +354,7 @@
 {
     _format = format;
 
-    if(_data_type == DataType::UNKNOWN)
+    if (_data_type == DataType::UNKNOWN)
     {
         _num_channels = num_channels_from_format(format);
         _data_type    = data_type_from_format(format);
@@ -355,19 +373,19 @@
     _offset_first_element_in_bytes = 0;
     _strides_in_bytes              = compute_strides(*this);
 
-    if(_tensor_shape.num_dimensions() == 0)
+    if (_tensor_shape.num_dimensions() == 0)
     {
         _total_size = _strides_in_bytes[0];
     }
     else
     {
         const unsigned int idx_last_dimension = _tensor_shape.num_dimensions() - 1;
-        _total_size                           = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension];
+        _total_size = static_cast<size_t>(_tensor_shape[idx_last_dimension]) * _strides_in_bytes[idx_last_dimension];
     }
 
     std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding);
 
-    _valid_region = ValidRegion{ Coordinates(), _tensor_shape };
+    _valid_region = ValidRegion{Coordinates(), _tensor_shape};
     return *this;
 }
 
@@ -392,9 +410,10 @@
 ITensorInfo &TensorInfo::reset_padding()
 {
     _padding = PaddingSize();
-    if(((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0)
+    if (((_format != Format::UNKNOWN) || (_data_type != DataType::UNKNOWN)) && _total_size != 0)
     {
-        std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) = calculate_padding_requirements(_padding);
+        std::tie(_strides_in_bytes, _offset_first_element_in_bytes, _total_size) =
+            calculate_padding_requirements(_padding);
     }
     return *this;
 }
@@ -405,7 +424,7 @@
 
     int32_t offset = _offset_first_element_in_bytes;
 
-    for(size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
+    for (size_t i = 0; i < _tensor_shape.num_dimensions(); ++i)
     {
         offset += pos[i] * _strides_in_bytes[i];
     }
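
The TensorInfo::init overloads above illustrate the signature policy seen throughout this change: when a declaration exceeds the column limit, parameters are not bin-packed but break one per line, with types and names column-aligned (BinPackParameters: false plus AlignConsecutiveDeclarations would produce this; assumed, not confirmed). A standalone example of the shape, with hypothetical names:

    #include <cstddef>
    #include <cstdio>

    static std::size_t init_descriptor(const char *tensor_name,
                                       std::size_t num_channels,
                                       std::size_t offset_first_element_in_bytes,
                                       std::size_t total_size_in_bytes)
    {
        std::printf("init %s\n", tensor_name);
        return num_channels + offset_first_element_in_bytes + total_size_in_bytes;
    }

    int main()
    {
        return init_descriptor("example", 1, 0, 16) == 17 ? 0 : 1;
    }
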
diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp
index 1ca7adb..90a7ac3 100644
--- a/src/core/Utils.cpp
+++ b/src/core/Utils.cpp
@@ -49,7 +49,7 @@
         fs.exceptions(std::ifstream::failbit | std::ifstream::badbit);
         std::ios_base::openmode mode = std::ios::in;
 
-        if(binary)
+        if (binary)
         {
             mode |= std::ios::binary;
         }
@@ -66,7 +66,7 @@
         out.assign(std::istreambuf_iterator<char>(fs), std::istreambuf_iterator<char>());
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
-    catch(const std::ifstream::failure &e)
+    catch (const std::ifstream::failure &e)
     {
         ARM_COMPUTE_ERROR_VAR("Accessing %s: %s", filename.c_str(), e.what());
     }
@@ -77,32 +77,28 @@
 
 const std::string &string_from_channel(Channel channel)
 {
-    static std::map<Channel, const std::string> channels_map =
-    {
-        { Channel::UNKNOWN, "UNKNOWN" },
-        { Channel::R, "R" },
-        { Channel::G, "G" },
-        { Channel::B, "B" },
-        { Channel::A, "A" },
-        { Channel::Y, "Y" },
-        { Channel::U, "U" },
-        { Channel::V, "V" },
-        { Channel::C0, "C0" },
-        { Channel::C1, "C1" },
-        { Channel::C2, "C2" },
-        { Channel::C3, "C3" }
-    };
+    static std::map<Channel, const std::string> channels_map = {{Channel::UNKNOWN, "UNKNOWN"},
+                                                                {Channel::R, "R"},
+                                                                {Channel::G, "G"},
+                                                                {Channel::B, "B"},
+                                                                {Channel::A, "A"},
+                                                                {Channel::Y, "Y"},
+                                                                {Channel::U, "U"},
+                                                                {Channel::V, "V"},
+                                                                {Channel::C0, "C0"},
+                                                                {Channel::C1, "C1"},
+                                                                {Channel::C2, "C2"},
+                                                                {Channel::C3, "C3"}};
 
     return channels_map[channel];
 }
 
 const std::string &string_from_border_mode(BorderMode border_mode)
 {
-    static std::map<BorderMode, const std::string> border_mode_map =
-    {
-        { BorderMode::UNDEFINED, "UNDEFINED" },
-        { BorderMode::CONSTANT, "CONSTANT" },
-        { BorderMode::REPLICATE, "REPLICATE" },
+    static std::map<BorderMode, const std::string> border_mode_map = {
+        {BorderMode::UNDEFINED, "UNDEFINED"},
+        {BorderMode::CONSTANT, "CONSTANT"},
+        {BorderMode::REPLICATE, "REPLICATE"},
     };
 
     return border_mode_map[border_mode];
@@ -110,11 +106,10 @@
 
 const std::string &string_from_norm_type(NormType type)
 {
-    static std::map<NormType, const std::string> norm_type_map =
-    {
-        { NormType::IN_MAP_1D, "IN_MAP_1D" },
-        { NormType::IN_MAP_2D, "IN_MAP_2D" },
-        { NormType::CROSS_MAP, "CROSS_MAP" },
+    static std::map<NormType, const std::string> norm_type_map = {
+        {NormType::IN_MAP_1D, "IN_MAP_1D"},
+        {NormType::IN_MAP_2D, "IN_MAP_2D"},
+        {NormType::CROSS_MAP, "CROSS_MAP"},
     };
 
     return norm_type_map[type];
@@ -122,11 +117,10 @@
 
 const std::string &string_from_pooling_type(PoolingType type)
 {
-    static std::map<PoolingType, const std::string> pool_type_map =
-    {
-        { PoolingType::MAX, "MAX" },
-        { PoolingType::AVG, "AVG" },
-        { PoolingType::L2, "L2" },
+    static std::map<PoolingType, const std::string> pool_type_map = {
+        {PoolingType::MAX, "MAX"},
+        {PoolingType::AVG, "AVG"},
+        {PoolingType::L2, "L2"},
     };
 
     return pool_type_map[type];
@@ -134,38 +128,36 @@
 
 bool is_pool_region_entirely_outside_input(const PoolingLayerInfo &info)
 {
-    if(info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0)
+    if (info.is_global_pooling || info.exclude_padding || info.pool_size.x() == 0 || info.pool_size.y() == 0)
     {
         return false;
     }
     const auto ps                = info.pad_stride_info;
-    const auto pool_le_padding_x = info.pool_size.x() <= std::max({ ps.pad_left(), ps.pad_right() });
-    const auto pool_le_padding_y = info.pool_size.y() <= std::max({ ps.pad_top(), ps.pad_bottom() });
+    const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.pad_left(), ps.pad_right()});
+    const auto pool_le_padding_y = info.pool_size.y() <= std::max({ps.pad_top(), ps.pad_bottom()});
     return pool_le_padding_x || pool_le_padding_y;
 }
 
 bool is_pool_3d_region_entirely_outside_input(const Pooling3dLayerInfo &info)
 {
-    if(info.is_global_pooling || info.pool_size.x() == 0 || info.pool_size.y() == 0 || info.pool_size.z() == 0)
+    if (info.is_global_pooling || info.pool_size.x() == 0 || info.pool_size.y() == 0 || info.pool_size.z() == 0)
     {
         return false;
     }
     const auto ps                = info.padding;
-    const auto pool_le_padding_x = info.pool_size.x() <= std::max({ ps.left, ps.right });
-    const auto pool_le_padding_y = info.pool_size.y() <= std::max({ ps.top, ps.bottom });
-    const auto pool_le_padding_z = info.pool_size.z() <= std::max({ ps.front, ps.back });
+    const auto pool_le_padding_x = info.pool_size.x() <= std::max({ps.left, ps.right});
+    const auto pool_le_padding_y = info.pool_size.y() <= std::max({ps.top, ps.bottom});
+    const auto pool_le_padding_z = info.pool_size.z() <= std::max({ps.front, ps.back});
     return pool_le_padding_x || pool_le_padding_y || pool_le_padding_z;
 }
 
 const std::string &string_from_gemmlowp_output_stage(GEMMLowpOutputStageType output_stage)
 {
-    static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map =
-    {
-        { GEMMLowpOutputStageType::NONE, "" },
-        { GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down" },
-        { GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint" },
-        { GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float" }
-    };
+    static std::map<GEMMLowpOutputStageType, const std::string> output_stage_map = {
+        {GEMMLowpOutputStageType::NONE, ""},
+        {GEMMLowpOutputStageType::QUANTIZE_DOWN, "quantize_down"},
+        {GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT, "quantize_down_fixedpoint"},
+        {GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT, "quantize_down_float"}};
 
     return output_stage_map[output_stage];
 }
@@ -175,7 +167,7 @@
     std::stringstream ss;
     std::string       converted_string;
 
-    switch(data_type)
+    switch (data_type)
     {
         case DataType::U8:
         case DataType::QASYMM8:
@@ -223,11 +215,16 @@
     return converted_string;
 }
 
-PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout, const Size2D &dilation,
+PadStrideInfo calculate_same_pad(TensorShape                  input_shape,
+                                 TensorShape                  weights_shape,
+                                 PadStrideInfo                conv_info,
+                                 DataLayout                   data_layout,
+                                 const Size2D                &dilation,
                                  const DimensionRoundingType &rounding_type)
 {
     const auto &strides = conv_info.stride();
-    ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1), "Stride values should be greater than or equal to 1.");
+    ARM_COMPUTE_ERROR_ON_MSG((strides.first < 1 || strides.second < 1),
+                             "Stride values should be greater than or equal to 1.");
 
     const unsigned int width_idx     = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const unsigned int height_idx    = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -246,8 +243,9 @@
     const int real_weight_height = (kernel_height - 1) * dilation.y() + 1;
 
     // Calculate total pad
-    const int pad_width  = std::max(0, static_cast<int>((out_width - 1) * strides.first + real_weight_width - in_width));
-    const int pad_height = std::max(0, static_cast<int>((out_height - 1) * strides.second + real_weight_height - in_height));
+    const int pad_width = std::max(0, static_cast<int>((out_width - 1) * strides.first + real_weight_width - in_width));
+    const int pad_height =
+        std::max(0, static_cast<int>((out_height - 1) * strides.second + real_weight_height - in_height));
 
     // Calculate individual paddings
     const unsigned int pad_left   = pad_width / 2;
@@ -265,8 +263,10 @@
     return same_info;
 }
 
-std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int in_width, unsigned int in_height,
-                                                                      unsigned int kernel_width, unsigned int kernel_height,
+std::pair<unsigned int, unsigned int> deconvolution_output_dimensions(unsigned int         in_width,
+                                                                      unsigned int         in_height,
+                                                                      unsigned int         kernel_width,
+                                                                      unsigned int         kernel_height,
                                                                       const PadStrideInfo &pad_stride_info)
 {
     const unsigned int pad_left   = pad_stride_info.pad_left();
@@ -285,8 +285,10 @@
     return std::make_pair<unsigned int, unsigned int>(w, h);
 }
 
-std::pair<unsigned int, unsigned int> scaled_dimensions(int width, int height,
-                                                        int kernel_width, int kernel_height,
+std::pair<unsigned int, unsigned int> scaled_dimensions(int                  width,
+                                                        int                  height,
+                                                        int                  kernel_width,
+                                                        int                  kernel_height,
                                                         const PadStrideInfo &pad_stride_info,
                                                         const Size2D        &dilation)
 {
@@ -300,15 +302,25 @@
     const int stride_y   = pad_stride_info.stride().second;
     int       w          = 0;
     int       h          = 0;
-    switch(pad_stride_info.round())
+    switch (pad_stride_info.round())
     {
         case DimensionRoundingType::FLOOR:
-            w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1));
-            h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1));
+            w = static_cast<int>(std::floor(
+                (static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) +
+                1));
+            h = static_cast<int>(
+                std::floor((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) /
+                            stride_y) +
+                           1));
             break;
         case DimensionRoundingType::CEIL:
-            w = static_cast<int>(std::ceil((static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) + 1));
-            h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) / stride_y) + 1));
+            w = static_cast<int>(std::ceil(
+                (static_cast<float>(width + pad_left + pad_right - (dilation_x * (kernel_width - 1) + 1)) / stride_x) +
+                1));
+            h = static_cast<int>(
+                std::ceil((static_cast<float>(height + pad_top + pad_bottom - (dilation_y * (kernel_height - 1) + 1)) /
+                           stride_y) +
+                          1));
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported rounding type");
@@ -319,9 +331,8 @@
     return std::make_pair(static_cast<unsigned int>(w), static_cast<unsigned int>(h));
 }
 
-std::pair<int, int> scaled_dimensions_signed(int width, int height,
-                                             int kernel_width, int kernel_height,
-                                             const PadStrideInfo &pad_stride_info)
+std::pair<int, int> scaled_dimensions_signed(
+    int width, int height, int kernel_width, int kernel_height, const PadStrideInfo &pad_stride_info)
 {
     const int pad_left   = pad_stride_info.pad_left();
     const int pad_top    = pad_stride_info.pad_top();
@@ -331,15 +342,19 @@
     const int stride_y   = pad_stride_info.stride().second;
     int       w          = 0;
     int       h          = 0;
-    switch(pad_stride_info.round())
+    switch (pad_stride_info.round())
     {
         case DimensionRoundingType::FLOOR:
-            w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
-            h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+            w = static_cast<int>(
+                std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+            h = static_cast<int>(
+                std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
             break;
         case DimensionRoundingType::CEIL:
-            w = static_cast<int>(std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
-            h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+            w = static_cast<int>(
+                std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+            h = static_cast<int>(
+                std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported rounding type");
@@ -348,8 +363,12 @@
     return std::make_pair(static_cast<int>(w), static_cast<int>(h));
 }
 
-std::tuple<int, int, int> scaled_3d_dimensions_signed(int width, int height, int depth,
-                                                      int kernel_width, int kernel_height, int kernel_depth,
+std::tuple<int, int, int> scaled_3d_dimensions_signed(int                       width,
+                                                      int                       height,
+                                                      int                       depth,
+                                                      int                       kernel_width,
+                                                      int                       kernel_height,
+                                                      int                       kernel_depth,
                                                       const Pooling3dLayerInfo &pool3d_info)
 {
     const int pad_left   = pool3d_info.padding.left;
@@ -365,17 +384,23 @@
     int       h          = 0;
     int       d          = 0;
 
-    switch(pool3d_info.round_type)
+    switch (pool3d_info.round_type)
     {
         case DimensionRoundingType::FLOOR:
-            w = static_cast<int>(std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
-            h = static_cast<int>(std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
-            d = static_cast<int>(std::floor((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
+            w = static_cast<int>(
+                std::floor((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+            h = static_cast<int>(
+                std::floor((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+            d = static_cast<int>(
+                std::floor((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
             break;
         case DimensionRoundingType::CEIL:
-            w = static_cast<int>(std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
-            h = static_cast<int>(std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
-            d = static_cast<int>(std::ceil((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
+            w = static_cast<int>(
+                std::ceil((static_cast<float>(width + pad_left + pad_right - kernel_width) / stride_x) + 1));
+            h = static_cast<int>(
+                std::ceil((static_cast<float>(height + pad_top + pad_bottom - kernel_height) / stride_y) + 1));
+            d = static_cast<int>(
+                std::ceil((static_cast<float>(depth + pad_front + pad_back - kernel_depth) / stride_z) + 1));
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported rounding type");
@@ -400,9 +425,9 @@
     // * Softmax with QASYMM8_SIGNED: scale = 1/256, offset = -128
     // * LogSoftmax with QASYMM8: scale = 1/256, offset = 0
     // * LogSoftmax with QASYMM8_SIGNED: scale = 16/256, offset = 127
-    if(is_data_type_quantized_asymmetric_signed(input_type))
+    if (is_data_type_quantized_asymmetric_signed(input_type))
     {
-        if(is_log)
+        if (is_log)
         {
             return QuantizationInfo(16.f / 256, 127);
         }
@@ -414,17 +439,21 @@
     return QuantizationInfo(1.f / 256, 0);
 }
 
-std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info, DataType data_type, UniformQuantizationInfo oq_info)
+std::pair<int32_t, int32_t> get_quantized_activation_min_max(const ActivationLayerInfo &act_info,
+                                                             DataType                   data_type,
+                                                             UniformQuantizationInfo    oq_info)
 {
     const bool is_qasymm8_signed = is_data_type_quantized_asymmetric_signed(data_type);
     const auto a                 = act_info.a();
     const auto b                 = act_info.b();
-    const int  a_int             = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info);
-    const int  b_int             = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info);
-    const auto type_max_value    = std::get<1>(get_min_max(data_type)).get<int32_t>();
+    const int  a_int          = is_qasymm8_signed ? quantize_qasymm8_signed(a, oq_info) : quantize_qasymm8(a, oq_info);
+    const int  b_int          = is_qasymm8_signed ? quantize_qasymm8_signed(b, oq_info) : quantize_qasymm8(b, oq_info);
+    const auto type_max_value = std::get<1>(get_min_max(data_type)).get<int32_t>();
 
-    const int32_t min_activation = act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int;
-    const int32_t max_activation = act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int;
+    const int32_t min_activation =
+        act_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU ? oq_info.offset : b_int;
+    const int32_t max_activation =
+        act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ? type_max_value : a_int;
 
     return std::make_pair(min_activation, max_activation);
 }
@@ -433,11 +462,11 @@
 {
     std::unordered_map<const ITensorInfo *, PaddingSize> res;
 
-    for(const ITensor *tensor : tensors)
+    for (const ITensor *tensor : tensors)
     {
-        if(tensor)
+        if (tensor)
         {
-            res.insert({ tensor->info(), tensor->info()->padding() });
+            res.insert({tensor->info(), tensor->info()->padding()});
         }
     }
 
@@ -448,11 +477,11 @@
 {
     std::unordered_map<const ITensorInfo *, PaddingSize> res;
 
-    for(const ITensorInfo *info : infos)
+    for (const ITensorInfo *info : infos)
     {
-        if(info)
+        if (info)
         {
-            res.insert({ info, info->padding() });
+            res.insert({info, info->padding()});
         }
     }
 
@@ -461,17 +490,20 @@
 
 bool has_padding_changed(const std::unordered_map<const ITensorInfo *, PaddingSize> &padding_map)
 {
-    return std::find_if(padding_map.begin(), padding_map.end(), [](const std::pair<const ITensorInfo *, PaddingSize> &padding_info)
-    {
-        return (padding_info.first->padding() != padding_info.second);
-    })
-    != padding_map.end();
+    return std::find_if(padding_map.begin(), padding_map.end(),
+                        [](const std::pair<const ITensorInfo *, PaddingSize> &padding_info)
+                        { return (padding_info.first->padding() != padding_info.second); }) != padding_map.end();
 }
 
 #ifdef ARM_COMPUTE_ASSERTS_ENABLED
-void print_consecutive_elements(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n, int stream_width, const std::string &element_delim)
+void print_consecutive_elements(std::ostream      &s,
+                                DataType           dt,
+                                const uint8_t     *ptr,
+                                unsigned int       n,
+                                int                stream_width,
+                                const std::string &element_delim)
 {
-    switch(dt)
+    switch (dt)
     {
         case DataType::U8:
         case DataType::QASYMM8:
@@ -481,36 +513,46 @@
         case DataType::QSYMM8:
         case DataType::QASYMM8_SIGNED:
         case DataType::QSYMM8_PER_CHANNEL:
-            print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width, element_delim);
+            print_consecutive_elements_impl<int8_t>(s, reinterpret_cast<const int8_t *>(ptr), n, stream_width,
+                                                    element_delim);
             break;
         case DataType::U16:
         case DataType::QASYMM16:
-            print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width, element_delim);
+            print_consecutive_elements_impl<uint16_t>(s, reinterpret_cast<const uint16_t *>(ptr), n, stream_width,
+                                                      element_delim);
             break;
         case DataType::S16:
         case DataType::QSYMM16:
-            print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width, element_delim);
+            print_consecutive_elements_impl<int16_t>(s, reinterpret_cast<const int16_t *>(ptr), n, stream_width,
+                                                     element_delim);
             break;
         case DataType::U32:
-            print_consecutive_elements_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n, stream_width, element_delim);
+            print_consecutive_elements_impl<uint32_t>(s, reinterpret_cast<const uint32_t *>(ptr), n, stream_width,
+                                                      element_delim);
             break;
         case DataType::S32:
-            print_consecutive_elements_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n, stream_width, element_delim);
+            print_consecutive_elements_impl<int32_t>(s, reinterpret_cast<const int32_t *>(ptr), n, stream_width,
+                                                     element_delim);
             break;
         case DataType::U64:
-            print_consecutive_elements_impl<uint64_t>(s, reinterpret_cast<const uint64_t *>(ptr), n, stream_width, element_delim);
+            print_consecutive_elements_impl<uint64_t>(s, reinterpret_cast<const uint64_t *>(ptr), n, stream_width,
+                                                      element_delim);
             break;
         case DataType::S64:
-            print_consecutive_elements_impl<int64_t>(s, reinterpret_cast<const int64_t *>(ptr), n, stream_width, element_delim);
+            print_consecutive_elements_impl<int64_t>(s, reinterpret_cast<const int64_t *>(ptr), n, stream_width,
+                                                     element_delim);
             break;
         case DataType::BFLOAT16:
-            print_consecutive_elements_impl<bfloat16>(s, reinterpret_cast<const bfloat16 *>(ptr), n, stream_width, element_delim);
+            print_consecutive_elements_impl<bfloat16>(s, reinterpret_cast<const bfloat16 *>(ptr), n, stream_width,
+                                                      element_delim);
             break;
         case DataType::F16:
-            print_consecutive_elements_impl<half>(s, reinterpret_cast<const half *>(ptr), n, stream_width, element_delim);
+            print_consecutive_elements_impl<half>(s, reinterpret_cast<const half *>(ptr), n, stream_width,
+                                                  element_delim);
             break;
         case DataType::F32:
-            print_consecutive_elements_impl<float>(s, reinterpret_cast<const float *>(ptr), n, stream_width, element_delim);
+            print_consecutive_elements_impl<float>(s, reinterpret_cast<const float *>(ptr), n, stream_width,
+                                                   element_delim);
             break;
         default:
             ARM_COMPUTE_ERROR("Undefined element size for given data type");
@@ -519,7 +561,7 @@
 
 int max_consecutive_elements_display_width(std::ostream &s, DataType dt, const uint8_t *ptr, unsigned int n)
 {
-    switch(dt)
+    switch (dt)
     {
         case DataType::U8:
         case DataType::QASYMM8:
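
Note on the file above: the reflowed get_quantized_activation_min_max() encodes a clamp-range rule that is easy to check standalone. The sketch below restates it; quantize_u8() is a hypothetical stand-in for arm_compute::quantize_qasymm8(), not library code.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <utility>

    // Hypothetical stand-in for arm_compute::quantize_qasymm8().
    int32_t quantize_u8(float v, float scale, int32_t offset)
    {
        const int32_t q = static_cast<int32_t>(std::lround(v / scale)) + offset;
        return std::min(std::max(q, 0), 255);
    }

    // Bounded ReLU with bounds [b, a]: both clamp ends come from the bounds.
    // Plain ReLU instead clamps to [offset, type max]; only LU_BOUNDED_RELU
    // takes its lower end from b.
    std::pair<int32_t, int32_t> clamp_range_bounded_relu(float a, float b, float scale, int32_t offset)
    {
        return {quantize_u8(b, scale, offset), quantize_u8(a, scale, offset)};
    }
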
diff --git a/src/core/Validate.cpp b/src/core/Validate.cpp
index 5a6486e..d8f7961 100644
--- a/src/core/Validate.cpp
+++ b/src/core/Validate.cpp
@@ -23,13 +23,16 @@
  */
 #include "arm_compute/core/Validate.h"
 
-arm_compute::Status arm_compute::error_on_mismatching_windows(const char *function, const char *file, const int line,
-                                                              const arm_compute::Window &full, const arm_compute::Window &win)
+arm_compute::Status arm_compute::error_on_mismatching_windows(const char                *function,
+                                                              const char                *file,
+                                                              const int                  line,
+                                                              const arm_compute::Window &full,
+                                                              const arm_compute::Window &win)
 {
     full.validate();
     win.validate();
 
-    for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+    for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() != win[i].start(), function, file, line);
         ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() != win[i].end(), function, file, line);
@@ -38,13 +41,16 @@
     return arm_compute::Status{};
 }
 
-arm_compute::Status arm_compute::error_on_invalid_subwindow(const char *function, const char *file, const int line,
-                                                            const arm_compute::Window &full, const arm_compute::Window &sub)
+arm_compute::Status arm_compute::error_on_invalid_subwindow(const char                *function,
+                                                            const char                *file,
+                                                            const int                  line,
+                                                            const arm_compute::Window &full,
+                                                            const arm_compute::Window &sub)
 {
     full.validate();
     sub.validate();
 
-    for(size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+    for (size_t i = 0; i < arm_compute::Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].start() > sub[i].start(), function, file, line);
         ARM_COMPUTE_RETURN_ERROR_ON_LOC(full[i].end() < sub[i].end(), function, file, line);
@@ -54,8 +60,12 @@
     return arm_compute::Status{};
 }
 
-arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char *function, const char *file, const int line,
-                                                                              const arm_compute::Window &full, const arm_compute::Window &window, const int dim)
+arm_compute::Status arm_compute::error_on_window_not_collapsable_at_dimension(const char                *function,
+                                                                              const char                *file,
+                                                                              const int                  line,
+                                                                              const arm_compute::Window &full,
+                                                                              const arm_compute::Window &window,
+                                                                              const int                  dim)
 {
     full.validate();
     window.validate();
@@ -67,65 +77,73 @@
     return arm_compute::Status{};
 }
 
-arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte(const char *function, const char *file, const int line,
-                                                                     const arm_compute::Coordinates &pos, unsigned int max_dim)
+arm_compute::Status arm_compute::error_on_coordinates_dimensions_gte(
+    const char *function, const char *file, const int line, const arm_compute::Coordinates &pos, unsigned int max_dim)
 {
-    for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+    for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_LOC(pos[i] != 0, function, file, line);
     }
     return arm_compute::Status{};
 }
 
-arm_compute::Status arm_compute::error_on_window_dimensions_gte(const char *function, const char *file, const int line,
-                                                                const arm_compute::Window &win, unsigned int max_dim)
+arm_compute::Status arm_compute::error_on_window_dimensions_gte(
+    const char *function, const char *file, const int line, const arm_compute::Window &win, unsigned int max_dim)
 {
-    for(unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
+    for (unsigned int i = max_dim; i < arm_compute::Coordinates::num_max_dimensions; ++i)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR((win[i].start() != 0) || (win[i].end() != win[i].step()),
-                                                function, file, line,
-                                                "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(
+            (win[i].start() != 0) || (win[i].end() != win[i].step()), function, file, line,
+            "Maximum number of dimensions expected %u but dimension %u is not empty", max_dim, i);
     }
     return arm_compute::Status{};
 }
 
-arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line,
+arm_compute::Status arm_compute::error_on_tensor_not_2d(const char                 *function,
+                                                        const char                 *file,
+                                                        const int                   line,
                                                         const arm_compute::ITensor *tensor)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor->info() == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2,
-                                            function, file, line,
-                                            "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->info()->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->info()->num_dimensions() != 2, function, file, line,
+                                            "Only 2D Tensors are supported by this kernel (%zu passed)",
+                                            tensor->info()->num_dimensions());
     return arm_compute::Status{};
 }
 
-arm_compute::Status arm_compute::error_on_tensor_not_2d(const char *function, const char *file, const int line,
+arm_compute::Status arm_compute::error_on_tensor_not_2d(const char                     *function,
+                                                        const char                     *file,
+                                                        const int                       line,
                                                         const arm_compute::ITensorInfo *tensor)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(tensor == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2,
-                                            function, file, line,
-                                            "Only 2D Tensors are supported by this kernel (%zu passed)", tensor->num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG_VAR(tensor->num_dimensions() != 2, function, file, line,
+                                            "Only 2D Tensors are supported by this kernel (%zu passed)",
+                                            tensor->num_dimensions());
     return arm_compute::Status{};
 }
 
-arm_compute::Status arm_compute::error_on_channel_not_in_known_format(const char *function, const char *file, const int line,
-                                                                      arm_compute::Format fmt, arm_compute::Channel cn)
+arm_compute::Status arm_compute::error_on_channel_not_in_known_format(
+    const char *function, const char *file, const int line, arm_compute::Format fmt, arm_compute::Channel cn)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(fmt == arm_compute::Format::UNKNOWN, function, file, line);
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(cn == arm_compute::Channel::UNKNOWN, function, file, line);
 
-    switch(fmt)
+    switch (fmt)
     {
         case arm_compute::Format::RGB888:
-            arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B);
+            arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R,
+                                                 arm_compute::Channel::G, arm_compute::Channel::B);
             break;
         case arm_compute::Format::RGBA8888:
-            arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R, arm_compute::Channel::G, arm_compute::Channel::B, arm_compute::Channel::A);
+            arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::R,
+                                                 arm_compute::Channel::G, arm_compute::Channel::B,
+                                                 arm_compute::Channel::A);
             break;
         case arm_compute::Format::UV88:
-            arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U, arm_compute::Channel::V);
+            arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::U,
+                                                 arm_compute::Channel::V);
             break;
         case arm_compute::Format::IYUV:
         case arm_compute::Format::UYVY422:
@@ -133,7 +151,8 @@
         case arm_compute::Format::NV12:
         case arm_compute::Format::NV21:
         case arm_compute::Format::YUV444:
-            arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y, arm_compute::Channel::U, arm_compute::Channel::V);
+            arm_compute::error_on_channel_not_in(function, file, line, cn, arm_compute::Channel::Y,
+                                                 arm_compute::Channel::U, arm_compute::Channel::V);
             break;
         default:
             ARM_COMPUTE_ERROR_LOC(function, file, line, "Not supported format.");
@@ -141,21 +160,26 @@
     return arm_compute::Status{};
 }
 
-arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char *function, const char *file, const int line,
+arm_compute::Status arm_compute::error_on_unconfigured_kernel(const char                 *function,
+                                                              const char                 *file,
+                                                              const int                   line,
                                                               const arm_compute::IKernel *kernel)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_LOC(kernel == nullptr, function, file, line);
-    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(),
-                                        function, file, line,
+    ARM_COMPUTE_RETURN_ERROR_ON_LOC_MSG(!kernel->is_window_configured(), function, file, line,
                                         "This kernel hasn't been configured.");
     return arm_compute::Status{};
 }
 
-arm_compute::Status arm_compute::error_on_invalid_subtensor(const char *function, const char *file, const int line,
-                                                            const TensorShape &parent_shape, const Coordinates &coords, const TensorShape &shape)
+arm_compute::Status arm_compute::error_on_invalid_subtensor(const char        *function,
+                                                            const char        *file,
+                                                            const int          line,
+                                                            const TensorShape &parent_shape,
+                                                            const Coordinates &coords,
+                                                            const TensorShape &shape)
 {
     // Check dimensions
-    for(unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
+    for (unsigned int i = 0; i < TensorShape::num_max_dimensions; ++i)
     {
         const bool invalid_idx        = coords[i] >= static_cast<int>(parent_shape[i]);
         const bool out_of_bounds_size = coords[i] + static_cast<int>(shape[i]) > static_cast<int>(parent_shape[i]);
@@ -164,15 +188,20 @@
     return arm_compute::Status{};
 }
 
-arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char *function, const char *file, const int line,
-                                                                         const ValidRegion &parent_valid_region, const ValidRegion &valid_region)
+arm_compute::Status arm_compute::error_on_invalid_subtensor_valid_region(const char        *function,
+                                                                         const char        *file,
+                                                                         const int          line,
+                                                                         const ValidRegion &parent_valid_region,
+                                                                         const ValidRegion &valid_region)
 {
     // Check valid regions
-    for(unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d)
+    for (unsigned int d = 0; d < TensorShape::num_max_dimensions; ++d)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] > valid_region.anchor[d]), function, file, line);
-        ARM_COMPUTE_RETURN_ERROR_ON_LOC((parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) < (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
-                                        function, file, line);
+        ARM_COMPUTE_RETURN_ERROR_ON_LOC(
+            (parent_valid_region.anchor[d] + static_cast<int>(parent_valid_region.shape[d])) <
+                (valid_region.anchor[d] + static_cast<int>(valid_region.shape[d])),
+            function, file, line);
     }
 
     return arm_compute::Status{};
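
The subwindow validation reflowed above reduces to a per-dimension containment test. A self-contained sketch of the rule error_on_invalid_subwindow() enforces, with an illustrative Range type in place of the library's Window:

    #include <cstddef>

    struct Range
    {
        int start;
        int end;
    };

    // A sub-window is valid only if, in every dimension, its range lies inside
    // the full window's range; the library returns an error Status otherwise.
    bool is_valid_subwindow(const Range *full, const Range *sub, std::size_t dims)
    {
        for (std::size_t d = 0; d < dims; ++d)
        {
            if (full[d].start > sub[d].start || full[d].end < sub[d].end)
            {
                return false;
            }
        }
        return true;
    }
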
diff --git a/src/core/common/Macros.h b/src/core/common/Macros.h
index d791154..bc0ea29 100644
--- a/src/core/common/Macros.h
+++ b/src/core/common/Macros.h
@@ -25,9 +25,9 @@
 #define ARM_COMPUTE_COMMON_MACROS_H
 
 #define ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(TypeName) \
-    TypeName(const TypeName &) = delete;               \
+    TypeName(const TypeName &)            = delete;    \
     TypeName &operator=(const TypeName &) = delete;    \
     TypeName(TypeName &&)                 = default;   \
-    TypeName &operator=(TypeName &&) = default
+    TypeName &operator=(TypeName &&)      = default
 
 #endif /* ARM_COMPUTE_COMMON_MACROS_H */
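
Usage sketch for the macro realigned above; KernelHandle is a hypothetical class. The macro deliberately omits the trailing semicolon, so the call site supplies it:

    #include <utility>

    class KernelHandle
    {
    public:
        KernelHandle() = default;
        ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(KernelHandle); // move-only from here on
    };

    void demo()
    {
        KernelHandle a;
        KernelHandle b = std::move(a); // OK: move is defaulted
        // KernelHandle c = b;         // ill-formed: copy constructor is deleted
    }
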
diff --git a/src/core/common/Registrars.h b/src/core/common/Registrars.h
index d6dc344..686304b 100644
--- a/src/core/common/Registrars.h
+++ b/src/core/common/Registrars.h
@@ -46,7 +46,7 @@
 
 #else /* !defined(ENABLE_FP16_KERNELS) */
 #define REGISTER_FP16_NEON(func_name) nullptr
-#define REGISTER_FP16_SVE(func_name) nullptr
+#define REGISTER_FP16_SVE(func_name)  nullptr
 #define REGISTER_FP16_SVE2(func_name) nullptr
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
 
@@ -72,7 +72,7 @@
 
 #else /* defined(ENABLE_FP32_KERNELS) */
 #define REGISTER_FP32_NEON(func_name) nullptr
-#define REGISTER_FP32_SVE(func_name) nullptr
+#define REGISTER_FP32_SVE(func_name)  nullptr
 #define REGISTER_FP32_SVE2(func_name) nullptr
 #endif /* defined(ENABLE_FP32_KERNELS) */
 
@@ -94,7 +94,7 @@
 
 #else /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
 #define REGISTER_QASYMM8_SIGNED_NEON(func_name) nullptr
-#define REGISTER_QASYMM8_SIGNED_SVE(func_name) nullptr
+#define REGISTER_QASYMM8_SIGNED_SVE(func_name)  nullptr
 #define REGISTER_QASYMM8_SIGNED_SVE2(func_name) nullptr
 #endif /* defined(ENABLE_QASYMM8_SIGNED_KERNELS) */
 
@@ -115,7 +115,7 @@
 
 #else /* defined(ENABLE_QASYMM8_KERNELS) */
 #define REGISTER_QASYMM8_NEON(func_name) nullptr
-#define REGISTER_QASYMM8_SVE(func_name) nullptr
+#define REGISTER_QASYMM8_SVE(func_name)  nullptr
 #define REGISTER_QASYMM8_SVE2(func_name) nullptr
 #endif /* defined(ENABLE_QASYMM8_KERNELS) */
 
@@ -137,7 +137,7 @@
 
 #else /* defined(ENABLE_QSYMM16_KERNELS) */
 #define REGISTER_QSYMM16_NEON(func_name) nullptr
-#define REGISTER_QSYMM16_SVE(func_name) nullptr
+#define REGISTER_QSYMM16_SVE(func_name)  nullptr
 #define REGISTER_QSYMM16_SVE2(func_name) nullptr
 #endif /* defined(ENABLE_QSYMM16_KERNELS) */
 
@@ -169,7 +169,7 @@
 
 #else /* defined(ENABLE_INTEGER_KERNELS) */
 #define REGISTER_INTEGER_NEON(func_name) nullptr
-#define REGISTER_INTEGER_SVE(func_name) nullptr
+#define REGISTER_INTEGER_SVE(func_name)  nullptr
 #define REGISTER_INTEGER_SVE2(func_name) nullptr
 #endif /* defined(ENABLE_INTEGER_KERNELS) */
 
diff --git a/src/core/helpers/AutoConfiguration.h b/src/core/helpers/AutoConfiguration.h
index 8715dcd..9df2a76 100644
--- a/src/core/helpers/AutoConfiguration.h
+++ b/src/core/helpers/AutoConfiguration.h
@@ -24,9 +24,9 @@
 #ifndef SRC_CORE_HELPERS_AUTOCONFIGURATION_H
 #define SRC_CORE_HELPERS_AUTOCONFIGURATION_H
 
-#include "arm_compute/core/utils/DataTypeUtils.h"
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
 
 namespace arm_compute
 {
@@ -42,10 +42,11 @@
  */
 inline bool auto_init_if_empty(ITensorInfo       &info,
                                const TensorShape &shape,
-                               int num_channels, DataType data_type,
-                               QuantizationInfo quantization_info = QuantizationInfo())
+                               int                num_channels,
+                               DataType           data_type,
+                               QuantizationInfo   quantization_info = QuantizationInfo())
 {
-    if(info.tensor_shape().total_size() == 0)
+    if (info.tensor_shape().total_size() == 0)
     {
         info.set_data_type(data_type);
         info.set_num_channels(num_channels);
@@ -70,7 +71,7 @@
  */
 inline bool auto_init_if_empty(ITensorInfo &info_sink, const ITensorInfo &info_source)
 {
-    if(info_sink.tensor_shape().total_size() == 0)
+    if (info_sink.tensor_shape().total_size() == 0)
     {
         info_sink.set_data_type(info_source.data_type());
         info_sink.set_num_channels(info_source.num_channels());
@@ -93,7 +94,7 @@
  */
 inline bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
 {
-    if(info.tensor_shape().total_size() == 0)
+    if (info.tensor_shape().total_size() == 0)
     {
         info.set_tensor_shape(shape);
         return true;
@@ -112,7 +113,7 @@
  */
 inline bool set_format_if_unknown(ITensorInfo &info, Format format)
 {
-    if(info.data_type() == DataType::UNKNOWN)
+    if (info.data_type() == DataType::UNKNOWN)
     {
         info.set_format(format);
         return true;
@@ -131,7 +132,7 @@
  */
 inline bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
 {
-    if(info.data_type() == DataType::UNKNOWN)
+    if (info.data_type() == DataType::UNKNOWN)
     {
         info.set_data_type(data_type);
         return true;
@@ -150,7 +151,7 @@
  */
 inline bool set_data_layout_if_unknown(ITensorInfo &info, DataLayout data_layout)
 {
-    if(info.data_layout() == DataLayout::UNKNOWN)
+    if (info.data_layout() == DataLayout::UNKNOWN)
     {
         info.set_data_layout(data_layout);
         return true;
@@ -169,7 +170,7 @@
  */
 inline bool set_quantization_info_if_empty(ITensorInfo &info, QuantizationInfo quantization_info)
 {
-    if(info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
+    if (info.quantization_info().empty() && (is_data_type_quantized_asymmetric(info.data_type())))
     {
         info.set_quantization_info(quantization_info);
         return true;
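
A minimal usage sketch for the two-argument auto_init_if_empty() overload shown above, assuming this header is in scope; configure_output() is a hypothetical caller:

    #include "arm_compute/core/TensorInfo.h"

    // Default an empty destination to the source's metadata; returns true only
    // when dst was empty and has now been initialised.
    void configure_output(arm_compute::ITensorInfo &dst, const arm_compute::ITensorInfo &src)
    {
        const bool initialised = arm_compute::auto_init_if_empty(dst, src);
        (void)initialised;
    }
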
diff --git a/src/core/helpers/MemoryHelpers.h b/src/core/helpers/MemoryHelpers.h
index a410526..dd094b4 100644
--- a/src/core/helpers/MemoryHelpers.h
+++ b/src/core/helpers/MemoryHelpers.h
@@ -24,9 +24,9 @@
 #ifndef SRC_COMMON_MEMORY_HELPERS_H
 #define SRC_COMMON_MEMORY_HELPERS_H
 
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 
 #include <memory>
@@ -43,18 +43,17 @@
 template <typename TensorType>
 struct WorkspaceDataElement
 {
-    int                          slot{ -1 };
-    experimental::MemoryLifetime lifetime{ experimental::MemoryLifetime::Temporary };
-    std::unique_ptr<TensorType>  tensor{ nullptr };
+    int                          slot{-1};
+    experimental::MemoryLifetime lifetime{experimental::MemoryLifetime::Temporary};
+    std::unique_ptr<TensorType>  tensor{nullptr};
 };
 
 template <typename TensorType>
 using WorkspaceData = std::vector<WorkspaceDataElement<TensorType>>;
 
 template <typename TensorType>
-WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirements &mem_reqs,
-                                           MemoryGroup                            &mgroup,
-                                           ITensorPack                            &run_pack)
+WorkspaceData<TensorType>
+manage_workspace(const experimental::MemoryRequirements &mem_reqs, MemoryGroup &mgroup, ITensorPack &run_pack)
 {
     ITensorPack dummy_pack = ITensorPack();
     return manage_workspace<TensorType>(mem_reqs, mgroup, run_pack, dummy_pack);
@@ -63,24 +62,26 @@
 template <typename TensorType>
 WorkspaceData<TensorType> manage_workspace(const experimental::MemoryRequirements &mem_reqs,
                                            MemoryGroup                            &mgroup,
-                                           ITensorPack &run_pack, ITensorPack &prep_pack)
+                                           ITensorPack                            &run_pack,
+                                           ITensorPack                            &prep_pack)
 {
     WorkspaceData<TensorType> workspace_memory;
-    for(const auto &req : mem_reqs)
+    for (const auto &req : mem_reqs)
     {
-        if(req.size == 0)
+        if (req.size == 0)
         {
             continue;
         }
 
-        const auto aux_info = TensorInfo{ TensorShape(req.size), 1, DataType::U8 };
-        workspace_memory.emplace_back(WorkspaceDataElement<TensorType> { req.slot, req.lifetime, std::make_unique<TensorType>() });
+        const auto aux_info = TensorInfo{TensorShape(req.size), 1, DataType::U8};
+        workspace_memory.emplace_back(
+            WorkspaceDataElement<TensorType>{req.slot, req.lifetime, std::make_unique<TensorType>()});
 
         auto aux_tensor = workspace_memory.back().tensor.get();
         ARM_COMPUTE_ERROR_ON_NULLPTR(aux_tensor);
         aux_tensor->allocator()->init(aux_info, req.alignment);
 
-        if(req.lifetime == experimental::MemoryLifetime::Temporary)
+        if (req.lifetime == experimental::MemoryLifetime::Temporary)
         {
             mgroup.manage(aux_tensor);
         }
@@ -91,7 +92,7 @@
         run_pack.add_tensor(req.slot, aux_tensor);
     }
 
-    for(auto &mem : workspace_memory)
+    for (auto &mem : workspace_memory)
     {
         auto tensor = mem.tensor.get();
         tensor->allocator()->allocate();
@@ -103,31 +104,29 @@
 template <typename TensorType>
 void release_prepare_tensors(WorkspaceData<TensorType> &workspace, ITensorPack &prep_pack)
 {
-    workspace.erase(std::remove_if(workspace.begin(),
-                                   workspace.end(),
-                                   [&prep_pack](auto & wk)
-    {
-        const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare;
-        if(to_erase)
-        {
-            prep_pack.remove_tensor(wk.slot);
-        }
-        return to_erase;
-    }),
-    workspace.end());
+    workspace.erase(std::remove_if(workspace.begin(), workspace.end(),
+                                   [&prep_pack](auto &wk)
+                                   {
+                                       const bool to_erase = wk.lifetime == experimental::MemoryLifetime::Prepare;
+                                       if (to_erase)
+                                       {
+                                           prep_pack.remove_tensor(wk.slot);
+                                       }
+                                       return to_erase;
+                                   }),
+                    workspace.end());
 }
 
 /** Utility function to release tensors with lifetime marked as Prepare */
 template <typename TensorType>
-void release_temporaries(const experimental::MemoryRequirements &mem_reqs,
-                         WorkspaceData<TensorType>              &workspace)
+void release_temporaries(const experimental::MemoryRequirements &mem_reqs, WorkspaceData<TensorType> &workspace)
 {
-    for(auto &ws : workspace)
+    for (auto &ws : workspace)
     {
         const int slot = ws.slot;
-        for(auto &m : mem_reqs)
+        for (auto &m : mem_reqs)
         {
-            if(m.slot == slot && m.lifetime == experimental::MemoryLifetime::Prepare)
+            if (m.slot == slot && m.lifetime == experimental::MemoryLifetime::Prepare)
             {
                 auto tensor = ws.tensor.get();
                 tensor->allocator()->free();
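
The lifetime rule the helpers above encode can be shown standalone: Prepare-lifetime buffers are dropped once prepare-time data is consumed, using the same erase/remove_if idiom as release_prepare_tensors(). Types below are illustrative, not the library's:

    #include <algorithm>
    #include <vector>

    enum class Lifetime
    {
        Temporary, // managed by the memory group, reused between runs
        Prepare,   // alive only until prepare() has consumed it
        Persistent
    };

    struct Buffer
    {
        int      slot;
        Lifetime lifetime;
    };

    void release_prepare(std::vector<Buffer> &workspace)
    {
        workspace.erase(std::remove_if(workspace.begin(), workspace.end(),
                                       [](const Buffer &b) { return b.lifetime == Lifetime::Prepare; }),
                        workspace.end());
    }
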
diff --git a/src/core/helpers/PoolingHelpers.h b/src/core/helpers/PoolingHelpers.h
index 079629e..9ef045f 100644
--- a/src/core/helpers/PoolingHelpers.h
+++ b/src/core/helpers/PoolingHelpers.h
@@ -33,8 +33,20 @@
 namespace
 {
 
-inline float calculate_avg_scale_pool3d(bool exclude_padding, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int pool_size_z, const int upper_bound_w,
-                                 const int upper_bound_h, const int upper_bound_d, const int pad_x, const int pad_y, const int pad_z, const int stride_x, const int stride_y, const int stride_z)
+inline float calculate_avg_scale_pool3d(bool               exclude_padding,
+                                        const Coordinates &id,
+                                        const int          pool_size_x,
+                                        const int          pool_size_y,
+                                        const int          pool_size_z,
+                                        const int          upper_bound_w,
+                                        const int          upper_bound_h,
+                                        const int          upper_bound_d,
+                                        const int          pad_x,
+                                        const int          pad_y,
+                                        const int          pad_z,
+                                        const int          stride_x,
+                                        const int          stride_y,
+                                        const int          stride_z)
 {
     // Based on NDHWC
     int start_x = id[1] * stride_x - pad_x;
@@ -44,7 +56,7 @@
     const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
     const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
     const int end_z = std::min(start_z + pool_size_z, upper_bound_d);
-    if(exclude_padding)
+    if (exclude_padding)
     {
         start_x = std::max(0, start_x);
         start_y = std::max(0, start_y);
@@ -53,8 +65,17 @@
     return 1.f / ((end_y - start_y) * (end_x - start_x) * (end_z - start_z));
 }
 
-inline float calculate_avg_scale_pool2d(bool exclude_padding, DataLayout data_layout, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
-                                 const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+inline float calculate_avg_scale_pool2d(bool               exclude_padding,
+                                        DataLayout         data_layout,
+                                        const Coordinates &id,
+                                        const int          pool_size_x,
+                                        const int          pool_size_y,
+                                        const int          upper_bound_w,
+                                        const int          upper_bound_h,
+                                        const int          pad_x,
+                                        const int          pad_y,
+                                        const int          stride_x,
+                                        const int          stride_y)
 {
     const unsigned int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const unsigned int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -64,7 +85,7 @@
 
     const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
     const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
-    if(exclude_padding)
+    if (exclude_padding)
     {
         start_x = std::max(0, start_x);
         start_y = std::max(0, start_y);
@@ -117,17 +138,26 @@
 }
 
 template <typename Tout>
-inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset);
+inline Tout vrequantize_pooling_with_scale(const float32x4x4_t &acc,
+                                           const float          quant_rescale,
+                                           const float          scale_pooling,
+                                           const int32_t        new_offset);
 
 template <>
-inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
+inline uint8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc,
+                                                 const float          quant_rescale,
+                                                 const float          scale_pooling,
+                                                 const int32_t        new_offset)
 {
     const float new_scale = quant_rescale / scale_pooling;
     return vquantize(acc, UniformQuantizationInfo(new_scale, new_offset));
 }
 
 template <>
-inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc, const float quant_rescale, const float scale_pooling, const int32_t new_offset)
+inline int8x16_t vrequantize_pooling_with_scale(const float32x4x4_t &acc,
+                                                const float          quant_rescale,
+                                                const float          scale_pooling,
+                                                const int32_t        new_offset)
 {
     const float new_scale = quant_rescale / scale_pooling;
     return vquantize_signed(acc, UniformQuantizationInfo(new_scale, new_offset));
@@ -139,30 +169,24 @@
 template <>
 inline uint8x16_t vrequantize_pooling(uint8x8_t vec1, uint8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
 {
-    const float32x4x4_t acc =
-    {
-        {
-            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
-            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
-            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
-            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
-        }
-    };
+    const float32x4x4_t acc = {{
+        vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec1))))),
+        vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec1))))),
+        vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec2))))),
+        vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec2))))),
+    }};
     return vquantize(acc, requant_qinfo);
 }
 
 template <>
 inline int8x16_t vrequantize_pooling(int8x8_t vec1, int8x8_t vec2, const UniformQuantizationInfo &requant_qinfo)
 {
-    const float32x4x4_t acc =
-    {
-        {
-            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
-            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
-            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
-            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
-        }
-    };
+    const float32x4x4_t acc = {{
+        vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec1))))),
+        vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec1))))),
+        vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec2))))),
+        vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec2))))),
+    }};
     return vquantize_signed(acc, requant_qinfo);
 }
 
@@ -172,26 +196,20 @@
 template <>
 inline uint8x8_t vrequantize_pooling(uint8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
 {
-    const float32x4x2_t acc =
-    {
-        {
-            vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
-            vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
-        }
-    };
+    const float32x4x2_t acc = {{
+        vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8((vec))))),
+        vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8((vec))))),
+    }};
     return vquantize(acc, requant_qinfo);
 }
 
 template <>
 inline int8x8_t vrequantize_pooling(int8x8_t &vec, const UniformQuantizationInfo &requant_qinfo)
 {
-    const float32x4x2_t acc =
-    {
-        {
-            vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
-            vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
-        }
-    };
+    const float32x4x2_t acc = {{
+        vcvtq_f32_s32(vmovl_s16(vget_low_s16(vmovl_s8((vec))))),
+        vcvtq_f32_s32(vmovl_s16(vget_high_s16(vmovl_s8((vec))))),
+    }};
     return vquantize_signed(acc, requant_qinfo);
 }
 
@@ -199,4 +217,3 @@
 } // namespace cpu
 } // namespace arm_compute
 #endif /* SRC_CORE_HELPERS_POOLINGHELPERS_H */
-
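
A standalone numeric check of calculate_avg_scale_pool2d() above, with the layout-index lookup elided: for a 3x3 pool, stride 1 and padding 1, the top-left output averages over only the 2x2 valid pixels when padding is excluded, i.e. scale 1/4 rather than 1/9.

    #include <algorithm>

    // Same arithmetic as the helper above, reduced to the two spatial dims.
    float avg_scale_2d(bool exclude_padding, int out_x, int out_y, int pool_x, int pool_y,
                       int upper_w, int upper_h, int pad_x, int pad_y, int stride_x, int stride_y)
    {
        int start_x = out_x * stride_x - pad_x;
        int start_y = out_y * stride_y - pad_y;
        const int end_x = std::min(start_x + pool_x, upper_w);
        const int end_y = std::min(start_y + pool_y, upper_h);
        if (exclude_padding)
        {
            start_x = std::max(0, start_x);
            start_y = std::max(0, start_y);
        }
        return 1.f / ((end_y - start_y) * (end_x - start_x));
    }
    // avg_scale_2d(true,  0, 0, 3, 3, 2, 2, 1, 1, 1, 1) == 0.25f     (2x2 valid)
    // avg_scale_2d(false, 0, 0, 3, 3, 2, 2, 1, 1, 1, 1) == 1.f / 9   (full 3x3)
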
diff --git a/src/core/helpers/ScaleHelpers.h b/src/core/helpers/ScaleHelpers.h
index e769bba..47605e7 100644
--- a/src/core/helpers/ScaleHelpers.h
+++ b/src/core/helpers/ScaleHelpers.h
@@ -50,8 +50,12 @@
  *
  * @return The bilinear interpolated pixel value
  */
-inline uint8_t delta_bilinear_c1_quantized(const uint8_t *pixel_ptr, size_t stride, float dx, float dy,
-                                           UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
+inline uint8_t delta_bilinear_c1_quantized(const uint8_t          *pixel_ptr,
+                                           size_t                  stride,
+                                           float                   dx,
+                                           float                   dy,
+                                           UniformQuantizationInfo iq_info,
+                                           UniformQuantizationInfo oq_info)
 {
     ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
 
@@ -85,8 +89,12 @@
  *
  * @return The bilinear interpolated pixel value
  */
-inline int8_t delta_bilinear_c1_quantized(const int8_t *pixel_ptr, size_t stride, float dx, float dy,
-                                          UniformQuantizationInfo iq_info, UniformQuantizationInfo oq_info)
+inline int8_t delta_bilinear_c1_quantized(const int8_t           *pixel_ptr,
+                                          size_t                  stride,
+                                          float                   dx,
+                                          float                   dy,
+                                          UniformQuantizationInfo iq_info,
+                                          UniformQuantizationInfo oq_info)
 {
     ARM_COMPUTE_ERROR_ON(pixel_ptr == nullptr);
 
@@ -122,9 +130,8 @@
  *
  * @return The pixel at (x, y) using area interpolation.
  */
-inline uint8_t
-pixel_area_c1u8_clamp(const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr,
-                      float hr, int x, int y)
+inline uint8_t pixel_area_c1u8_clamp(
+    const uint8_t *first_pixel_ptr, size_t stride, size_t width, size_t height, float wr, float hr, int x, int y)
 {
     ARM_COMPUTE_ERROR_ON(first_pixel_ptr == nullptr);
 
@@ -159,7 +166,7 @@
 
     // Sum pixels in area
     int sum = 0;
-    for(int j = yi + y_from, je = yi + y_to; j <= je; ++j)
+    for (int j = yi + y_from, je = yi + y_to; j <= je; ++j)
     {
         const uint8_t *ptr = first_pixel_ptr + j * stride + xi + x_from;
         sum                = std::accumulate(ptr, ptr + x_elements, sum);
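
For orientation, the quantized helpers above follow the standard bilinear weighting; the float-domain sketch below shows only the weights, with the dequantize/requantize steps of the real helpers omitted:

    // dx, dy are fractional offsets in [0, 1) from the top-left neighbour.
    float delta_bilinear(float tl, float tr, float bl, float br, float dx, float dy)
    {
        const float dx1 = 1.f - dx;
        const float dy1 = 1.f - dy;
        return tl * dx1 * dy1  // top-left
             + tr * dx * dy1   // top-right
             + bl * dx1 * dy   // bottom-left
             + br * dx * dy;   // bottom-right
    }
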
diff --git a/src/core/helpers/SoftmaxHelpers.cpp b/src/core/helpers/SoftmaxHelpers.cpp
index 71b971a..8184991 100644
--- a/src/core/helpers/SoftmaxHelpers.cpp
+++ b/src/core/helpers/SoftmaxHelpers.cpp
@@ -29,7 +29,7 @@
 {
 PermutationVector get_permutation_vector_from_softmax_axis(size_t axis)
 {
-    switch(axis)
+    switch (axis)
     {
         case 1:
             return PermutationVector(1U, 0U, 2U, 3U);
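
The axis-1 permutation above is symmetric (it swaps dimensions 0 and 1), which moves the softmax axis to dimension 0 where the kernels reduce. A shape-level sketch, assuming the illustrative convention dst[i] = src[perm[i]]:

    #include <array>
    #include <cstddef>

    std::array<std::size_t, 4> permute_shape(const std::array<std::size_t, 4> &src,
                                             const std::array<std::size_t, 4> &perm)
    {
        std::array<std::size_t, 4> dst{};
        for (std::size_t i = 0; i < 4; ++i)
        {
            dst[i] = src[perm[i]];
        }
        return dst;
    }
    // permute_shape({C, W, H, N}, {1, 0, 2, 3}) == {W, C, H, N}
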
diff --git a/src/core/helpers/Utils.cpp b/src/core/helpers/Utils.cpp
index 3900475..6ca29d1 100644
--- a/src/core/helpers/Utils.cpp
+++ b/src/core/helpers/Utils.cpp
@@ -31,9 +31,9 @@
     const auto &strides        = info.strides_in_bytes();
     size_t      squashed_bytes = info.element_size();
 
-    for(size_t dim = 0; dim <= dimension; ++dim)
+    for (size_t dim = 0; dim <= dimension; ++dim)
     {
-        if(strides[dim] != squashed_bytes)
+        if (strides[dim] != squashed_bytes)
         {
             return true;
         }
diff --git a/src/core/helpers/Utils.h b/src/core/helpers/Utils.h
index 7ad960b..2e7224c 100644
--- a/src/core/helpers/Utils.h
+++ b/src/core/helpers/Utils.h
@@ -45,7 +45,7 @@
     // Create strides object
     Strides strides(stride_x, fixed_strides...);
 
-    for(size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
+    for (size_t i = 1 + sizeof...(Ts); i < info.num_dimensions(); ++i)
     {
         strides.set(i, shape[i - 1] * strides[i - 1]);
     }
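
The loop above packs strides densely: each dimension's byte stride is the previous dimension's extent times the previous stride. A self-contained equivalent (std::vector in place of the library's Strides), which is also the contiguity condition the strides check in helpers/Utils.cpp tests against:

    #include <cstddef>
    #include <vector>

    std::vector<std::size_t> dense_strides(const std::vector<std::size_t> &shape, std::size_t element_size)
    {
        std::vector<std::size_t> strides(shape.size(), element_size);
        for (std::size_t i = 1; i < shape.size(); ++i)
        {
            strides[i] = shape[i - 1] * strides[i - 1];
        }
        return strides;
    }
    // dense_strides({8, 4, 2}, 1) == {1, 8, 32}
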
diff --git a/src/core/helpers/WindowHelpers.cpp b/src/core/helpers/WindowHelpers.cpp
index a4d46db..30a55fc 100644
--- a/src/core/helpers/WindowHelpers.cpp
+++ b/src/core/helpers/WindowHelpers.cpp
@@ -25,9 +25,10 @@
 
 namespace arm_compute
 {
-Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+Window
+calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
 {
-    if(!skip_border)
+    if (!skip_border)
     {
         border_size = BorderSize(0);
     }
@@ -38,40 +39,47 @@
     Window window;
 
     window.set(0, Window::Dimension(
-                   // Skip the border left of the image
-                   anchor[0] + border_size.left,
-                   // Skip the border right of the image
-                   // Make sure the window width is a multiple of the step size
-                   anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
-                   steps[0]));
+                      // Skip the border left of the image
+                      anchor[0] + border_size.left,
+                      // Skip the border right of the image
+                      // Make sure the window width is a multiple of the step size
+                      anchor[0] + border_size.left +
+                          ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) -
+                                                           static_cast<int>(border_size.right)),
+                                           steps[0]),
+                      steps[0]));
 
     size_t n = 1;
 
-    if(anchor.num_dimensions() > 1)
+    if (anchor.num_dimensions() > 1)
     {
-        window.set(1, Window::Dimension(
+        window.set(1,
+                   Window::Dimension(
                        // Skip the border above the image
                        anchor[1] + border_size.top,
                        // Skip the border below the image
-                       anchor[1] + border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]),
+                       anchor[1] + border_size.top +
+                           ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) -
+                                                            static_cast<int>(border_size.bottom)),
+                                            steps[1]),
                        steps[1]));
 
         ++n;
     }
 
-    if(anchor.num_dimensions() > 2)
+    if (anchor.num_dimensions() > 2)
     {
         window.set(2, Window::Dimension(anchor[2], std::max<size_t>(1, shape[2]), steps[2]));
 
         ++n;
     }
 
-    for(; n < anchor.num_dimensions(); ++n)
+    for (; n < anchor.num_dimensions(); ++n)
     {
         window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
     }
 
-    for(; n < Coordinates::num_max_dimensions; ++n)
+    for (; n < Coordinates::num_max_dimensions; ++n)
     {
         window.set(n, Window::Dimension(0, 1));
     }
@@ -81,7 +89,7 @@
 
 Window calculate_max_window(const TensorShape &shape, const Steps &steps, bool skip_border, BorderSize border_size)
 {
-    if(!skip_border)
+    if (!skip_border)
     {
         border_size = BorderSize(0);
     }
@@ -89,40 +97,46 @@
     Window window;
 
     window.set(0, Window::Dimension(
-                   // Skip the border left of the image
-                   border_size.left,
-                   // Skip the border right of the image
-                   // Make sure the window width is a multiple of the step size
-                   border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
-                   steps[0]));
+                      // Skip the border left of the image
+                      border_size.left,
+                      // Skip the border right of the image
+                      // Make sure the window width is a multiple of the step size
+                      border_size.left +
+                          ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) -
+                                                           static_cast<int>(border_size.right)),
+                                           steps[0]),
+                      steps[0]));
 
     size_t n = 1;
 
-    if(shape.num_dimensions() > 1)
+    if (shape.num_dimensions() > 1)
     {
         window.set(1, Window::Dimension(
-                       // Skip the border above the image
-                       border_size.top,
-                       // Skip the border below the image
-                       border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) - static_cast<int>(border_size.top) - static_cast<int>(border_size.bottom)), steps[1]),
-                       steps[1]));
+                          // Skip the border above the image
+                          border_size.top,
+                          // Skip the border below the image
+                          border_size.top + ceil_to_multiple(std::max(0, static_cast<int>(shape[1]) -
+                                                                             static_cast<int>(border_size.top) -
+                                                                             static_cast<int>(border_size.bottom)),
+                                                             steps[1]),
+                          steps[1]));
 
         ++n;
     }
 
-    if(shape.num_dimensions() > 2)
+    if (shape.num_dimensions() > 2)
     {
         window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[2]), steps[2]));
 
         ++n;
     }
 
-    for(; n < shape.num_dimensions(); ++n)
+    for (; n < shape.num_dimensions(); ++n)
     {
         window.set(n, Window::Dimension(0, std::max<size_t>(1, shape[n])));
     }
 
-    for(; n < Coordinates::num_max_dimensions; ++n)
+    for (; n < Coordinates::num_max_dimensions; ++n)
     {
         window.set(n, Window::Dimension(0, 1));
     }
@@ -138,40 +152,42 @@
     Window window;
 
     window.set(0, Window::Dimension(
-                   // move the anchor to the start from the border
-                   anchor[0] - border_size.left,
-                   // move the anchor to include the right end border
-                   // Make sure the window width is a multiple of the step size
-                   anchor[0] - border_size.left + ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
-                   steps[0]));
+                      // move the anchor to the start from the border
+                      anchor[0] - border_size.left,
+                      // move the anchor to include the right end border
+                      // Make sure the window width is a multiple of the step size
+                      anchor[0] - border_size.left +
+                          ceil_to_multiple(shape[0] + border_size.left + border_size.right, steps[0]),
+                      steps[0]));
 
     size_t n = 1;
 
-    if(anchor.num_dimensions() > 1)
+    if (anchor.num_dimensions() > 1)
     {
         window.set(1, Window::Dimension(
-                       // Include the border above the image
-                       anchor[1] - border_size.top,
-                       // Include the border below the image
-                       anchor[1] - border_size.top + ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
-                       steps[1]));
+                          // Include the border above the image
+                          anchor[1] - border_size.top,
+                          // Include the border below the image
+                          anchor[1] - border_size.top +
+                              ceil_to_multiple(shape[1] + border_size.top + border_size.bottom, steps[1]),
+                          steps[1]));
 
         ++n;
     }
 
-    if(anchor.num_dimensions() > 2)
+    if (anchor.num_dimensions() > 2)
     {
         window.set(2, Window::Dimension(0, std::max<size_t>(1, shape[n]), steps[2]));
 
         ++n;
     }
 
-    for(; n < anchor.num_dimensions(); ++n)
+    for (; n < anchor.num_dimensions(); ++n)
     {
         window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
     }
 
-    for(; n < Coordinates::num_max_dimensions; ++n)
+    for (; n < Coordinates::num_max_dimensions; ++n)
     {
         window.set(n, Window::Dimension(0, 1));
     }
@@ -179,9 +195,12 @@
     return window;
 }
 
-Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
+Window calculate_max_window_horizontal(const ValidRegion &valid_region,
+                                       const Steps       &steps,
+                                       bool               skip_border,
+                                       BorderSize         border_size)
 {
-    if(skip_border)
+    if (skip_border)
     {
         border_size.top    = 0;
         border_size.bottom = 0;
@@ -198,33 +217,35 @@
     Window window;
 
     window.set(0, Window::Dimension(
-                   // Skip the border left of the image
-                   anchor[0] + border_size.left,
-                   // Skip the border right of the image
-                   // Make sure the window width is a multiple of the step size
-                   anchor[0] + border_size.left + ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) - static_cast<int>(border_size.right)), steps[0]),
-                   steps[0]));
+                      // Skip the border left of the image
+                      anchor[0] + border_size.left,
+                      // Skip the border right of the image
+                      // Make sure the window width is a multiple of the step size
+                      anchor[0] + border_size.left +
+                          ceil_to_multiple(std::max(0, static_cast<int>(shape[0]) - static_cast<int>(border_size.left) -
+                                                           static_cast<int>(border_size.right)),
+                                           steps[0]),
+                      steps[0]));
 
     size_t n = 1;
 
-    if(anchor.num_dimensions() > 1)
+    if (anchor.num_dimensions() > 1)
     {
         window.set(1, Window::Dimension(
-                       // Skip the border above the image
-                       anchor[1] - border_size.top,
-                       // Skip the border below the image
-                       anchor[1] + shape[1] + border_size.bottom,
-                       1));
+                          // Skip the border above the image
+                          anchor[1] - border_size.top,
+                          // Skip the border below the image
+                          anchor[1] + shape[1] + border_size.bottom, 1));
 
         ++n;
     }
 
-    for(; n < anchor.num_dimensions(); ++n)
+    for (; n < anchor.num_dimensions(); ++n)
     {
         window.set(n, Window::Dimension(anchor[n], std::max<size_t>(1, shape[n])));
     }
 
-    for(; n < Coordinates::num_max_dimensions; ++n)
+    for (; n < Coordinates::num_max_dimensions; ++n)
     {
         window.set(n, Window::Dimension(0, 1));
     }
@@ -247,9 +268,9 @@
     size_t squashed_bytes = src0.element_size();
 
     // Try to squash the low dimensions together.
-    for(; dim < num_dimensions; ++dim)
+    for (; dim < num_dimensions; ++dim)
     {
-        if(shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes)
+        if (shape0[dim] != shape1[dim] || strides0[dim] != squashed_bytes || strides1[dim] != squashed_bytes)
         {
             break;
         }
@@ -257,7 +278,7 @@
         squashed_bytes *= shape0[dim];
     }
 
-    if(dim == num_dimensions)
+    if (dim == num_dimensions)
     {
         auto squashed_elements = squashed_bytes / src0.element_size();
 
@@ -266,7 +287,7 @@
         // The input tensors can be interpreted as 1D array.
         win.set(0, Window::Dimension(0, squashed_elements, 1));
 
-        for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
+        for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
         {
             win.set(dim, Window::Dimension(0, 1, 1));
         }
@@ -274,7 +295,7 @@
     else
     {
         // Generates the max window.
-        for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
+        for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
         {
             win.set(dim, Window::Dimension(0, std::max(shape0[dim], shape1[dim]), 1));
         }
@@ -295,21 +316,21 @@
     size_t squashed_bytes  = src.element_size();
 
     // Try to squash the low dimensions together.
-    for(; dim < num_dimensions; ++dim)
+    for (; dim < num_dimensions; ++dim)
     {
-        if(strides[dim] != squashed_bytes)
+        if (strides[dim] != squashed_bytes)
         {
             break;
         }
         squashed_bytes *= shape[dim];
     }
-    if(dim == num_dimensions)
+    if (dim == num_dimensions)
     {
         const auto squashed_elements = squashed_bytes / src.element_size();
         split_dimension              = Window::DimX;
         // The input tensor can be interpreted as 1D array.
         win.set(0, Window::Dimension(0, squashed_elements, 1));
-        for(dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
+        for (dim = 1; dim < Coordinates::num_max_dimensions; ++dim)
         {
             win.set(dim, Window::Dimension(0, 1, 1));
         }
@@ -317,7 +338,7 @@
     else
     {
         // Generate the max window.
-        for(dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
+        for (dim = 0; dim < Coordinates::num_max_dimensions; ++dim)
         {
             win.set(dim, Window::Dimension(0, shape[dim], 1));
         }
diff --git a/src/core/helpers/WindowHelpers.h b/src/core/helpers/WindowHelpers.h
index eccf7f2..e404c18 100644
--- a/src/core/helpers/WindowHelpers.h
+++ b/src/core/helpers/WindowHelpers.h
@@ -43,21 +43,13 @@
  *         influence the returned value.
  */
 template <typename... Ts>
-bool update_window_and_padding(Window &win, Ts &&... patterns)
+bool update_window_and_padding(Window &win, Ts &&...patterns)
 {
     bool window_changed = false;
 
-    utility::for_each([&](const IAccessWindow & w)
-    {
-        window_changed |= w.update_window_if_needed(win);
-    },
-    patterns...);
+    utility::for_each([&](const IAccessWindow &w) { window_changed |= w.update_window_if_needed(win); }, patterns...);
 
-    utility::for_each([&](IAccessWindow & w)
-    {
-        w.update_padding_if_needed(win);
-    },
-    patterns...);
+    utility::for_each([&](IAccessWindow &w) { w.update_padding_if_needed(win); }, patterns...);
 
     return window_changed;
 }
@@ -69,18 +61,18 @@
  * @return Intersection of all regions.
  */
 template <typename... Ts>
-ValidRegion intersect_valid_regions(const Ts &... regions)
+ValidRegion intersect_valid_regions(const Ts &...regions)
 {
-    auto intersect = [](const ValidRegion & r1, const ValidRegion & r2) -> ValidRegion
+    auto intersect = [](const ValidRegion &r1, const ValidRegion &r2) -> ValidRegion
     {
         ValidRegion region;
 
-        for(size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d)
+        for (size_t d = 0; d < std::min(r1.anchor.num_dimensions(), r2.anchor.num_dimensions()); ++d)
         {
             region.anchor.set(d, std::max(r1.anchor[d], r2.anchor[d]));
         }
 
-        for(size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d)
+        for (size_t d = 0; d < std::min(r1.shape.num_dimensions(), r2.shape.num_dimensions()); ++d)
         {
             region.shape.set(d, std::min(r1.shape[d], r2.shape[d]));
         }
@@ -101,7 +93,10 @@
  *
  * @return The maximum window the kernel can be executed on.
  */
-Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window(const ValidRegion &valid_region,
+                            const Steps       &steps       = Steps(),
+                            bool               skip_border = false,
+                            BorderSize         border_size = BorderSize());
 
 /** Calculate the maximum window for a given tensor shape and border setting
  *
@@ -112,7 +107,10 @@
  *
  * @return The maximum window the kernel can be executed on.
  */
-Window calculate_max_window(const TensorShape &shape, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window(const TensorShape &shape,
+                            const Steps       &steps       = Steps(),
+                            bool               skip_border = false,
+                            BorderSize         border_size = BorderSize());
 
 /** Calculate the maximum window for a given tensor shape and border setting
  *
@@ -123,7 +121,10 @@
  *
  * @return The maximum window the kernel can be executed on.
  */
-inline Window calculate_max_window(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
+inline Window calculate_max_window(const ITensorInfo &info,
+                                   const Steps       &steps       = Steps(),
+                                   bool               skip_border = false,
+                                   BorderSize         border_size = BorderSize())
 {
     return calculate_max_window(info.tensor_shape(), steps, skip_border, border_size);
 }
@@ -137,7 +138,10 @@
  *
  * @return The maximum window the kernel can be executed on.
  */
-Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize());
+Window calculate_max_window_horizontal(const ValidRegion &valid_region,
+                                       const Steps       &steps       = Steps(),
+                                       bool               skip_border = false,
+                                       BorderSize         border_size = BorderSize());
 
 /** Calculate the maximum window used by a horizontal kernel for a given tensor shape and border setting
  *
@@ -148,7 +152,10 @@
  *
  * @return The maximum window the kernel can be executed on.
  */
-inline Window calculate_max_window_horizontal(const ITensorInfo &info, const Steps &steps = Steps(), bool skip_border = false, BorderSize border_size = BorderSize())
+inline Window calculate_max_window_horizontal(const ITensorInfo &info,
+                                              const Steps       &steps       = Steps(),
+                                              bool               skip_border = false,
+                                              BorderSize         border_size = BorderSize())
 {
     return calculate_max_window_horizontal(info.valid_region(), steps, skip_border, border_size);
 }
@@ -161,7 +168,9 @@
  *
  * @return The maximum window the kernel can be executed on.
  */
-Window calculate_max_enlarged_window(const ValidRegion &valid_region, const Steps &steps = Steps(), BorderSize border_size = BorderSize());
+Window calculate_max_enlarged_window(const ValidRegion &valid_region,
+                                     const Steps       &steps       = Steps(),
+                                     BorderSize         border_size = BorderSize());
 
 /** Calculate the maximum window for a given tensor shape and border setting. The window will also include the border.
  *
@@ -171,7 +180,9 @@
  *
  * @return The maximum window the kernel can be executed on.
  */
-inline Window calculate_max_enlarged_window(const ITensorInfo &info, const Steps &steps = Steps(), BorderSize border_size = BorderSize())
+inline Window calculate_max_enlarged_window(const ITensorInfo &info,
+                                            const Steps       &steps       = Steps(),
+                                            BorderSize         border_size = BorderSize())
 {
     return calculate_max_enlarged_window(info.valid_region(), steps, border_size);
 }
@@ -208,7 +219,7 @@
  * @return A pair of the shape and window
  */
 template <typename... Shapes>
-std::pair<TensorShape, Window> compute_output_shape_and_window(const Shapes &... shapes)
+std::pair<TensorShape, Window> compute_output_shape_and_window(const Shapes &...shapes)
 {
     const TensorShape out_shape = TensorShape::broadcast_shape(shapes...);
     return std::make_pair(out_shape, calculate_max_window(out_shape));
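
For reference, a minimal usage sketch of the broadcast helpers reformatted above (illustrative only; the shape values and the free-standing wrapper function are assumptions, not part of this patch):

    #include <tuple>

    #include "src/core/helpers/WindowHelpers.h"

    void broadcast_window_sketch()
    {
        using namespace arm_compute;
        // Broadcast two shapes and derive the execution window in one call.
        TensorShape shape_a(16U, 4U);
        TensorShape shape_b(16U, 1U); // broadcasts along dimension 1
        TensorShape out_shape{};
        Window      win{};
        std::tie(out_shape, win) = compute_output_shape_and_window(shape_a, shape_b);
        // out_shape is (16, 4); win spans [0, 16) x [0, 4) with unit steps.
    }
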
diff --git a/src/core/utils/ActivationFunctionUtils.cpp b/src/core/utils/ActivationFunctionUtils.cpp
index 4854b8e..017170a 100644
--- a/src/core/utils/ActivationFunctionUtils.cpp
+++ b/src/core/utils/ActivationFunctionUtils.cpp
@@ -28,26 +28,24 @@
 
 namespace arm_compute
 {
-const std::string &string_from_activation_func(const ActivationFunction& act)
+const std::string &string_from_activation_func(const ActivationFunction &act)
 {
-    static std::map<ActivationFunction, const std::string> act_map =
-    {
-        { ActivationFunction::ABS, "ABS" },
-        { ActivationFunction::LINEAR, "LINEAR" },
-        { ActivationFunction::LOGISTIC, "LOGISTIC" },
-        { ActivationFunction::RELU, "RELU" },
-        { ActivationFunction::BOUNDED_RELU, "BRELU" },
-        { ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU" },
-        { ActivationFunction::LEAKY_RELU, "LRELU" },
-        { ActivationFunction::SOFT_RELU, "SRELU" },
-        { ActivationFunction::ELU, "ELU" },
-        { ActivationFunction::SQRT, "SQRT" },
-        { ActivationFunction::SQUARE, "SQUARE" },
-        { ActivationFunction::TANH, "TANH" },
-        { ActivationFunction::IDENTITY, "IDENTITY" },
-        { ActivationFunction::HARD_SWISH, "HARD_SWISH" },
-        { ActivationFunction::SWISH, "SWISH" },
-        { ActivationFunction::GELU, "GELU" }
+    static std::map<ActivationFunction, const std::string> act_map = {{ActivationFunction::ABS, "ABS"},
+                                                                      {ActivationFunction::LINEAR, "LINEAR"},
+                                                                      {ActivationFunction::LOGISTIC, "LOGISTIC"},
+                                                                      {ActivationFunction::RELU, "RELU"},
+                                                                      {ActivationFunction::BOUNDED_RELU, "BRELU"},
+                                                                      {ActivationFunction::LU_BOUNDED_RELU, "LU_BRELU"},
+                                                                      {ActivationFunction::LEAKY_RELU, "LRELU"},
+                                                                      {ActivationFunction::SOFT_RELU, "SRELU"},
+                                                                      {ActivationFunction::ELU, "ELU"},
+                                                                      {ActivationFunction::SQRT, "SQRT"},
+                                                                      {ActivationFunction::SQUARE, "SQUARE"},
+                                                                      {ActivationFunction::TANH, "TANH"},
+                                                                      {ActivationFunction::IDENTITY, "IDENTITY"},
+                                                                      {ActivationFunction::HARD_SWISH, "HARD_SWISH"},
+                                                                      {ActivationFunction::SWISH, "SWISH"},
+                                                                      {ActivationFunction::GELU, "GELU"}
 
     };
 
diff --git a/src/core/utils/AssemblyUtils.cpp b/src/core/utils/AssemblyUtils.cpp
index 6d483ad..d97ea42 100644
--- a/src/core/utils/AssemblyUtils.cpp
+++ b/src/core/utils/AssemblyUtils.cpp
@@ -34,12 +34,12 @@
     arm_gemm::Activation gemm_act;
 
     // Early exit in case lower bound is other than 0, as it's not yet supported
-    if(act.b() != 0.f)
+    if (act.b() != 0.f)
     {
         return gemm_act;
     }
 
-    switch(act.activation())
+    switch (act.activation())
     {
         case ActivationLayerInfo::ActivationFunction::RELU:
             gemm_act.type = arm_gemm::Activation::Type::ReLU;
@@ -63,17 +63,15 @@
 
 arm_conv::PaddingValues map_to_arm_conv_padding(const PadStrideInfo &pad_stride_info)
 {
-    return arm_conv::PaddingValues{ pad_stride_info.pad_left(),
-                                    pad_stride_info.pad_top(),
-                                    pad_stride_info.pad_right(),
-                                    pad_stride_info.pad_bottom() };
+    return arm_conv::PaddingValues{pad_stride_info.pad_left(), pad_stride_info.pad_top(), pad_stride_info.pad_right(),
+                                   pad_stride_info.pad_bottom()};
 }
 
 arm_gemm::WeightFormat map_to_arm_gemm_weight_format(const arm_compute::WeightFormat &weight_format)
 {
     arm_gemm::WeightFormat gemm_weight_fromat;
 
-    switch(weight_format)
+    switch (weight_format)
     {
         case arm_compute::WeightFormat::UNSPECIFIED:
             gemm_weight_fromat = arm_gemm::WeightFormat::UNSPECIFIED;
@@ -193,7 +191,7 @@
 {
     arm_compute::WeightFormat acl_weight_fromat;
 
-    switch(weight_format)
+    switch (weight_format)
     {
         case arm_gemm::WeightFormat::UNSPECIFIED:
             acl_weight_fromat = arm_compute::WeightFormat::UNSPECIFIED;
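
A short sketch of what these mapping helpers do (illustrative; only the ReLU case visible above is relied on, the wrapper function is assumed):

    #include "src/core/utils/AssemblyUtils.h"

    void activation_mapping_sketch()
    {
        using namespace arm_compute;
        ActivationLayerInfo  act(ActivationLayerInfo::ActivationFunction::RELU);
        arm_gemm::Activation gemm_act = assembly_utils::map_to_arm_gemm_activation(act);
        // gemm_act.type == arm_gemm::Activation::Type::ReLU; an activation with
        // a non-zero lower bound b() takes the early exit and stays unmapped.
        (void)gemm_act;
    }
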
diff --git a/src/core/utils/AssemblyUtils.h b/src/core/utils/AssemblyUtils.h
index 60bad3b..7d0d37c 100644
--- a/src/core/utils/AssemblyUtils.h
+++ b/src/core/utils/AssemblyUtils.h
@@ -25,6 +25,7 @@
 #define UTILS_CORE_ASSEMBLY_UTILS_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/kernels/assembly/common.hpp"
 #include "src/cpu/kernels/assembly/arm_gemm.hpp"
 
@@ -65,6 +66,6 @@
  * @return Compute Library WeightFormat
  */
 arm_compute::WeightFormat map_to_arm_compute_weight_format(const arm_gemm::WeightFormat &weight_format);
-} // namespace assembly
+} // namespace assembly_utils
 } // namespace arm_compute
 #endif /* UTILS_CORE_ASSEMBLY_UTILS_H */
diff --git a/src/core/utils/DataLayoutUtils.cpp b/src/core/utils/DataLayoutUtils.cpp
index 4919b79..234bed7 100644
--- a/src/core/utils/DataLayoutUtils.cpp
+++ b/src/core/utils/DataLayoutUtils.cpp
@@ -29,11 +29,10 @@
 
 const std::string &string_from_data_layout(DataLayout dl)
 {
-    static std::map<DataLayout, const std::string> dl_map =
-    {
-        { DataLayout::UNKNOWN, "UNKNOWN" },
-        { DataLayout::NCHW, "NCHW" },
-        { DataLayout::NHWC, "NHWC" },
+    static std::map<DataLayout, const std::string> dl_map = {
+        {DataLayout::UNKNOWN, "UNKNOWN"},
+        {DataLayout::NCHW, "NCHW"},
+        {DataLayout::NHWC, "NHWC"},
     };
 
     return dl_map[dl];
diff --git a/src/core/utils/DataTypeUtils.cpp b/src/core/utils/DataTypeUtils.cpp
index 0799935..1394339 100644
--- a/src/core/utils/DataTypeUtils.cpp
+++ b/src/core/utils/DataTypeUtils.cpp
@@ -30,27 +30,26 @@
 {
 const std::string &string_from_data_type(DataType dt)
 {
-    static std::map<DataType, const std::string> dt_map =
-    {
-        { DataType::UNKNOWN, "UNKNOWN" },
-        { DataType::S8, "S8" },
-        { DataType::U8, "U8" },
-        { DataType::S16, "S16" },
-        { DataType::U16, "U16" },
-        { DataType::S32, "S32" },
-        { DataType::U32, "U32" },
-        { DataType::S64, "S64" },
-        { DataType::U64, "U64" },
-        { DataType::F16, "F16" },
-        { DataType::F32, "F32" },
-        { DataType::F64, "F64" },
-        { DataType::SIZET, "SIZET" },
-        { DataType::QSYMM8, "QSYMM8" },
-        { DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" },
-        { DataType::QASYMM8, "QASYMM8" },
-        { DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED" },
-        { DataType::QSYMM16, "QSYMM16" },
-        { DataType::QASYMM16, "QASYMM16" },
+    static std::map<DataType, const std::string> dt_map = {
+        {DataType::UNKNOWN, "UNKNOWN"},
+        {DataType::S8, "S8"},
+        {DataType::U8, "U8"},
+        {DataType::S16, "S16"},
+        {DataType::U16, "U16"},
+        {DataType::S32, "S32"},
+        {DataType::U32, "U32"},
+        {DataType::S64, "S64"},
+        {DataType::U64, "U64"},
+        {DataType::F16, "F16"},
+        {DataType::F32, "F32"},
+        {DataType::F64, "F64"},
+        {DataType::SIZET, "SIZET"},
+        {DataType::QSYMM8, "QSYMM8"},
+        {DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL"},
+        {DataType::QASYMM8, "QASYMM8"},
+        {DataType::QASYMM8_SIGNED, "QASYMM8_SIGNED"},
+        {DataType::QSYMM16, "QSYMM16"},
+        {DataType::QASYMM16, "QASYMM16"},
     };
 
     return dt_map[dt];
@@ -58,12 +57,11 @@
 
 DataType data_type_from_name(const std::string &name)
 {
-    static const std::map<std::string, DataType> data_types =
-    {
-        { "f16", DataType::F16 },
-        { "f32", DataType::F32 },
-        { "qasymm8", DataType::QASYMM8 },
-        { "qasymm8_signed", DataType::QASYMM8_SIGNED },
+    static const std::map<std::string, DataType> data_types = {
+        {"f16", DataType::F16},
+        {"f32", DataType::F32},
+        {"qasymm8", DataType::QASYMM8},
+        {"qasymm8_signed", DataType::QASYMM8_SIGNED},
     };
 
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -74,7 +72,7 @@
 
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
-    catch(const std::out_of_range &)
+    catch (const std::out_of_range &)
     {
         ARM_COMPUTE_ERROR_VAR("Invalid data type name: %s", name.c_str());
     }
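
Usage sketch for the lookup above (illustrative; the header path is an assumption):

    #include "arm_compute/core/utils/DataTypeUtils.h"

    void data_type_lookup_sketch()
    {
        using namespace arm_compute;
        const DataType dt = data_type_from_name("qasymm8_signed");
        // dt == DataType::QASYMM8_SIGNED; an unknown name throws
        // std::out_of_range, which the handler above reports via
        // ARM_COMPUTE_ERROR_VAR.
        (void)dt;
    }
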
diff --git a/src/core/utils/FormatUtils.cpp b/src/core/utils/FormatUtils.cpp
index 05b649d..46f8455 100644
--- a/src/core/utils/FormatUtils.cpp
+++ b/src/core/utils/FormatUtils.cpp
@@ -30,26 +30,16 @@
 {
 const std::string &string_from_format(Format format)
 {
-    static std::map<Format, const std::string> formats_map =
-    {
-        { Format::UNKNOWN, "UNKNOWN" },
-        { Format::U8, "U8" },
-        { Format::S16, "S16" },
-        { Format::U16, "U16" },
-        { Format::S32, "S32" },
-        { Format::U32, "U32" },
-        { Format::F16, "F16" },
-        { Format::F32, "F32" },
-        { Format::UV88, "UV88" },
-        { Format::RGB888, "RGB888" },
-        { Format::RGBA8888, "RGBA8888" },
-        { Format::YUV444, "YUV444" },
-        { Format::YUYV422, "YUYV422" },
-        { Format::NV12, "NV12" },
-        { Format::NV21, "NV21" },
-        { Format::IYUV, "IYUV" },
-        { Format::UYVY422, "UYVY422" }
-    };
+    static std::map<Format, const std::string> formats_map = {
+        {Format::UNKNOWN, "UNKNOWN"},   {Format::U8, "U8"},
+        {Format::S16, "S16"},           {Format::U16, "U16"},
+        {Format::S32, "S32"},           {Format::U32, "U32"},
+        {Format::F16, "F16"},           {Format::F32, "F32"},
+        {Format::UV88, "UV88"},         {Format::RGB888, "RGB888"},
+        {Format::RGBA8888, "RGBA8888"}, {Format::YUV444, "YUV444"},
+        {Format::YUYV422, "YUYV422"},   {Format::NV12, "NV12"},
+        {Format::NV21, "NV21"},         {Format::IYUV, "IYUV"},
+        {Format::UYVY422, "UYVY422"}};
 
     return formats_map[format];
 }
diff --git a/src/core/utils/InterpolationPolicyUtils.cpp b/src/core/utils/InterpolationPolicyUtils.cpp
index 2d6cabe..276e760 100644
--- a/src/core/utils/InterpolationPolicyUtils.cpp
+++ b/src/core/utils/InterpolationPolicyUtils.cpp
@@ -29,11 +29,10 @@
 
 const std::string &string_from_interpolation_policy(InterpolationPolicy policy)
 {
-    static std::map<InterpolationPolicy, const std::string> interpolation_policy_map =
-    {
-        { InterpolationPolicy::AREA, "AREA" },
-        { InterpolationPolicy::BILINEAR, "BILINEAR" },
-        { InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR" },
+    static std::map<InterpolationPolicy, const std::string> interpolation_policy_map = {
+        {InterpolationPolicy::AREA, "AREA"},
+        {InterpolationPolicy::BILINEAR, "BILINEAR"},
+        {InterpolationPolicy::NEAREST_NEIGHBOR, "NEAREST_NEIGHBOUR"},
     };
 
     return interpolation_policy_map[policy];
diff --git a/src/core/utils/ScaleUtils.cpp b/src/core/utils/ScaleUtils.cpp
index ee57a8e..a92da39 100644
--- a/src/core/utils/ScaleUtils.cpp
+++ b/src/core/utils/ScaleUtils.cpp
@@ -23,11 +23,12 @@
  */
 
 #include "src/core/utils/ScaleUtils.h"
-#include "src/common/cpuinfo/CpuIsaInfo.h"
 
 #include "arm_compute/core/CPP/CPPTypes.h"
 #include "arm_compute/core/TensorInfo.h"
 
+#include "src/common/cpuinfo/CpuIsaInfo.h"
+
 float arm_compute::scale_utils::calculate_resize_ratio(size_t input_size, size_t output_size, bool align_corners)
 {
     const size_t offset = (align_corners && output_size > 1) ? 1 : 0;
@@ -40,13 +41,15 @@
     return static_cast<float>(in) / static_cast<float>(out);
 }
 
-bool arm_compute::scale_utils::is_precomputation_required(DataLayout data_layout, DataType data_type,
-                                                          InterpolationPolicy policy, BorderMode border_mode)
+bool arm_compute::scale_utils::is_precomputation_required(DataLayout          data_layout,
+                                                          DataType            data_type,
+                                                          InterpolationPolicy policy,
+                                                          BorderMode          border_mode)
 {
     // Do not calculate precomputed weights and indices if kernel code doesn't use them
-    if(data_layout == DataLayout::NHWC)
+    if (data_layout == DataLayout::NHWC)
     {
-        switch(data_type)
+        switch (data_type)
         {
             case DataType::F32:
             case DataType::F16:
@@ -62,4 +65,4 @@
     }
 
     return true;
-}
\ No newline at end of file
+}
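
A worked example for calculate_resize_ratio (the input/output sizes are assumed values):

    #include "src/core/utils/ScaleUtils.h"

    void resize_ratio_sketch()
    {
        // align_corners maps the corner pixels onto each other, so both
        // extents lose one step: (10 - 1) / (19 - 1) = 0.5.
        const float ratio = arm_compute::scale_utils::calculate_resize_ratio(10, 19, true);
        (void)ratio; // 0.5f source pixels per output pixel
    }
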
diff --git a/src/core/utils/ScaleUtils.h b/src/core/utils/ScaleUtils.h
index 1484824..d8dddc8 100644
--- a/src/core/utils/ScaleUtils.h
+++ b/src/core/utils/ScaleUtils.h
@@ -60,8 +60,11 @@
  *
  * @return True if precomputation is required
  */
-bool is_precomputation_required(DataLayout data_layout, DataType data_type, InterpolationPolicy policy, BorderMode border_mode);
+bool is_precomputation_required(DataLayout          data_layout,
+                                DataType            data_type,
+                                InterpolationPolicy policy,
+                                BorderMode          border_mode);
 
 } // namespace scale_utils
 } // namespace arm_compute
-#endif /* UTILS_CORE_SCALEUTILS_H */
\ No newline at end of file
+#endif /* UTILS_CORE_SCALEUTILS_H */
diff --git a/src/core/utils/StringUtils.cpp b/src/core/utils/StringUtils.cpp
index 6d05c9b..bcab0ce 100644
--- a/src/core/utils/StringUtils.cpp
+++ b/src/core/utils/StringUtils.cpp
@@ -55,7 +55,7 @@
     ss.precision(std::numeric_limits<float>::max_digits10);
     ss << val;
 
-    if(val != static_cast<int>(val))
+    if (val != static_cast<int>(val))
     {
         ss << "f";
     }
@@ -65,17 +65,11 @@
 
 std::string join(const std::vector<std::string> strings, const std::string &sep)
 {
-    if(strings.empty())
+    if (strings.empty())
     {
         return "";
     }
-    return std::accumulate(
-               std::next(strings.begin()),
-               strings.end(),
-               strings.at(0),
-               [&sep](const std::string & a, const std::string & b)
-    {
-        return a + sep + b;
-    });
+    return std::accumulate(std::next(strings.begin()), strings.end(), strings.at(0),
+                           [&sep](const std::string &a, const std::string &b) { return a + sep + b; });
 }
-}
+} // namespace arm_compute
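
Behaviour sketch for the condensed join() above (illustrative; the header path is an assumption):

    #include <string>
    #include <vector>

    #include "arm_compute/core/utils/StringUtils.h"

    void join_sketch()
    {
        const std::string s = arm_compute::join({"CORE", "RUNTIME", "GRAPH"}, ", ");
        // s == "CORE, RUNTIME, GRAPH"; the separator goes between elements
        // only, and an empty vector yields "".
        (void)s;
    }
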
diff --git a/src/core/utils/helpers/fft.cpp b/src/core/utils/helpers/fft.cpp
index 64633c6..edc8d0e 100644
--- a/src/core/utils/helpers/fft.cpp
+++ b/src/core/utils/helpers/fft.cpp
@@ -37,7 +37,7 @@
     unsigned int              res = N;
 
     // Early exit if no supported factors are provided
-    if(supported_factors.empty())
+    if (supported_factors.empty())
     {
         return stages;
     }
@@ -46,10 +46,10 @@
     auto rfactor_it = supported_factors.rbegin();
 
     // Decomposition step
-    while(res != 0)
+    while (res != 0)
     {
         const unsigned int factor = *rfactor_it;
-        if(0 == (res % factor) && res >= factor)
+        if (0 == (res % factor) && res >= factor)
         {
             stages.push_back(factor);
             res /= factor;
@@ -57,9 +57,9 @@
         else
         {
             ++rfactor_it;
-            if(rfactor_it == supported_factors.rend())
+            if (rfactor_it == supported_factors.rend())
             {
-                if(res > 1)
+                if (res > 1)
                 {
                     // Couldn't decompose with given factors
                     stages.clear();
@@ -81,8 +81,9 @@
     std::vector<unsigned int> idx_digit_reverse;
 
     // Early exit in case N and fft stages do not match
-    const float stages_prod = std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies<unsigned int>());
-    if(stages_prod != N)
+    const float stages_prod =
+        std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies<unsigned int>());
+    if (stages_prod != N)
     {
         return idx_digit_reverse;
     }
@@ -94,13 +95,13 @@
     unsigned int n_stages = fft_stages.size();
 
     // Scan elements
-    for(unsigned int n = 0; n < N; ++n)
+    for (unsigned int n = 0; n < N; ++n)
     {
         unsigned int k  = n;
         unsigned int Nx = fft_stages[0];
 
         // Scan stages
-        for(unsigned int s = 1; s < n_stages; ++s)
+        for (unsigned int s = 1; s < n_stages; ++s)
         {
             // radix of stage i-th
             unsigned int Ny = fft_stages[s];
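
A sketch of the greedy radix decomposition above (the decompose_stages name and header path are taken from src/core/utils/helpers/fft.h and assumed unchanged by this patch):

    #include <set>

    #include "src/core/utils/helpers/fft.h"

    void fft_decompose_sketch()
    {
        const std::set<unsigned int> factors{2, 3, 4, 5, 7, 8};
        // Largest radix first: 120 = 8 * 5 * 3, so stages == {8, 5, 3}.
        const auto stages = arm_compute::helpers::fft::decompose_stages(120, factors);
        // An N that cannot be decomposed with the given factors yields {}.
        (void)stages;
    }
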
diff --git a/src/core/utils/helpers/float_ops.h b/src/core/utils/helpers/float_ops.h
index 99e1ea5..7f7fbd1 100644
--- a/src/core/utils/helpers/float_ops.h
+++ b/src/core/utils/helpers/float_ops.h
@@ -39,8 +39,7 @@
      *
      * @param[in] val Floating-point value
      */
-    explicit RawFloat(float val)
-        : f32(val)
+    explicit RawFloat(float val) : f32(val)
     {
     }
     /** Extract sign of floating point number
diff --git a/src/core/utils/helpers/tensor_info.h b/src/core/utils/helpers/tensor_info.h
index 9279532..fd4745a 100644
--- a/src/core/utils/helpers/tensor_info.h
+++ b/src/core/utils/helpers/tensor_info.h
@@ -41,15 +41,17 @@
  * @return True if tensors have mismatching quantization info else false.
  */
 template <typename... Ts>
-inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1, const ITensorInfo *tensor_info_2, Ts... tensor_infos)
+inline bool tensors_have_different_quantization_info(const ITensorInfo *tensor_info_1,
+                                                     const ITensorInfo *tensor_info_2,
+                                                     Ts... tensor_infos)
 {
     const QuantizationInfo first_quantization_info = tensor_info_1->quantization_info();
 
-    const std::array < const ITensorInfo *, 1 + sizeof...(Ts) > tensor_infos_array{ { tensor_info_2, std::forward<Ts>(tensor_infos)... } };
-    return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(), [&](const ITensorInfo * tensor_info)
-    {
-        return tensor_info->quantization_info() != first_quantization_info;
-    });
+    const std::array<const ITensorInfo *, 1 + sizeof...(Ts)> tensor_infos_array{
+        {tensor_info_2, std::forward<Ts>(tensor_infos)...}};
+    return std::any_of(tensor_infos_array.begin(), tensor_infos_array.end(),
+                       [&](const ITensorInfo *tensor_info)
+                       { return tensor_info->quantization_info() != first_quantization_info; });
 }
 } // namespace tensor_info
 } // namespace helpers
diff --git a/src/core/utils/helpers/tensor_transform.cpp b/src/core/utils/helpers/tensor_transform.cpp
index f221699..19d0bad 100644
--- a/src/core/utils/helpers/tensor_transform.cpp
+++ b/src/core/utils/helpers/tensor_transform.cpp
@@ -36,10 +36,11 @@
     return index >= static_cast<int>(strides.num_dimensions()) ? 1 : strides[index];
 }
 
-int calculate_start_on_index(TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask)
+int calculate_start_on_index(
+    TensorShape input_shape, int index, Coordinates starts, Coordinates strides, int32_t begin_mask)
 {
     // Early exit
-    if(index >= static_cast<int>(starts.num_dimensions()))
+    if (index >= static_cast<int>(starts.num_dimensions()))
     {
         return 0;
     }
@@ -51,14 +52,14 @@
     int start = starts[index];
 
     // Reset in case of begin mask present
-    if(arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index))
+    if (arm_compute::helpers::bit_ops::is_bit_set(begin_mask, index))
     {
         start = stride > 0 ? std::numeric_limits<int>::lowest() : std::numeric_limits<int>::max();
     }
 
     // Account for negative start points
     const int dim_size = input_shape[index];
-    if(start < 0)
+    if (start < 0)
     {
         start += dim_size;
     }
@@ -69,12 +70,16 @@
     return start;
 }
 
-int calculate_end_on_index(TensorShape input_shape, int index, int start_on_index,
-                           Coordinates ends, Coordinates strides,
-                           int32_t end_mask, int32_t shrink_axis_mask)
+int calculate_end_on_index(TensorShape input_shape,
+                           int         index,
+                           int         start_on_index,
+                           Coordinates ends,
+                           Coordinates strides,
+                           int32_t     end_mask,
+                           int32_t     shrink_axis_mask)
 {
     // Early exit
-    if(index >= static_cast<int>(ends.num_dimensions()))
+    if (index >= static_cast<int>(ends.num_dimensions()))
     {
         return input_shape[index];
     }
@@ -86,9 +91,9 @@
     int stop = ends[index];
 
     // Shrink dimension
-    if(shrink_axis)
+    if (shrink_axis)
     {
-        if(start_on_index == std::numeric_limits<int>::max())
+        if (start_on_index == std::numeric_limits<int>::max())
         {
             stop = start_on_index;
         }
@@ -99,14 +104,14 @@
     }
 
     // Reset in case of end mask present
-    if(arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis)
+    if (arm_compute::helpers::bit_ops::is_bit_set(end_mask, index) && !shrink_axis)
     {
         stop = (stride > 0) ? std::numeric_limits<int>::max() : std::numeric_limits<int>::lowest();
     }
 
     // Account for negative end points
     const int dim_size = input_shape[index];
-    if(stop < 0)
+    if (stop < 0)
     {
         stop += dim_size;
     }
@@ -118,14 +123,18 @@
 }
 
 std::tuple<Coordinates, Coordinates, Coordinates> calculate_strided_slice_coords(TensorShape input_shape,
-                                                                                 Coordinates starts, Coordinates ends, Coordinates strides,
-                                                                                 int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+                                                                                 Coordinates starts,
+                                                                                 Coordinates ends,
+                                                                                 Coordinates strides,
+                                                                                 int32_t     begin_mask,
+                                                                                 int32_t     end_mask,
+                                                                                 int32_t     shrink_axis_mask)
 {
     Coordinates starts_abs{};
     Coordinates ends_abs{};
     Coordinates final_strides{};
 
-    for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
     {
         const int start_i = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
         starts_abs.set(i, start_i);
@@ -136,13 +145,19 @@
     return std::make_tuple(starts_abs, ends_abs, final_strides);
 }
 
-TensorShape compute_strided_slice_output_shape(TensorShape input_shape, Coordinates starts, Coordinates ends, Coordinates strides,
-                                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask, bool return_unshrinked)
+TensorShape compute_strided_slice_output_shape(TensorShape input_shape,
+                                               Coordinates starts,
+                                               Coordinates ends,
+                                               Coordinates strides,
+                                               int32_t     begin_mask,
+                                               int32_t     end_mask,
+                                               int32_t     shrink_axis_mask,
+                                               bool        return_unshrinked)
 {
     unsigned int index = 0;
 
     TensorShape output_shape;
-    for(unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < input_shape.num_dimensions(); ++i)
     {
         const int stride = calculate_stride_on_index(index, strides);
         const int start  = calculate_start_on_index(input_shape, i, starts, strides, begin_mask);
@@ -150,11 +165,11 @@
         const int range  = end - start;
 
         const bool is_shrink = arm_compute::helpers::bit_ops::is_bit_set(shrink_axis_mask, i);
-        if(return_unshrinked || !is_shrink)
+        if (return_unshrinked || !is_shrink)
         {
-            if((range == 0) ||               // Zero range
-               (range < 0 && stride >= 0) || // Negative range with positive stride
-               (range > 0 && stride <= 0))   // Positive range with negative stride
+            if ((range == 0) ||               // Zero range
+                (range < 0 && stride >= 0) || // Negative range with positive stride
+                (range > 0 && stride <= 0))   // Positive range with negative stride
             {
                 output_shape.set(index, 0);
                 return output_shape;
@@ -173,9 +188,9 @@
 {
     // Create end mask
     int32_t end_mask = 0;
-    for(unsigned int i = 0; i < ends.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < ends.num_dimensions(); ++i)
     {
-        if(ends[i] < 0)
+        if (ends[i] < 0)
         {
             end_mask |= 1 << i;
         }
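
Sketch of the end-mask construction above (the construct_slice_end_mask name and header path are assumptions taken from tensor_transform.h):

    #include "arm_compute/core/utils/helpers/tensor_transform.h"

    void end_mask_sketch()
    {
        using namespace arm_compute;
        const Coordinates ends(3, -1, 4); // dimension 1 runs to the end of its axis
        const int32_t mask = helpers::tensor_transform::construct_slice_end_mask(ends);
        // mask == 0b010: bit i is set exactly when ends[i] < 0.
        (void)mask;
    }
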
diff --git a/src/core/utils/io/FileHandler.cpp b/src/core/utils/io/FileHandler.cpp
index 95fc2e3..d106493 100644
--- a/src/core/utils/io/FileHandler.cpp
+++ b/src/core/utils/io/FileHandler.cpp
@@ -21,16 +21,15 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include <string>
-
 #include "arm_compute/core/utils/io/FileHandler.h"
 
 #include "arm_compute/core/Error.h"
 
+#include <string>
+
 using namespace arm_compute::io;
 
-FileHandler::FileHandler()
-    : _filestream(), _filename(" "), _mode()
+FileHandler::FileHandler() : _filestream(), _filename(" "), _mode()
 {
 }
 
diff --git a/src/core/utils/logging/FilePrinter.cpp b/src/core/utils/logging/FilePrinter.cpp
index 55e78f9..7b4eead 100644
--- a/src/core/utils/logging/FilePrinter.cpp
+++ b/src/core/utils/logging/FilePrinter.cpp
@@ -25,8 +25,7 @@
 
 using namespace arm_compute::logging;
 
-FilePrinter::FilePrinter(const std::string &filename)
-    : _handler()
+FilePrinter::FilePrinter(const std::string &filename) : _handler()
 {
     _handler.open(filename, std::fstream::out | std::fstream::trunc);
 }
@@ -34,4 +33,4 @@
 void FilePrinter::print_internal(const std::string &msg)
 {
     _handler.stream() << msg << std::endl;
-}
\ No newline at end of file
+}
diff --git a/src/core/utils/logging/Helpers.cpp b/src/core/utils/logging/Helpers.cpp
index c3df7f6..14ad910 100644
--- a/src/core/utils/logging/Helpers.cpp
+++ b/src/core/utils/logging/Helpers.cpp
@@ -30,13 +30,12 @@
 
 const std::string &arm_compute::logging::string_from_log_level(LogLevel log_level)
 {
-    static std::map<LogLevel, const std::string> log_level_map =
-    {
-        { LogLevel::VERBOSE, "VERBOSE" },
-        { LogLevel::INFO, "INFO" },
-        { LogLevel::WARN, "WARN" },
-        { LogLevel::OFF, "OFF" },
+    static std::map<LogLevel, const std::string> log_level_map = {
+        {LogLevel::VERBOSE, "VERBOSE"},
+        {LogLevel::INFO, "INFO"},
+        {LogLevel::WARN, "WARN"},
+        {LogLevel::OFF, "OFF"},
     };
 
     return log_level_map[log_level];
-}
\ No newline at end of file
+}
diff --git a/src/core/utils/logging/Logger.cpp b/src/core/utils/logging/Logger.cpp
index 70b5868..d6681f8 100644
--- a/src/core/utils/logging/Logger.cpp
+++ b/src/core/utils/logging/Logger.cpp
@@ -30,10 +30,7 @@
 using namespace arm_compute::logging;
 
 Logger::Logger(std::string name, LogLevel log_level, std::shared_ptr<Printer> printer)
-    : _name(std::move(name)), _log_level(log_level), _printers(
-{
-    std::move(printer)
-}), _decorators()
+    : _name(std::move(name)), _log_level(log_level), _printers({std::move(printer)}), _decorators()
 {
     // Check printer
     ARM_COMPUTE_ERROR_ON(printer == nullptr);
@@ -46,7 +43,7 @@
     : _name(std::move(name)), _log_level(log_level), _printers(std::move(printers)), _decorators()
 {
     // Check printers
-    for(const auto &p : _printers)
+    for (const auto &p : _printers)
     {
         ARM_COMPUTE_UNUSED(p);
         ARM_COMPUTE_ERROR_ON(p == nullptr);
@@ -62,13 +59,13 @@
     : _name(std::move(name)), _log_level(log_level), _printers(std::move(printers)), _decorators(std::move(decorators))
 {
     // Check printers
-    for(const auto &p : _printers)
+    for (const auto &p : _printers)
     {
         ARM_COMPUTE_UNUSED(p);
         ARM_COMPUTE_ERROR_ON(p == nullptr);
     }
     // Check decorators
-    for(const auto &d : _decorators)
+    for (const auto &d : _decorators)
     {
         ARM_COMPUTE_UNUSED(d);
         ARM_COMPUTE_ERROR_ON(d == nullptr);
@@ -79,7 +76,7 @@
 {
     // Return if message shouldn't be logged
     // i.e. if log level does not match the logger's
-    if(!is_loggable(log_level))
+    if (!is_loggable(log_level))
     {
         return;
     }
@@ -129,7 +126,7 @@
 
 void Logger::decorate_log_msg(LogMsg &msg)
 {
-    for(const auto &d : _decorators)
+    for (const auto &d : _decorators)
     {
         d->decorate(msg);
     }
@@ -148,7 +145,7 @@
 
 void Logger::print_all(const std::string &msg)
 {
-    for(auto &p : _printers)
+    for (auto &p : _printers)
     {
         p->print(msg);
     }
diff --git a/src/core/utils/logging/LoggerRegistry.cpp b/src/core/utils/logging/LoggerRegistry.cpp
index c281d88..17015d9 100644
--- a/src/core/utils/logging/LoggerRegistry.cpp
+++ b/src/core/utils/logging/LoggerRegistry.cpp
@@ -24,15 +24,15 @@
 #include "arm_compute/core/utils/logging/LoggerRegistry.h"
 
 #include "arm_compute/core/Error.h"
+
 #include "support/Mutex.h"
 
 using namespace arm_compute::logging;
 
 /** Reserved loggers used by the library */
-std::set<std::string> LoggerRegistry::_reserved_loggers = { "CORE", "RUNTIME", "GRAPH" };
+std::set<std::string> LoggerRegistry::_reserved_loggers = {"CORE", "RUNTIME", "GRAPH"};
 
-LoggerRegistry::LoggerRegistry()
-    : _mtx(), _loggers()
+LoggerRegistry::LoggerRegistry() : _mtx(), _loggers()
 {
 }
 
@@ -42,10 +42,12 @@
     return _instance;
 }
 
-void LoggerRegistry::create_logger(const std::string &name, LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
+void LoggerRegistry::create_logger(const std::string                           &name,
+                                   LogLevel                                     log_level,
+                                   const std::vector<std::shared_ptr<Printer>> &printers)
 {
     arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
-    if((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end()))
+    if ((_loggers.find(name) == _loggers.end()) && (_reserved_loggers.find(name) == _reserved_loggers.end()))
     {
         _loggers[name] = std::make_shared<Logger>(name, log_level, printers);
     }
@@ -54,7 +56,7 @@
 void LoggerRegistry::remove_logger(const std::string &name)
 {
     arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
-    if(_loggers.find(name) != _loggers.end())
+    if (_loggers.find(name) != _loggers.end())
     {
         _loggers.erase(name);
     }
@@ -69,9 +71,9 @@
 void LoggerRegistry::create_reserved_loggers(LogLevel log_level, const std::vector<std::shared_ptr<Printer>> &printers)
 {
     arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
-    for(const auto &r : _reserved_loggers)
+    for (const auto &r : _reserved_loggers)
     {
-        if(_loggers.find(r) == _loggers.end())
+        if (_loggers.find(r) == _loggers.end())
         {
             _loggers[r] = std::make_shared<Logger>(r, log_level, printers);
         }
diff --git a/src/core/utils/misc/MMappedFile.cpp b/src/core/utils/misc/MMappedFile.cpp
index adae8a2..a467cb3 100644
--- a/src/core/utils/misc/MMappedFile.cpp
+++ b/src/core/utils/misc/MMappedFile.cpp
@@ -27,12 +27,11 @@
 
 #include <cstdio>
 #include <cstring>
-#include <tuple>
-
 #include <fcntl.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <sys/types.h>
+#include <tuple>
 #include <unistd.h>
 
 namespace arm_compute
@@ -53,7 +52,7 @@
 {
     struct stat st; // NOLINT
     memset(&st, 0, sizeof(struct stat));
-    if(stat(filename.c_str(), &st) == 0)
+    if (stat(filename.c_str(), &st) == 0)
     {
         return std::make_pair(st.st_size, true);
     }
@@ -73,8 +72,7 @@
 }
 } // namespace
 
-MMappedFile::MMappedFile()
-    : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr)
+MMappedFile::MMappedFile() : _filename(), _file_size(0), _map_size(0), _map_offset(0), _fp(nullptr), _data(nullptr)
 {
 }
 
@@ -92,14 +90,14 @@
 bool MMappedFile::map(const std::string &filename, size_t size, size_t offset)
 {
     // Check if file is mapped
-    if(is_mapped())
+    if (is_mapped())
     {
         return false;
     }
 
     // Open file
     _fp = fopen(filename.c_str(), "a+be");
-    if(_fp == nullptr)
+    if (_fp == nullptr)
     {
         return false;
     }
@@ -107,26 +105,26 @@
     // Extract file descriptor
     int  fd     = fileno(_fp);
     bool status = fd >= 0;
-    if(status)
+    if (status)
     {
         // Get file size
         std::tie(_file_size, status) = get_file_size(_filename);
 
-        if(status)
+        if (status)
         {
             // Map the whole file from the offset if the map size is 0
             _map_size   = (size == 0) ? _file_size : size;
             _map_offset = offset;
 
             // Check offset mapping
-            if((_map_offset > _file_size) || (_map_offset % get_page_size() != 0))
+            if ((_map_offset > _file_size) || (_map_offset % get_page_size() != 0))
             {
                 status = false;
             }
             else
             {
                 // Truncate to file size
-                if(_map_offset + _map_size > _file_size)
+                if (_map_offset + _map_size > _file_size)
                 {
                     _map_size = _file_size - _map_offset;
                 }
@@ -137,7 +135,7 @@
         }
     }
 
-    if(!status)
+    if (!status)
     {
         fclose(_fp);
     }
@@ -148,14 +146,14 @@
 void MMappedFile::release()
 {
     // Unmap file
-    if(_data != nullptr)
+    if (_data != nullptr)
     {
         ::munmap(_data, _file_size);
         _data = nullptr;
     }
 
     // Close file
-    if(_fp != nullptr)
+    if (_fp != nullptr)
     {
         fclose(_fp);
         _fp = nullptr;
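
Sketch of the page-alignment requirement map() enforces (illustrative; the file name, offset value, and header path are assumptions):

    #include <unistd.h>

    #include "arm_compute/core/utils/misc/MMappedFile.h"

    bool mmap_sketch()
    {
        const size_t page   = static_cast<size_t>(sysconf(_SC_PAGESIZE));
        const size_t wanted = 10000;                  // arbitrary byte offset
        const size_t offset = (wanted / page) * page; // rounded down to a page boundary
        arm_compute::utils::mmap_io::MMappedFile file;
        return file.map("weights.bin", 0, offset);    // size 0 maps the remainder of the file
    }
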
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 086d63b..f66d3e7 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/utils/quantization/AsymmHelpers.h"
 #include "support/ToolchainSupport.h"
 
@@ -40,7 +42,7 @@
 
 Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon)
 {
-    if(multiplier >= 1.f)
+    if (multiplier >= 1.f)
     {
         Status status = calculate_quantized_multiplier_greater_than_one(multiplier, quant_multiplier, shift);
         *shift *= -1;
@@ -69,13 +71,13 @@
     *right_shift           = -1 * shift_exp;
     auto q_fixed           = static_cast<int64_t>(support::cpp11::round(q * fixed_point_one_Q0));
     ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0);
-    if(q_fixed == fixed_point_one_Q0)
+    if (q_fixed == fixed_point_one_Q0)
     {
         q_fixed /= 2;
         --*right_shift;
     }
 
-    if(ignore_epsilon && *right_shift > 31)
+    if (ignore_epsilon && *right_shift > 31)
     {
         *right_shift = 0;
         q_fixed      = 0;
@@ -88,9 +90,8 @@
     return Status{};
 }
 
-Status calculate_quantized_multiplier_greater_than_one(float    multiplier,
-                                                       int32_t *quantized_multiplier,
-                                                       int32_t *left_shift)
+Status
+calculate_quantized_multiplier_greater_than_one(float multiplier, int32_t *quantized_multiplier, int32_t *left_shift)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(quantized_multiplier == nullptr);
     ARM_COMPUTE_RETURN_ERROR_ON(left_shift == nullptr);
@@ -101,7 +102,7 @@
     *left_shift            = shift_exp;
     auto q_fixed           = static_cast<int64_t>(support::cpp11::round(q * fixed_point_one_Q0));
     ARM_COMPUTE_RETURN_ERROR_ON(q_fixed > fixed_point_one_Q0);
-    if(q_fixed == fixed_point_one_Q0)
+    if (q_fixed == fixed_point_one_Q0)
     {
         q_fixed /= 2;
         ++*left_shift;
@@ -113,9 +114,9 @@
     return Status{};
 }
 
-arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo &iq_info,
-                                                    const QuantizationInfo &wq_info,
-                                                    const QuantizationInfo &oq_info,
+arm_compute::Status calculate_quantized_multipliers(const QuantizationInfo  &iq_info,
+                                                    const QuantizationInfo  &wq_info,
+                                                    const QuantizationInfo  &oq_info,
                                                     GEMMLowpOutputStageInfo &stage_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(iq_info.scale().empty());
@@ -133,7 +134,7 @@
     const float i_scale  = iq_info.scale().at(0);
     const float o_scale  = oq_info.scale().at(0);
 
-    for(unsigned int i = 0; i < size; ++i)
+    for (unsigned int i = 0; i < size; ++i)
     {
         const float multiplier       = i_scale * w_scales[i] / o_scale;
         int32_t     quant_multiplier = 0;
@@ -154,7 +155,7 @@
 {
     int min_quant_val = 0;
     int max_quant_val = 0;
-    switch(data_type)
+    switch (data_type)
     {
         case DataType::QASYMM8:
             min_quant_val = std::numeric_limits<uint8_t>::min();
@@ -179,7 +180,9 @@
     return std::make_pair(min_quant_val, max_quant_val);
 }
 
-std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type)
+std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo    &q_info,
+                                                                     const ActivationLayerInfo &act_info,
+                                                                     DataType                   data_type)
 {
     ARM_COMPUTE_ERROR_ON(data_type != DataType::QASYMM8 && data_type != DataType::QASYMM8_SIGNED);
 
@@ -190,20 +193,23 @@
 
     const UniformQuantizationInfo q_unif = q_info.uniform();
 
-    if(act_info.enabled())
+    if (act_info.enabled())
     {
-        switch(act_info.activation())
+        switch (act_info.activation())
         {
             case ActivationLayerInfo::ActivationFunction::RELU:
                 type_min = q_unif.offset;
                 break;
             case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
                 type_min = q_unif.offset;
-                type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) : quantize_qasymm8_signed(act_info.a(), q_info);
+                type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info)
+                                                            : quantize_qasymm8_signed(act_info.a(), q_info);
                 break;
             case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
-                type_min = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.b(), q_info) : quantize_qasymm8_signed(act_info.b(), q_info);
-                type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info) : quantize_qasymm8_signed(act_info.a(), q_info);
+                type_min = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.b(), q_info)
+                                                            : quantize_qasymm8_signed(act_info.b(), q_info);
+                type_max = (data_type == DataType::QASYMM8) ? quantize_qasymm8(act_info.a(), q_info)
+                                                            : quantize_qasymm8_signed(act_info.a(), q_info);
                 break;
             default:
                 ARM_COMPUTE_ERROR("Activation function not supported.");
@@ -226,7 +232,7 @@
 
     const unsigned int num_filters = wq_info.scale().size();
 
-    for(unsigned int i = 0; i < num_filters; ++i)
+    for (unsigned int i = 0; i < num_filters; ++i)
     {
         int32_t     output_multiplier = 0;
         int32_t     output_shift      = 0;
@@ -267,11 +273,11 @@
 
 int32_t saturating_rounding_multiply_by_pow2(int32_t exponent, int32_t v)
 {
-    if(exponent == 0)
+    if (exponent == 0)
     {
         return v;
     }
-    else if(exponent < 0)
+    else if (exponent < 0)
     {
         return rounding_divide_by_pow2(v, -exponent);
     }
@@ -291,11 +297,14 @@
     }
 }
 
-void get_invsqrt_quantized_multiplier_exp(int32_t input, int32_t reverse_shift, int32_t &output_inv_sqrt, int32_t &output_shift)
+void get_invsqrt_quantized_multiplier_exp(int32_t  input,
+                                          int32_t  reverse_shift,
+                                          int32_t &output_inv_sqrt,
+                                          int32_t &output_shift)
 {
     ARM_COMPUTE_ERROR_ON(input < 0);
 
-    if(input <= 1)
+    if (input <= 1)
     {
         // deal with the inputs (0 and 1) separately to avoid overflow
         output_inv_sqrt = std::numeric_limits<std::int32_t>::max();
@@ -305,7 +314,7 @@
 
     // prepare input for fixed point operation and compute shift value
     output_shift = 11;
-    while(input >= (1 << 29))
+    while (input >= (1 << 29))
     {
         input /= 4;
         ++output_shift;
@@ -334,9 +343,7 @@
 
     // multiplication of two fixed point numbers, defined for readability
     auto fixed_point_mul = [](FixedPointRawType a, FixedPointRawType b) -> FixedPointRawType
-    {
-        return saturating_rounding_doubling_highmul(a, b);
-    };
+    { return saturating_rounding_doubling_highmul(a, b); };
 
     // rescaling of fixed point to have dst_bit integer bits, defined for readability
     auto fixed_point_rescale = [](FixedPointRawType a, uint32_t src_bit, uint32_t dst_bit) -> FixedPointRawType
@@ -347,17 +354,18 @@
 
     // 5 iterations of the Newton-Raphson method for inverse square root: x_{n+1} = 1.5 * x_n - input/2 * (x_n)^3
     constexpr int32_t num_iteration = 5;
-    for(int32_t i = 0; i < num_iteration; ++i)
+    for (int32_t i = 0; i < num_iteration; ++i)
     {
         const auto x3 = fixed_point_rescale(fixed_point_mul(fixed_point_mul(x, x), x), 9, fixedpoint_position);
-        x             = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3), 6, fixedpoint_position);
+        x = fixed_point_rescale(fixed_point_mul(fixedpoint_half_three, x) - fixed_point_mul(fixedpoint_half_input, x3),
+                                6, fixedpoint_position);
     }
 
     // fixed point representation of sqrt(1/2)
     const FixedPoint0 fixedpoint_half_sqrt_2 = 1518500250;
     x                                        = fixed_point_mul(fixedpoint_half_sqrt_2, x);
     output_inv_sqrt                          = x;
-    if(output_shift < 0)
+    if (output_shift < 0)
     {
         output_inv_sqrt <<= -output_shift;
         output_shift = 0;
@@ -365,5 +373,5 @@
     // convert right shift to left shift
     output_shift *= reverse_shift;
 }
-} // quantization
-} // arm_compute
+} // namespace quantization
+} // namespace arm_compute
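
A worked example for the fixed-point decomposition above (the multiplier value is assumed):

    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"

    void multiplier_sketch()
    {
        int32_t mult = 0, shift = 0;
        arm_compute::quantization::calculate_quantized_multiplier_less_than_one(0.75f, &mult, &shift);
        // frexp(0.75) = 0.75 * 2^0, so shift == 0 and
        // mult == round(0.75 * 2^31) == 1610612736; the scaled value is then
        // recovered at runtime as (x * mult) >> (31 + shift).
    }
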
diff --git a/src/core/utils/quantization/AsymmHelpers.h b/src/core/utils/quantization/AsymmHelpers.h
index f970109..5dc607c 100644
--- a/src/core/utils/quantization/AsymmHelpers.h
+++ b/src/core/utils/quantization/AsymmHelpers.h
@@ -29,7 +29,8 @@
 
 namespace arm_compute
 {
-namespace quantization {
+namespace quantization
+{
 
 /** Get minimum and maximum output of the activation function after quantization.
  *
@@ -41,7 +42,9 @@
  *
  * @return The minimum and maximum output of the activation function after quantization.
  */
-std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo &q_info, const ActivationLayerInfo &act_info, DataType data_type);
+std::tuple<int32_t, int32_t> get_quantized_asymmetric_output_min_max(const QuantizationInfo    &q_info,
+                                                                     const ActivationLayerInfo &act_info,
+                                                                     DataType                   data_type);
 
 } // namespace quantization
 } // namespace arm_compute
diff --git a/src/cpu/CpuContext.cpp b/src/cpu/CpuContext.cpp
index 7c14891..b745af8 100644
--- a/src/cpu/CpuContext.cpp
+++ b/src/cpu/CpuContext.cpp
@@ -24,6 +24,7 @@
 #include "src/cpu/CpuContext.h"
 
 #include "arm_compute/core/CPP/CPPTypes.h"
+
 #include "src/cpu/CpuQueue.h"
 #include "src/cpu/CpuTensor.h"
 
@@ -32,7 +33,7 @@
 #include <malloc.h>
 
 #if defined(_WIN64)
-#define posix_memalign _aligned_realloc
+#define posix_memalign      _aligned_realloc
 #define posix_memalign_free _aligned_free
 #endif // defined(_WIN64)
 #endif // !defined(__APPLE__) && !defined(__OpenBSD__)
@@ -66,7 +67,7 @@
     size_t real_size = (rem) ? (size + alignment - rem) : size;
     ptr              = memalign(alignment, real_size);
 #else  /* defined(BARE_METAL) */
-    if(posix_memalign(&ptr, alignment, size) != 0)
+    if (posix_memalign(&ptr, alignment, size) != 0)
     {
         // posix_memalign returns non-zero on failures, the return values will be
         // - EINVAL: wrong alignment
@@ -81,17 +82,13 @@
     ARM_COMPUTE_UNUSED(user_data);
     free(ptr);
 }
-static AclAllocator default_allocator = { &default_allocate,
-                                          &default_free,
-                                          &default_aligned_allocate,
-                                          &default_aligned_free,
-                                          nullptr
-                                        };
+static AclAllocator default_allocator = {&default_allocate, &default_free, &default_aligned_allocate,
+                                         &default_aligned_free, nullptr};
 
 AllocatorWrapper populate_allocator(AclAllocator *external_allocator)
 {
     bool is_valid = (external_allocator != nullptr);
-    if(is_valid)
+    if (is_valid)
     {
         is_valid = is_valid && (external_allocator->alloc != nullptr);
         is_valid = is_valid && (external_allocator->free != nullptr);
@@ -123,14 +120,13 @@
     return isa_caps;
 }
 
-CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps,
-                                      int32_t               max_threads)
+CpuCapabilities populate_capabilities(AclTargetCapabilities external_caps, int32_t max_threads)
 {
     CpuCapabilities caps;
 
     // Populate capabilities with system information
     caps.cpu_info = cpuinfo::CpuInfo::build();
-    if(external_caps != AclCpuCapabilitiesAuto)
+    if (external_caps != AclCpuCapabilitiesAuto)
     {
         cpuinfo::CpuIsaInfo isa  = populate_capabilities_flags(external_caps);
         auto                cpus = caps.cpu_info.cpus();
@@ -151,11 +147,9 @@
 } // namespace
 
 CpuContext::CpuContext(const AclContextOptions *options)
-    : IContext(Target::Cpu),
-      _allocator(default_allocator),
-      _caps(populate_capabilities(AclCpuCapabilitiesAuto, -1))
+    : IContext(Target::Cpu), _allocator(default_allocator), _caps(populate_capabilities(AclCpuCapabilitiesAuto, -1))
 {
-    if(options != nullptr)
+    if (options != nullptr)
     {
         _allocator = populate_allocator(options->allocator);
         _caps      = populate_capabilities(options->capabilities, options->max_compute_units);
@@ -175,7 +169,7 @@
 ITensorV2 *CpuContext::create_tensor(const AclTensorDescriptor &desc, bool allocate)
 {
     CpuTensor *tensor = new CpuTensor(this, desc);
-    if(tensor != nullptr && allocate)
+    if (tensor != nullptr && allocate)
     {
         tensor->allocate();
     }
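
The aligned-allocation hunks above are easier to read outside the diff: on bare metal the requested size is rounded up to a multiple of the alignment before calling memalign(), while other platforms go through posix_memalign(), which reports failure via a non-zero return value rather than errno. A minimal sketch of the POSIX branch, assuming a POSIX toolchain and an illustrative function name:

    #include <cstdlib>

    // Sketch of the allocator path reflowed above. posix_memalign() needs no
    // size rounding (that is only done for the bare-metal memalign() branch)
    // and returns non-zero on failure: EINVAL for a bad alignment value,
    // ENOMEM when memory is exhausted.
    void *aligned_allocate_sketch(std::size_t alignment, std::size_t size)
    {
        void *ptr = nullptr;
        if (posix_memalign(&ptr, alignment, size) != 0)
        {
            return nullptr; // caller must handle allocation failure
        }
        return ptr;
    }
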
diff --git a/src/cpu/CpuContext.h b/src/cpu/CpuContext.h
index da241ed..0c8ae49 100644
--- a/src/cpu/CpuContext.h
+++ b/src/cpu/CpuContext.h
@@ -25,8 +25,8 @@
 #define SRC_CPU_CPUCONTEXT_H
 
 #include "src/common/AllocatorWrapper.h"
-#include "src/common/IContext.h"
 #include "src/common/cpuinfo/CpuInfo.h"
+#include "src/common/IContext.h"
 
 namespace arm_compute
 {
@@ -36,7 +36,7 @@
 struct CpuCapabilities
 {
     cpuinfo::CpuInfo cpu_info{};
-    int32_t          max_threads{ -1 };
+    int32_t          max_threads{-1};
 };
 
 /** CPU context implementation class */
@@ -60,9 +60,9 @@
     AllocatorWrapper &allocator();
 
     // Inherited methods overridden
-    ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
-    IQueue *create_queue(const AclQueueOptions *options) override;
-    std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
+    ITensorV2                          *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
+    IQueue                             *create_queue(const AclQueueOptions *options) override;
+    std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor     &src,
                                                           const AclTensorDescriptor     &dst,
                                                           const AclActivationDescriptor &act,
                                                           bool                           is_validate) override;
@@ -74,4 +74,4 @@
 } // namespace cpu
 } // namespace arm_compute
 
-#endif /* SRC_CPU_CPUCONTEXT_H */
\ No newline at end of file
+#endif /* SRC_CPU_CPUCONTEXT_H */
diff --git a/src/cpu/CpuQueue.cpp b/src/cpu/CpuQueue.cpp
index 0f0097b..be781d6 100644
--- a/src/cpu/CpuQueue.cpp
+++ b/src/cpu/CpuQueue.cpp
@@ -29,8 +29,7 @@
 {
 namespace cpu
 {
-CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options)
-    : IQueue(ctx)
+CpuQueue::CpuQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx)
 {
     ARM_COMPUTE_UNUSED(options);
 }
diff --git a/src/cpu/CpuQueue.h b/src/cpu/CpuQueue.h
index 871a36c..b6a2be0 100644
--- a/src/cpu/CpuQueue.h
+++ b/src/cpu/CpuQueue.h
@@ -24,10 +24,10 @@
 #ifndef SRC_CPU_CPUQUEUE_H
 #define SRC_CPU_CPUQUEUE_H
 
-#include "src/common/IQueue.h"
-
 #include "arm_compute/runtime/IScheduler.h"
 
+#include "src/common/IQueue.h"
+
 namespace arm_compute
 {
 namespace cpu
diff --git a/src/cpu/CpuTensor.cpp b/src/cpu/CpuTensor.cpp
index 6dd6d9c..59082b5 100644
--- a/src/cpu/CpuTensor.cpp
+++ b/src/cpu/CpuTensor.cpp
@@ -29,8 +29,7 @@
 {
 namespace cpu
 {
-CpuTensor::CpuTensor(IContext *ctx, const AclTensorDescriptor &desc)
-    : ITensorV2(ctx), _legacy_tensor()
+CpuTensor::CpuTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor()
 {
     ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::Cpu));
     _legacy_tensor = std::make_unique<Tensor>();
@@ -41,7 +40,7 @@
 {
     ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr);
 
-    if(_legacy_tensor == nullptr)
+    if (_legacy_tensor == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[CpuTensor:map]: Backing tensor does not exist!");
         return nullptr;
diff --git a/src/cpu/CpuTensor.h b/src/cpu/CpuTensor.h
index b078774..89931e1 100644
--- a/src/cpu/CpuTensor.h
+++ b/src/cpu/CpuTensor.h
@@ -24,10 +24,10 @@
 #ifndef SRC_CPU_CPUTENSOR_H
 #define SRC_CPU_CPUTENSOR_H
 
-#include "src/common/ITensorV2.h"
-
 #include "arm_compute/runtime/Tensor.h"
 
+#include "src/common/ITensorV2.h"
+
 namespace arm_compute
 {
 namespace cpu
@@ -52,7 +52,7 @@
     void                 *map() override;
     StatusCode            unmap() override;
     arm_compute::ITensor *tensor() const override;
-    StatusCode import(void *handle, ImportMemoryType type) override;
+    StatusCode            import(void *handle, ImportMemoryType type) override;
 
 private:
     std::unique_ptr<Tensor> _legacy_tensor;
@@ -60,4 +60,4 @@
 } // namespace cpu
 } // namespace arm_compute
 
-#endif /* SRC_CPU_CPUTENSOR_H */
\ No newline at end of file
+#endif /* SRC_CPU_CPUTENSOR_H */
diff --git a/src/cpu/CpuTypes.h b/src/cpu/CpuTypes.h
index 0f7b9b6..8726bc4 100644
--- a/src/cpu/CpuTypes.h
+++ b/src/cpu/CpuTypes.h
@@ -31,6 +31,6 @@
 typedef __fp16 float16_t;
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 typedef float float32_t;
-}
+} // namespace arm_compute
 
 #endif /* ARM_COMPUTE_CPUTYPES */
diff --git a/src/cpu/ICpuKernel.h b/src/cpu/ICpuKernel.h
index 8f41062..bcd0cb2 100644
--- a/src/cpu/ICpuKernel.h
+++ b/src/cpu/ICpuKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_ICPUKERNEL_H
 
 #include "arm_compute/core/CPP/ICPPKernel.h"
+
 #include "src/cpu/kernels/CpuKernelSelectionTypes.h"
 
 namespace arm_compute
@@ -34,7 +35,7 @@
 enum class KernelSelectionType
 {
     Preferred, /**< Retrieve the best implementation available for the given Cpu ISA, ignoring the build flags */
-    Supported  /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */
+    Supported /**< Retrieve the best implementation available for the given Cpu ISA that is supported by the current build */
 };
 
 template <class Derived>
@@ -50,13 +51,15 @@
      */
 
     template <typename SelectorType>
-    static const auto *get_implementation(const SelectorType &selector, KernelSelectionType selection_type = KernelSelectionType::Supported)
+    static const auto *get_implementation(const SelectorType &selector,
+                                          KernelSelectionType selection_type = KernelSelectionType::Supported)
     {
-        using kernel_type = typename std::remove_reference<decltype(Derived::get_available_kernels())>::type::value_type;
+        using kernel_type =
+            typename std::remove_reference<decltype(Derived::get_available_kernels())>::type::value_type;
 
-        for(const auto &uk : Derived::get_available_kernels())
+        for (const auto &uk : Derived::get_available_kernels())
         {
-            if(uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr))
+            if (uk.is_selected(selector) && (selection_type == KernelSelectionType::Preferred || uk.ukernel != nullptr))
             {
                 return &uk;
             }
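
The reflowed get_implementation() above is the heart of the kernel registry pattern used throughout these files: each kernel class exposes a table of {name, selector predicate, function pointer} entries, and lookup returns the first entry whose predicate matches, where KernelSelectionType::Supported additionally requires a non-null ukernel (i.e. one compiled into this build) while Preferred ignores build flags. A stripped-down sketch of that first-match walk, with hypothetical Selector and KernelFn types standing in for the library's selector structs:

    #include <vector>

    // Hypothetical stand-ins for the library's selector data and kernel type.
    struct Selector
    {
        bool is_fp16;
        bool has_sve;
    };
    using KernelFn = void (*)(const float *, float *, int);

    struct KernelEntry
    {
        const char *name;
        bool (*is_selected)(const Selector &);
        KernelFn ukernel; // nullptr when excluded from this build
    };

    enum class SelectionType
    {
        Preferred,
        Supported
    };

    const KernelEntry *get_implementation_sketch(const std::vector<KernelEntry> &table,
                                                 const Selector                 &selector,
                                                 SelectionType                   type = SelectionType::Supported)
    {
        for (const auto &uk : table)
        {
            // Preferred ignores build flags; Supported needs a usable pointer.
            if (uk.is_selected(selector) && (type == SelectionType::Preferred || uk.ukernel != nullptr))
            {
                return &uk;
            }
        }
        return nullptr;
    }
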
diff --git a/src/cpu/kernels/CpuActivationKernel.cpp b/src/cpu/kernels/CpuActivationKernel.cpp
index f4bd4e6..50bf672 100644
--- a/src/cpu/kernels/CpuActivationKernel.cpp
+++ b/src/cpu/kernels/CpuActivationKernel.cpp
@@ -26,11 +26,11 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+
+#include "src/core/common/Registrars.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
 #include "src/cpu/kernels/activation/list.h"
 
 #include <array>
@@ -43,126 +43,126 @@
 {
 namespace
 {
-static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels =
-{
+static const std::vector<CpuActivationKernel::ActivationKernel> available_kernels = {
 #ifdef ARM_COMPUTE_ENABLE_SVE
-    {
-        "sve2_q8_activation_lut",
-        [](const ActivationDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.cpumodel == CPUModel::A510 && data.isa.sve2; },
-        REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)
-    },
+    {"sve2_q8_activation_lut",
+     [](const ActivationDataTypeISASelectorData &data)
+     {
+         return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) &&
+                data.cpumodel == CPUModel::A510 && data.isa.sve2;
+     },
+     REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_q8_activation_lut)},
 #endif // ARM_COMPUTE_ENABLE_SVE
 #ifdef __aarch64__
-    {
-        // Neon LUT implementation takes precedence
-        "neon_q8_activation_lut",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; },
-        REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)
-    },
+    {// Neon LUT implementation takes precedence
+     "neon_q8_activation_lut",
+     [](const ActivationDataTypeISASelectorData &data)
+     { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; },
+     REGISTER_Q8_NEON(arm_compute::cpu::neon_q8_activation_lut)},
 #endif // __aarch64__
-    {
-        "sve2_qu8_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
-        REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)
-    },
-    {
-        "sve2_qs8_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
-        REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)
-    },
-    {
-        "sve2_qs16_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16 && data.isa.sve2 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
-        REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)
-    },
-    {
-        "sve_fp16_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
-        REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)
-    },
-    {
-        "sve_fp32_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
-        REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)
-    },
-    {
-        "neon_fp16_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)
-    },
-    {
-        "neon_fp32_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)
-    },
-    {
-        "neon_qu8_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)
-    },
-    {
-        "neon_qs8_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)
-    },
-    {
-        "neon_qs16_activation",
-        [](const ActivationDataTypeISASelectorData & data) { return data.dt == DataType::QSYMM16; },
-        REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)
-    },
+    {"sve2_qu8_activation",
+     [](const ActivationDataTypeISASelectorData &data) {
+         return data.dt == DataType::QASYMM8 && data.isa.sve2 &&
+                data.f != ActivationLayerInfo::ActivationFunction::GELU;
+     },
+     REGISTER_QASYMM8_SVE2(arm_compute::cpu::sve2_qasymm8_activation)},
+    {"sve2_qs8_activation",
+     [](const ActivationDataTypeISASelectorData &data)
+     {
+         return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 &&
+                data.f != ActivationLayerInfo::ActivationFunction::GELU;
+     },
+     REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::sve2_qasymm8_signed_activation)},
+    {"sve2_qs16_activation",
+     [](const ActivationDataTypeISASelectorData &data) {
+         return data.dt == DataType::QSYMM16 && data.isa.sve2 &&
+                data.f != ActivationLayerInfo::ActivationFunction::GELU;
+     },
+     REGISTER_QSYMM16_SVE2(arm_compute::cpu::sve2_qsymm16_activation)},
+    {"sve_fp16_activation",
+     [](const ActivationDataTypeISASelectorData &data)
+     {
+         return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+                data.f != ActivationLayerInfo::ActivationFunction::GELU;
+     },
+     REGISTER_FP16_SVE(arm_compute::cpu::sve_fp16_activation)},
+    {"sve_fp32_activation",
+     [](const ActivationDataTypeISASelectorData &data)
+     { return data.dt == DataType::F32 && data.isa.sve && data.f != ActivationLayerInfo::ActivationFunction::GELU; },
+     REGISTER_FP32_SVE(arm_compute::cpu::sve_fp32_activation)},
+    {"neon_fp16_activation",
+     [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_activation)},
+    {"neon_fp32_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+     REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_activation)},
+    {"neon_qu8_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_activation)},
+    {"neon_qs8_activation",
+     [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_activation)},
+    {"neon_qs16_activation", [](const ActivationDataTypeISASelectorData &data) { return data.dt == DataType::QSYMM16; },
+     REGISTER_QSYMM16_NEON(arm_compute::cpu::neon_qsymm16_activation)},
 };
 
 /* Supported activations in the 8-bit integer domain */
-static const std::array<ActivationLayerInfo::ActivationFunction, 8> qasymm8_activations =
-{
-    ActivationLayerInfo::ActivationFunction::RELU,
-    ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
-    ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-    ActivationLayerInfo::ActivationFunction::LOGISTIC,
-    ActivationLayerInfo::ActivationFunction::TANH,
-    ActivationLayerInfo::ActivationFunction::HARD_SWISH,
-    ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
-    ActivationLayerInfo::ActivationFunction::GELU,
+static const std::array<ActivationLayerInfo::ActivationFunction, 8> qasymm8_activations = {
+    ActivationLayerInfo::ActivationFunction::RELU,         ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+    ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC,
+    ActivationLayerInfo::ActivationFunction::TANH,         ActivationLayerInfo::ActivationFunction::HARD_SWISH,
+    ActivationLayerInfo::ActivationFunction::LEAKY_RELU,   ActivationLayerInfo::ActivationFunction::GELU,
 };
 /* Supported activations in the 16-bit integer domain */
-static const std::array<ActivationLayerInfo::ActivationFunction, 4> qsymm16_activations =
-{
-    ActivationLayerInfo::ActivationFunction::LOGISTIC,
-    ActivationLayerInfo::ActivationFunction::TANH,
-    ActivationLayerInfo::ActivationFunction::HARD_SWISH,
-    ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-};
+static const std::array<ActivationLayerInfo::ActivationFunction, 4> qsymm16_activations = {
+    ActivationLayerInfo::ActivationFunction::LOGISTIC, ActivationLayerInfo::ActivationFunction::TANH,
+    ActivationLayerInfo::ActivationFunction::HARD_SWISH, ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
 
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &activation_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::QSYMM16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                         DataType::QSYMM16, DataType::F16, DataType::F32);
 
-    const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation() });
+    const auto *uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{
+        src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()});
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
-    const DataType                                data_type = src->data_type();
-    const QuantizationInfo                       &oq_info   = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
-    const ActivationLayerInfo::ActivationFunction f_act     = activation_info.activation();
+    const DataType          data_type = src->data_type();
+    const QuantizationInfo &oq_info   = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
+    const ActivationLayerInfo::ActivationFunction f_act = activation_info.activation();
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_asymmetric(data_type) && (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) == std::end(qasymm8_activations)),
-                                    "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        is_data_type_quantized_asymmetric(data_type) &&
+            (std::find(std::begin(qasymm8_activations), std::end(qasymm8_activations), f_act) ==
+             std::end(qasymm8_activations)),
+        "For QASYMM8 only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) && (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations), f_act) == std::end(qsymm16_activations)),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized_symmetric(data_type) &&
+                                        (std::find(std::begin(qsymm16_activations), std::end(qsymm16_activations),
+                                                   f_act) == std::end(qsymm16_activations)),
                                     "For QSYMM16 only tanh and logistic are supported");
-    ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::TANH)
-                                && (oq_info != QuantizationInfo(1.f / 128.f, 128)));
-    ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
-                                && (oq_info != QuantizationInfo(1.f / 256.f, 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+                                (oq_info != QuantizationInfo(1.f / 128.f, 128)));
+    ARM_COMPUTE_RETURN_ERROR_ON((data_type == DataType::QASYMM8 || data_type == DataType::QASYMM16) &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+                                (oq_info != QuantizationInfo(1.f / 256.f, 0)));
 
-    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0)));
-    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128)));
+    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+                                (oq_info != QuantizationInfo(1.f / 128.f, 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+                                (oq_info != QuantizationInfo(1.f / 256.f, -128)));
 
-    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
-    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+                                (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+                                (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
 
     // Checks performed when dst is configured
-    if((dst != nullptr) && (dst->total_size() != 0))
+    if ((dst != nullptr) && (dst->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
@@ -176,7 +176,7 @@
     // Configure kernel window
     Window win = calculate_max_window(*src, Steps());
 
-    if(dst != nullptr)
+    if (dst != nullptr)
     {
         // dst auto initialization if not yet initialized
         auto_init_if_empty(*dst, *src->clone());
@@ -185,14 +185,19 @@
     return std::make_pair(Status{}, win);
 }
 #ifdef __aarch64__
-void init_lut(ActivationLayerInfo::ActivationFunction act_func, DataType data_type,
-              const UniformQuantizationInfo &qi_in, const UniformQuantizationInfo &qi_out,
-              ActivationLayerInfo::LookupTable256 &lut, float a, float b)
+void init_lut(ActivationLayerInfo::ActivationFunction act_func,
+              DataType                                data_type,
+              const UniformQuantizationInfo          &qi_in,
+              const UniformQuantizationInfo          &qi_out,
+              ActivationLayerInfo::LookupTable256    &lut,
+              float                                   a,
+              float                                   b)
 {
-    for(size_t i = 0; i < lut.size(); ++i)
+    for (size_t i = 0; i < lut.size(); ++i)
     {
-        float tmp_f = (data_type == DataType::QASYMM8) ? dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in);
-        switch(act_func)
+        float tmp_f =
+            (data_type == DataType::QASYMM8) ? dequantize_qasymm8(i, qi_in) : dequantize_qasymm8_signed(i, qi_in);
+        switch (act_func)
         {
             case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
                 tmp_f = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
@@ -246,7 +251,8 @@
                 tmp_f = 0;
                 break;
         }
-        lut[i] = (data_type == DataType::QASYMM8) ? quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out);
+        lut[i] =
+            (data_type == DataType::QASYMM8) ? quantize_qasymm8(tmp_f, qi_out) : quantize_qasymm8_signed(tmp_f, qi_out);
     }
 }
 #endif // __aarch64__
@@ -258,8 +264,9 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, activation_info));
 
-    const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation() });
-    if(dst != nullptr)
+    const auto uk = CpuActivationKernel::get_implementation(ActivationDataTypeISASelectorData{
+        src->data_type(), CPUInfo::get().get_cpu_model(), CPUInfo::get().get_isa(), activation_info.activation()});
+    if (dst != nullptr)
     {
         // dst auto initialization if not yet initialized
         auto_init_if_empty(*dst, *src->clone());
@@ -271,11 +278,12 @@
     _name       = std::string("CpuActivationKernel").append("/").append(uk->name);
 
 #ifdef __aarch64__
-    if(src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED)
+    if (src->data_type() == DataType::QASYMM8 || src->data_type() == DataType::QASYMM8_SIGNED)
     {
         ActivationLayerInfo::LookupTable256 tmp_lut;
-        init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(), (dst) ? dst->quantization_info().uniform() : src->quantization_info().uniform(),
-                 tmp_lut, activation_info.a(), activation_info.b());
+        init_lut(activation_info.activation(), src->data_type(), src->quantization_info().uniform(),
+                 (dst) ? dst->quantization_info().uniform() : src->quantization_info().uniform(), tmp_lut,
+                 activation_info.a(), activation_info.b());
         activation_info.setLookupTable256(tmp_lut);
     }
 #endif // __aarch64__
@@ -288,11 +296,13 @@
     ICPPKernel::configure(win);
 }
 
-Status CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status
+CpuActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(src->clone().get(), (dst != nullptr) ? dst->clone().get() : nullptr).first);
 
     return Status{};
 }
@@ -302,7 +312,7 @@
     ARM_COMPUTE_UNUSED(thread_count);
     ARM_COMPUTE_UNUSED(platform);
 
-    if(_split_dimension == Window::DimX)
+    if (_split_dimension == Window::DimX)
     {
         // Don't split the workload too finely if the tensor has been reinterpreted as 1D.
         // This number is loosely chosen, as threading overhead varies wildly between platforms.
@@ -314,7 +324,7 @@
 void CpuActivationKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     // Early exit on disabled activation
-    if(!_act_info.enabled())
+    if (!_act_info.enabled())
     {
         return;
     }
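
Two details in the hunks above deserve a note. First, the validator pins the output quantization of saturating activations because their ranges must map exactly onto the 8-bit grid: for TANH on QASYMM8, scale 1/128 with zero point 128 sends -1 to 0 and +1 to 256 (saturated to 255). Second, init_lut() exploits the fact that an 8-bit input takes only 256 values, so the activation can be precomputed once at configure time and each element then costs a single table load. A reduced sketch of that table construction for tanh, assuming simple affine (de)quantization helpers rather than the library's:

    #include <algorithm>
    #include <array>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    struct UniformQInfo
    {
        float   scale;
        int32_t offset;
    };

    // Simplified stand-ins for dequantize_qasymm8 / quantize_qasymm8.
    float dequantize(uint8_t v, UniformQInfo qi)
    {
        return (static_cast<int32_t>(v) - qi.offset) * qi.scale;
    }
    uint8_t quantize(float v, UniformQInfo qi)
    {
        const int q = static_cast<int>(std::lround(v / qi.scale)) + qi.offset;
        return static_cast<uint8_t>(std::min(std::max(q, 0), 255)); // saturate
    }

    // Precompute tanh for every possible 8-bit input value.
    std::array<uint8_t, 256> make_tanh_lut(UniformQInfo qi_in, UniformQInfo qi_out)
    {
        std::array<uint8_t, 256> lut{};
        for (std::size_t i = 0; i < lut.size(); ++i)
        {
            lut[i] = quantize(std::tanh(dequantize(static_cast<uint8_t>(i), qi_in)), qi_out);
        }
        return lut;
    }

With qi_out = {1.f / 128.f, 128}, the table's extremes land on 0 and 255, which is exactly the QuantizationInfo(1.f / 128.f, 128) constraint the validation hunk enforces.
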
diff --git a/src/cpu/kernels/CpuActivationKernel.h b/src/cpu/kernels/CpuActivationKernel.h
index 8044076..4bad9fb 100644
--- a/src/cpu/kernels/CpuActivationKernel.h
+++ b/src/cpu/kernels/CpuActivationKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_ACTIVATION_KERNEL_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -38,7 +39,8 @@
 class CpuActivationKernel : public ICpuKernel<CpuActivationKernel>
 {
 private:
-    using ActivationKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
+    using ActivationKernelPtr =
+        std::add_pointer<void(const ITensor *, ITensor *, const ActivationLayerInfo &, const Window &)>::type;
 
 public:
     CpuActivationKernel() = default;
@@ -71,7 +73,7 @@
     size_t get_mws(const CPUInfo &platform, size_t thread_count) const override;
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     /** Get the preferred dimension in which the scheduler splits the work into multiple jobs.
@@ -94,8 +96,8 @@
 
 private:
     ActivationLayerInfo _act_info{};
-    ActivationKernelPtr _run_method{ nullptr };
-    size_t              _split_dimension{ Window::DimY };
+    ActivationKernelPtr _run_method{nullptr};
+    size_t              _split_dimension{Window::DimY};
     std::string         _name{};
 };
 } // namespace kernels
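
A side note on the ActivationKernelPtr alias reflowed above: std::add_pointer<Sig>::type derives a function-pointer type from a plain function signature, which keeps the long parameter list readable. It is interchangeable with the direct pointer syntax:

    #include <type_traits>

    // Two spellings of "pointer to void(int, float)".
    using FnPtrA = std::add_pointer<void(int, float)>::type;
    using FnPtrB = void (*)(int, float);
    static_assert(std::is_same<FnPtrA, FnPtrB>::value, "identical types");
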
diff --git a/src/cpu/kernels/CpuAddKernel.cpp b/src/cpu/kernels/CpuAddKernel.cpp
index 2983575..a990aa4 100644
--- a/src/cpu/kernels/CpuAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddKernel.cpp
@@ -26,19 +26,21 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/add/list.h"
+
 #include <array>
 
 #if defined(ENABLE_FP32_KERNELS)
 namespace
 {
-    static constexpr size_t default_mws_N1_fp32_neon = 24536;
-    static constexpr size_t default_mws_V1_fp32_neon = 40510;
-}
+static constexpr size_t default_mws_N1_fp32_neon = 24536;
+static constexpr size_t default_mws_V1_fp32_neon = 40510;
+} // namespace
 #endif /* ENABLE_FP32_KERNELS */
 
 namespace arm_compute
@@ -49,152 +51,82 @@
 {
 namespace
 {
-static const std::vector<CpuAddKernel::AddKernel> available_kernels =
-{
-    {
-        "neon_qu8_add_fixedpoint",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint;
-        },
-        REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<uint8_t>)
-    },
-    {
-        "neon_qs8_add_fixedpoint",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint;
-        },
-        REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<int8_t>)
-    },
-    {
-        "sve2_qu8_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::QASYMM8) && data.isa.sve2;
-        },
-        REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)
-    },
-    {
-        "sve2_qs8_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2;
-        },
-        REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)
-    },
-    {
-        "sve2_qs16_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::QSYMM16) && data.isa.sve2;
-        },
-        REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)
-    },
-    {
-        "sve_fp32_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::F32) && data.isa.sve;
-        },
-        REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)
-    },
-    {
-        "sve_fp16_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16;
-        },
-        REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)
-    },
-    {
-        "sve_u8_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::U8) && data.isa.sve;
-        },
-        REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)
-    },
-    {
-        "sve_s16_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::S16) && data.isa.sve;
-        },
-        REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)
-    },
-    {
-        "sve_s32_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::S32) && data.isa.sve;
-        },
-        REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)
-    },
-    {
-        "neon_fp32_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
-        REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon)
-    },
-    {
-        "neon_fp16_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::F16) && data.isa.fp16;
-        },
-        REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon)
-    },
-    {
-        "neon_u8_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::U8); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon)
-    },
-    {
-        "neon_s16_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S16); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon)
-    },
-    {
-        "neon_s32_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S32); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon)
-    },
-    {
-        "neon_qu8_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)
-    },
-    {
-        "neon_qs8_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)
-    },
-    {
-        "neon_qs16_add",
-        [](const CpuAddKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); },
-        REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)
-    }
-};
+static const std::vector<CpuAddKernel::AddKernel> available_kernels = {
+    {"neon_qu8_add_fixedpoint",
+     [](const CpuAddKernelDataTypeISASelectorData &data)
+     { return (data.dt == DataType::QASYMM8) && data.can_use_fixedpoint; },
+     REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<uint8_t>)},
+    {"neon_qs8_add_fixedpoint",
+     [](const CpuAddKernelDataTypeISASelectorData &data)
+     { return (data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint; },
+     REGISTER_FP32_NEON(arm_compute::cpu::add_q8_neon_fixedpoint<int8_t>)},
+    {"sve2_qu8_add",
+     [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
+     REGISTER_QASYMM8_SVE2(arm_compute::cpu::add_qasymm8_sve2)},
+    {"sve2_qs8_add",
+     [](const CpuAddKernelDataTypeISASelectorData &data)
+     { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
+     REGISTER_QASYMM8_SIGNED_SVE2(arm_compute::cpu::add_qasymm8_signed_sve2)},
+    {"sve2_qs16_add",
+     [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16) && data.isa.sve2; },
+     REGISTER_QSYMM16_SVE2(arm_compute::cpu::add_qsymm16_sve2)},
+    {"sve_fp32_add",
+     [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
+     REGISTER_FP32_SVE(arm_compute::cpu::add_fp32_sve)},
+    {"sve_fp16_add",
+     [](const CpuAddKernelDataTypeISASelectorData &data)
+     { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
+     REGISTER_FP16_SVE(arm_compute::cpu::add_fp16_sve)},
+    {"sve_u8_add",
+     [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8) && data.isa.sve; },
+     REGISTER_INTEGER_SVE(arm_compute::cpu::add_u8_sve)},
+    {"sve_s16_add",
+     [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16) && data.isa.sve; },
+     REGISTER_INTEGER_SVE(arm_compute::cpu::add_s16_sve)},
+    {"sve_s32_add",
+     [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32) && data.isa.sve; },
+     REGISTER_INTEGER_SVE(arm_compute::cpu::add_s32_sve)},
+    {"neon_fp32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+     REGISTER_FP32_NEON(arm_compute::cpu::add_fp32_neon)},
+    {"neon_fp16_add",
+     [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::add_fp16_neon)},
+    {"neon_u8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::add_u8_neon)},
+    {"neon_s16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::add_s16_neon)},
+    {"neon_s32_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::add_s32_neon)},
+    {"neon_qu8_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::add_qasymm8_neon)},
+    {"neon_qs8_add",
+     [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_qasymm8_signed_neon)},
+    {"neon_qs16_add", [](const CpuAddKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); },
+     REGISTER_QSYMM16_NEON(arm_compute::cpu::add_qsymm16_neon)}};
 
-Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
+Status
+validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
 {
     ARM_COMPUTE_UNUSED(policy);
 
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
-                                                         DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+                                                         DataType::F16, DataType::S32, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
 
     const TensorShape out_shape = TensorShape::broadcast_shape(src0.tensor_shape(), src1.tensor_shape());
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((src0.tensor_shape().x() != src1.tensor_shape().x()) && ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type())
-                                                                                             || (src1.data_type() != dst.data_type())),
-                                    "Broadcasting across width is supported on configurations where all tensors have the same data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (src0.tensor_shape().x() != src1.tensor_shape().x()) &&
+            ((src0.data_type() != src1.data_type()) || (src0.data_type() != dst.data_type()) ||
+             (src1.data_type() != dst.data_type())),
+        "Broadcasting across width is supported on configurations where all tensors have the same data type");
 
     // Validate in case of configured dst
-    if(dst.total_size() > 0)
+    if (dst.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
@@ -202,8 +134,8 @@
     }
 
     const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(&src0, &src1, &dst);
-    const auto uk = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(CpuAddKernelDataTypeISASelectorData{ src0.data_type(),
-                                                                                          CPUInfo::get().get_isa(), can_use_fixedpoint });
+    const auto uk                 = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(
+        CpuAddKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     return Status{};
@@ -215,9 +147,9 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst, policy));
 
-    const auto can_use_fixedpoint     = add_q8_neon_fixedpoint_possible(src0, src1, dst);
-    const auto uk                     = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(CpuAddKernelDataTypeISASelectorData{ src0->data_type(),
-                                                                                                              CPUInfo::get().get_isa(), can_use_fixedpoint });
+    const auto can_use_fixedpoint = add_q8_neon_fixedpoint_possible(src0, src1, dst);
+    const auto uk                 = CpuAddKernel::get_implementation<CpuAddKernelDataTypeISASelectorData>(
+        CpuAddKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
 
@@ -237,7 +169,8 @@
     ICpuKernel::configure(win);
 }
 
-Status CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
+Status
+CpuAddKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
 
@@ -277,14 +210,14 @@
     ARM_COMPUTE_UNUSED(thread_count);
 
 #if defined(ENABLE_FP32_KERNELS)
-    if(this->_run_method == &add_fp32_neon)
+    if (this->_run_method == &add_fp32_neon)
     {
         size_t mws = ICPPKernel::default_mws;
-        if(platform.get_cpu_model() == CPUModel::N1)
+        if (platform.get_cpu_model() == CPUModel::N1)
         {
             mws = default_mws_N1_fp32_neon;
         }
-        else if(platform.get_cpu_model() == CPUModel::V1)
+        else if (platform.get_cpu_model() == CPUModel::V1)
         {
             mws = default_mws_V1_fp32_neon;
         }
@@ -294,7 +227,7 @@
         }
 
         // tensor is 1D or was re-interpreted as 1D
-        if(this->window().shape().num_dimensions() == 1)
+        if (this->window().shape().num_dimensions() == 1)
         {
             return mws;
         }
@@ -307,7 +240,7 @@
             return std::max(static_cast<size_t>(1), mws);
         }
     }
-#else /* ENABLE_FP32_KERNELS */
+#else  /* ENABLE_FP32_KERNELS */
     ARM_COMPUTE_UNUSED(platform);
 #endif /* ENABLE_FP32_KERNELS */
     return ICPPKernel::default_mws;
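
The get_mws() hunk closing this file encodes a scheduling heuristic: the minimum workload size handed to each thread is picked by CPU model (the N1/V1 constants at the top of the file read as empirically tuned values), a tensor that is or was collapsed to 1D is never split below that size, and other shapes clamp an adjusted value to at least one element. A sketch of that selection, keeping only what the hunk shows (the non-1D adjustment step is elided from the diff, so it is omitted here too):

    #include <algorithm>
    #include <cstddef>

    enum class CPUModel { GENERIC, N1, V1 };

    constexpr std::size_t default_mws              = 1;     // stand-in for ICPPKernel::default_mws
    constexpr std::size_t default_mws_N1_fp32_neon = 24536; // constants from the hunk above
    constexpr std::size_t default_mws_V1_fp32_neon = 40510;

    std::size_t get_mws_sketch(CPUModel model, bool is_1d)
    {
        std::size_t mws = default_mws;
        if (model == CPUModel::N1)
        {
            mws = default_mws_N1_fp32_neon;
        }
        else if (model == CPUModel::V1)
        {
            mws = default_mws_V1_fp32_neon;
        }

        // Don't split 1D (or re-interpreted-as-1D) tensors below the tuned
        // size; other shapes are adjusted first and clamped to at least 1.
        return is_1d ? mws : std::max(static_cast<std::size_t>(1), mws);
    }
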
diff --git a/src/cpu/kernels/CpuAddKernel.h b/src/cpu/kernels/CpuAddKernel.h
index 9921fea..4adba8b 100644
--- a/src/cpu/kernels/CpuAddKernel.h
+++ b/src/cpu/kernels/CpuAddKernel.h
@@ -37,7 +37,8 @@
 class CpuAddKernel : public ICpuKernel<CpuAddKernel>
 {
 private:
-    using AddKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+    using AddKernelPtr = std::add_pointer<void(
+        const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
 
 public:
     struct AddKernel
@@ -74,10 +75,11 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
+    static Status
+    validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     /** Return minimum workload size of the relevant kernel
@@ -98,9 +100,9 @@
 
 private:
     ConvertPolicy _policy{};
-    AddKernelPtr  _run_method{ nullptr };
+    AddKernelPtr  _run_method{nullptr};
     std::string   _name{};
-    size_t        _split_dimension{ Window::DimY };
+    size_t        _split_dimension{Window::DimY};
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuAddMulAddKernel.cpp b/src/cpu/kernels/CpuAddMulAddKernel.cpp
index b84bdd5..6a632e8 100644
--- a/src/cpu/kernels/CpuAddMulAddKernel.cpp
+++ b/src/cpu/kernels/CpuAddMulAddKernel.cpp
@@ -27,8 +27,8 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 
-#include "src/core/CPP/Validate.h"
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/addmuladd/list.h"
@@ -41,36 +41,28 @@
 {
 namespace
 {
-static const std::vector<CpuAddMulAddKernel::AddMulAddKernel> available_kernels =
-{
+static const std::vector<CpuAddMulAddKernel::AddMulAddKernel> available_kernels = {
 #ifdef __aarch64__
-    {
-        "neon_fp32_add_mul_add",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
-        REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon)
-    },
-    {
-        "neon_fp16_add_mul_add",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16); },
-        REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon)
-    },
-    {
-        "neon_qasymm8_add_mul_add",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon)
-    },
-    {
-        "neon_qasymm8_signed_add_mul_add",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon)
-    }
+    {"neon_fp32_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+     REGISTER_FP32_NEON(arm_compute::cpu::add_mul_add_fp32_neon)},
+    {"neon_fp16_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16); },
+     REGISTER_FP16_NEON(arm_compute::cpu::add_mul_add_fp16_neon)},
+    {"neon_qasymm8_add_mul_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::add_mul_add_u8_neon)},
+    {"neon_qasymm8_signed_add_mul_add",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::add_mul_add_s8_neon)}
 #endif // __aarch64__
 };
 
-Status validate_arguments(const ITensorInfo *input1, const ITensorInfo *input2,
-                          const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
-                          const ITensorInfo *add_output, const ITensorInfo *final_output,
-                          ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status validate_arguments(const ITensorInfo         *input1,
+                          const ITensorInfo         *input2,
+                          const ITensorInfo         *bn_mul,
+                          const ITensorInfo         *bn_add,
+                          const ITensorInfo         *add_output,
+                          const ITensorInfo         *final_output,
+                          ConvertPolicy              policy,
+                          const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output);
 
@@ -78,16 +70,16 @@
 
     using ActFunction          = ActivationLayerInfo::ActivationFunction;
     const ActFunction act_func = act_info.activation();
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        (act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU && act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY),
-        "Only RELU Family activations, or no activation, is supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_func != ActFunction::BOUNDED_RELU && act_func != ActFunction::RELU &&
+                                     act_func != ActFunction::LU_BOUNDED_RELU && act_func != ActFunction::IDENTITY),
+                                    "Only RELU Family activations, or no activation, is supported");
 
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input1);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
                                                          DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2);
 
-    if(is_data_type_quantized(input1->data_type()))
+    if (is_data_type_quantized(input1->data_type()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_mul, 1, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bn_add, 1, DataType::F32);
@@ -101,39 +93,47 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, input2); // No broadcasting
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(bn_mul, bn_add);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->num_dimensions() != 1, "BatchNorm coefficients should be 1D array");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0], "First dimensions of inputs and batchNorm coefs should match");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(bn_mul->tensor_shape()[0] != input1->tensor_shape()[0],
+                                    "First dimensions of inputs and batchNorm coefs should match");
 
     // Validate in case we have add layer's output (intermediate) initialized
-    if(add_output != nullptr && add_output->total_size() > 0)
+    if (add_output != nullptr && add_output->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, add_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, add_output);
     }
 
     // Validate in case final output has been initialized
-    if(final_output->total_size() > 0)
+    if (final_output->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input1, final_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, final_output);
     }
 
-    const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(DataTypeISASelectorData{ input1->data_type(), CPUInfo::get().get_isa() });
+    const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(
+        DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()});
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     return Status{};
 }
 } // namespace
 
-void CpuAddMulAddKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2,
-                                   const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
-                                   ITensorInfo *add_output, ITensorInfo *final_output,
-                                   ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuAddMulAddKernel::configure(const ITensorInfo         *input1,
+                                   const ITensorInfo         *input2,
+                                   const ITensorInfo         *bn_mul,
+                                   const ITensorInfo         *bn_add,
+                                   ITensorInfo               *add_output,
+                                   ITensorInfo               *final_output,
+                                   ConvertPolicy              policy,
+                                   const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(bn_mul, bn_add, input2);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, bn_add, bn_mul, final_output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
 
-    const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(DataTypeISASelectorData{ input1->data_type(), CPUInfo::get().get_isa() });
+    const auto uk = CpuAddMulAddKernel::get_implementation<DataTypeISASelectorData>(
+        DataTypeISASelectorData{input1->data_type(), CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
 
@@ -146,7 +146,7 @@
     set_shape_if_empty(*final_output, input1->tensor_shape());
     set_data_type_if_unknown(*final_output, input1->data_type());
 
-    if(add_output != nullptr)
+    if (add_output != nullptr)
     {
         set_shape_if_empty(*add_output, input1->tensor_shape());
         set_data_type_if_unknown(*add_output, input1->data_type());
@@ -158,14 +158,19 @@
     ICpuKernel::configure(win);
 }
 
-Status CpuAddMulAddKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2,
-                                    const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
-                                    const ITensorInfo *add_output, const ITensorInfo *final_output,
-                                    ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuAddMulAddKernel::validate(const ITensorInfo         *input1,
+                                    const ITensorInfo         *input2,
+                                    const ITensorInfo         *bn_mul,
+                                    const ITensorInfo         *bn_add,
+                                    const ITensorInfo         *add_output,
+                                    const ITensorInfo         *final_output,
+                                    ConvertPolicy              policy,
+                                    const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, bn_mul, bn_add, final_output);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info));
 
     return Status{};
 }
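
For orientation, the parameter names in the reflowed signatures spell out the fused pattern this kernel appears to implement: the element-wise sum of input1 and input2 is optionally stored to add_output, then scaled and shifted by the 1-D batch-norm coefficients bn_mul/bn_add (whose length must match the inputs' first dimension, per the validator) and passed through a RELU-family activation into final_output. A scalar reference under those assumptions (the real kernel is vectorized and quantization-aware):

    #include <algorithm>
    #include <cstddef>

    // One row of length n, where n is the first (channel) dimension so that
    // bn_mul/bn_add index per channel; lo/hi model LU_BOUNDED_RELU.
    void add_mul_add_ref(const float *x1, const float *x2,
                         const float *bn_mul, const float *bn_add,
                         float *add_out, float *final_out,
                         std::size_t n, float lo, float hi)
    {
        for (std::size_t i = 0; i < n; ++i)
        {
            const float sum = x1[i] + x2[i];
            if (add_out != nullptr) // the intermediate output is optional
            {
                add_out[i] = sum;
            }
            final_out[i] = std::min(std::max(sum * bn_mul[i] + bn_add[i], lo), hi);
        }
    }
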
diff --git a/src/cpu/kernels/CpuAddMulAddKernel.h b/src/cpu/kernels/CpuAddMulAddKernel.h
index 67ce6f0..c5e31ec 100644
--- a/src/cpu/kernels/CpuAddMulAddKernel.h
+++ b/src/cpu/kernels/CpuAddMulAddKernel.h
@@ -26,6 +26,7 @@
 #define SRC_CPU_KERNELS_CPUADDMULADDKERNEL
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -39,8 +40,15 @@
 class CpuAddMulAddKernel : public ICpuKernel<CpuAddMulAddKernel>
 {
 private:
-    using AddMulAddKernelPtr =
-        std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, const ITensor *, ITensor *, ITensor *, ConvertPolicy, const ActivationLayerInfo &, const Window &)>::type;
+    using AddMulAddKernelPtr = std::add_pointer<void(const ITensor *,
+                                                     const ITensor *,
+                                                     const ITensor *,
+                                                     const ITensor *,
+                                                     ITensor *,
+                                                     ITensor *,
+                                                     ConvertPolicy,
+                                                     const ActivationLayerInfo &,
+                                                     const Window &)>::type;
 
 public:
     struct AddMulAddKernel
@@ -57,23 +65,31 @@
      * Similar to @ref NEAddMulAdd::configure()
      *
      */
-    void configure(const ITensorInfo *input1, const ITensorInfo *input2,
-                   const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
-                   ITensorInfo *add_output, ITensorInfo *final_output,
-                   ConvertPolicy policy, const ActivationLayerInfo &act_info);
+    void configure(const ITensorInfo         *input1,
+                   const ITensorInfo         *input2,
+                   const ITensorInfo         *bn_mul,
+                   const ITensorInfo         *bn_add,
+                   ITensorInfo               *add_output,
+                   ITensorInfo               *final_output,
+                   ConvertPolicy              policy,
+                   const ActivationLayerInfo &act_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuAddMulAddKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
-                           const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
-                           const ITensorInfo *add_output, const ITensorInfo *final_output,
-                           ConvertPolicy policy, const ActivationLayerInfo &act_info);
+    static Status validate(const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *bn_mul,
+                           const ITensorInfo         *bn_add,
+                           const ITensorInfo         *add_output,
+                           const ITensorInfo         *final_output,
+                           ConvertPolicy              policy,
+                           const ActivationLayerInfo &act_info);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     static const std::vector<AddMulAddKernel> &get_available_kernels();
@@ -81,7 +97,7 @@
 private:
     ConvertPolicy       _policy{};
     ActivationLayerInfo _act_info{};
-    AddMulAddKernelPtr  _run_method{ nullptr };
+    AddMulAddKernelPtr  _run_method{nullptr};
     std::string         _name{};
 };
 } // namespace kernels
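
The std::add_pointer form of AddMulAddKernelPtr above is equivalent to a raw
function-pointer alias; the trait spelling simply keeps the parameter list in a shape
clang-format can bin-pack one-per-line. A minimal demonstration:

    #include <type_traits>

    using RawPtr   = void (*)(int, float);
    using TraitPtr = std::add_pointer<void(int, float)>::type;
    static_assert(std::is_same<RawPtr, TraitPtr>::value, "identical types");
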
diff --git a/src/cpu/kernels/CpuCastKernel.cpp b/src/cpu/kernels/CpuCastKernel.cpp
index 764a1ec..05c7742 100644
--- a/src/cpu/kernels/CpuCastKernel.cpp
+++ b/src/cpu/kernels/CpuCastKernel.cpp
@@ -28,16 +28,16 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/common/Registrars.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-#include "support/SaturateCast.h"
-
 #include "src/cpu/kernels/cast/list.h"
+#include "support/SaturateCast.h"
 
 namespace arm_compute
 {
@@ -47,38 +47,30 @@
 {
 namespace
 {
-static const std::vector<CpuCastKernel::CastKernel> available_kernels =
-{
-    {
-        "neon_qs8_cast",
-        [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)
-    },
-    {
-        "neon_qu8_cast",
-        [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)
-    },
-    {
-        "neon_u8_cast",
-        [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)
-    },
-    {
-        "neon_fp16_cast",
-        [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F16 && data.isa.fp16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)
-    },
-    {
-        "neon_fp32_to_fp16_cast",
-        [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)
-    },
-    {
-        "neon_s32_cast",
-        [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)
-    },
+static const std::vector<CpuCastKernel::CastKernel> available_kernels = {
+    {"neon_qs8_cast",
+     [](const CastDataTypeISASelectorData &data)
+     { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_qasymm8_signed_to_fp16_cast)},
+    {"neon_qu8_cast",
+     [](const CastDataTypeISASelectorData &data)
+     { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)},
+    {"neon_u8_cast",
+     [](const CastDataTypeISASelectorData &data)
+     { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_u8_to_fp16_cast)},
+    {"neon_fp16_cast",
+     [](const CastDataTypeISASelectorData &data) { return data.src_dt == DataType::F16 && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_to_other_dt_cast)},
+    {"neon_fp32_to_fp16_cast",
+     [](const CastDataTypeISASelectorData &data)
+     { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_fp32_to_fp16_cast)},
+    {"neon_s32_cast",
+     [](const CastDataTypeISASelectorData &data)
+     { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_s32_to_fp16_cast)},
 };
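
Each entry pairs a name, a selector predicate over (src_dt, dst_dt, isa), and a function
pointer that REGISTER_FP16_NEON resolves to a null pointer when FP16 kernels are not
compiled in. The lookup is first-match; a toy sketch of the idea (not the library's
actual get_implementation):

    #include <functional>
    #include <vector>

    struct ToyKernel
    {
        const char                   *name;
        std::function<bool(int, int)> is_selected; // predicate over (src, dst) types
        void (*ukernel)();                         // nullptr if not compiled in
    };

    const ToyKernel *select(const std::vector<ToyKernel> &table, int src, int dst)
    {
        for (const auto &k : table)
            if (k.is_selected(src, dst))
                return &k; // first matching entry wins
        return nullptr;
    }
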
 
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
@@ -88,57 +80,67 @@
     ARM_COMPUTE_UNUSED(policy);
     ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
 #ifdef __aarch64__
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
-                                                         DataType::S16, DataType::U16, DataType::F16,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                         DataType::U8, DataType::S16, DataType::U16, DataType::F16,
                                                          DataType::F32, DataType::S32, DataType::S64, DataType::U64);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
-                                                         DataType::S16, DataType::U16, DataType::F16,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                         DataType::U8, DataType::S16, DataType::U16, DataType::F16,
                                                          DataType::U32, DataType::S32, DataType::F32, DataType::S64);
 
 #else  // __aarch64__
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
-                                                         DataType::S16, DataType::U16, DataType::F16,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                         DataType::U8, DataType::S16, DataType::U16, DataType::F16,
                                                          DataType::F32, DataType::S32);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
-                                                         DataType::S16, DataType::U16, DataType::F16,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                         DataType::U8, DataType::S16, DataType::U16, DataType::F16,
                                                          DataType::U32, DataType::S32, DataType::F32);
 #endif // __aarch64__
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32
-                                                                                     && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED &&
+                                        (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32 &&
+                                         dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
                                     "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
-                                                                              && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 &&
+                                        (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 &&
+                                         dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 &&
+                                         dst->data_type() != DataType::F32),
                                     "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
-                                                                         && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 &&
+                                        (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16 &&
+                                         dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 &&
+                                         dst->data_type() != DataType::F32),
                                     "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 &&
+                                        (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
                                     "Only data_types supported [in] U16 ->  [out] U8, U32");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 &&
+                                        (dst->data_type() != DataType::QASYMM8_SIGNED &&
+                                         dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
                                     "Only data_types supported [in] S16 ->  [out] U8, S32");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
-                                                                          && dst->data_type() != DataType::U8
-                                                                          && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 &&
+                                        (dst->data_type() != DataType::QASYMM8_SIGNED &&
+                                         dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::U8 &&
+                                         dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
                                     "Only data_types supported [in] F16 ->  [out] QASYMM8, F32, S32, U8");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
-                                                                          && dst->data_type() != DataType::F16
-                                                                          && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 &&
+                                        (dst->data_type() != DataType::QASYMM8_SIGNED &&
+                                         dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 &&
+                                         dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
                                     "Only data_types supported [in] F32 ->  [out] QASYMM8, F16, S32, U8");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
-                                                                          && dst->data_type() != DataType::F16
-                                                                          && dst->data_type() != DataType::F32
-                                                                          && dst->data_type() != DataType::U8
-                                                                          && dst->data_type() != DataType::S64),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 &&
+                                        (dst->data_type() != DataType::QASYMM8_SIGNED &&
+                                         dst->data_type() != DataType::QASYMM8 && dst->data_type() != DataType::F16 &&
+                                         dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8 &&
+                                         dst->data_type() != DataType::S64),
                                     "Only data_types supported [in] S32 ->  [out] QASYMM8, F16, F32, U8, S64");
 #ifdef __aarch64__
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S64 && dst->data_type() != DataType::F32,
@@ -149,7 +151,7 @@
 #endif // __aarch64__
 
     // Validate in case of configured dst
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
     }
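
validate_arguments() is a chain of early returns: each macro either falls through or
returns a failing Status immediately, so the first violated precondition is the one
reported. A toy reduction of the pattern (the real macro bodies live in the library's
error-handling headers):

    #include <string>

    struct ToyStatus
    {
        bool        ok{true};
        std::string msg{};
    };

    #define TOY_RETURN_ERROR_ON_MSG(cond, message)  \
        do                                          \
        {                                           \
            if (cond)                               \
                return ToyStatus{false, (message)}; \
        } while (false)

    ToyStatus toy_validate(int src_dt, int dst_dt)
    {
        TOY_RETURN_ERROR_ON_MSG(src_dt == dst_dt, "in-place cast is not supported");
        TOY_RETURN_ERROR_ON_MSG(dst_dt < 0, "unknown destination type");
        return ToyStatus{}; // all checks passed
    }
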
@@ -193,15 +195,8 @@
 template <>
 inline void internal_neon_convert<int32_t, int64_t>(const int32_t *src_ptr, int64_t *dst_ptr)
 {
-    const int32x4x4_t texels =
-    {
-        {
-            vld1q_s32(src_ptr),
-            vld1q_s32(src_ptr + 4),
-            vld1q_s32(src_ptr + 8),
-            vld1q_s32(src_ptr + 12)
-        }
-    };
+    const int32x4x4_t texels = {
+        {vld1q_s32(src_ptr), vld1q_s32(src_ptr + 4), vld1q_s32(src_ptr + 8), vld1q_s32(src_ptr + 12)}};
     vst1q_s64(dst_ptr, vmovl_s32(vget_low_s32(texels.val[0])));
     vst1q_s64(dst_ptr + 2, vmovl_s32(vget_high_s32(texels.val[0])));
     vst1q_s64(dst_ptr + 4, vmovl_s32(vget_low_s32(texels.val[1])));
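
The intrinsic sequence sign-extends each 32-bit lane to 64 bits: vget_low_s32 and
vget_high_s32 split a 128-bit register into halves, and vmovl_s32 widens each half.
Per element it is simply:

    // Scalar reference for the widening store above (16 lanes per iteration).
    for (int i = 0; i < 16; ++i)
        dst_ptr[i] = static_cast<int64_t>(src_ptr[i]);
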
@@ -215,33 +210,14 @@
 template <>
 inline void internal_neon_convert<int64_t, float>(const int64_t *src_ptr, float *dst_ptr)
 {
-    const float64x2x4_t texels0 =
-    {
-        {
-            vcvtq_f64_s64(vld1q_s64(src_ptr)),
-            vcvtq_f64_s64(vld1q_s64(src_ptr + 2)),
-            vcvtq_f64_s64(vld1q_s64(src_ptr + 4)),
-            vcvtq_f64_s64(vld1q_s64(src_ptr + 6))
-        }
-    };
-    const float64x2x4_t texels1 =
-    {
-        {
-            vcvtq_f64_s64(vld1q_s64(src_ptr + 8)),
-            vcvtq_f64_s64(vld1q_s64(src_ptr + 10)),
-            vcvtq_f64_s64(vld1q_s64(src_ptr + 12)),
-            vcvtq_f64_s64(vld1q_s64(src_ptr + 14))
-        }
-    };
-    const float32x4x4_t texels =
-    {
-        {
-            vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
-            vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
-            vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
-            vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))
-        }
-    };
+    const float64x2x4_t texels0 = {{vcvtq_f64_s64(vld1q_s64(src_ptr)), vcvtq_f64_s64(vld1q_s64(src_ptr + 2)),
+                                    vcvtq_f64_s64(vld1q_s64(src_ptr + 4)), vcvtq_f64_s64(vld1q_s64(src_ptr + 6))}};
+    const float64x2x4_t texels1 = {{vcvtq_f64_s64(vld1q_s64(src_ptr + 8)), vcvtq_f64_s64(vld1q_s64(src_ptr + 10)),
+                                    vcvtq_f64_s64(vld1q_s64(src_ptr + 12)), vcvtq_f64_s64(vld1q_s64(src_ptr + 14))}};
+    const float32x4x4_t texels  = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
+                                    vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
+                                    vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
+                                    vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}};
     vst1q_f32(dst_ptr, texels.val[0]);
     vst1q_f32(dst_ptr + 4, texels.val[1]);
     vst1q_f32(dst_ptr + 8, texels.val[2]);
@@ -251,34 +227,15 @@
 template <>
 inline void internal_neon_convert<uint64_t, float>(const uint64_t *src_ptr, float *dst_ptr)
 {
-    const float64x2x4_t texels0 =
-    {
-        {
-            vcvtq_f64_u64(vld1q_u64(src_ptr)),
-            vcvtq_f64_u64(vld1q_u64(src_ptr + 2)),
-            vcvtq_f64_u64(vld1q_u64(src_ptr + 4)),
-            vcvtq_f64_u64(vld1q_u64(src_ptr + 6))
-        }
-    };
-    const float64x2x4_t texels1 =
-    {
-        {
-            vcvtq_f64_u64(vld1q_u64(src_ptr + 8)),
-            vcvtq_f64_u64(vld1q_u64(src_ptr + 10)),
-            vcvtq_f64_u64(vld1q_u64(src_ptr + 12)),
-            vcvtq_f64_u64(vld1q_u64(src_ptr + 14))
-        }
-    };
+    const float64x2x4_t texels0 = {{vcvtq_f64_u64(vld1q_u64(src_ptr)), vcvtq_f64_u64(vld1q_u64(src_ptr + 2)),
+                                    vcvtq_f64_u64(vld1q_u64(src_ptr + 4)), vcvtq_f64_u64(vld1q_u64(src_ptr + 6))}};
+    const float64x2x4_t texels1 = {{vcvtq_f64_u64(vld1q_u64(src_ptr + 8)), vcvtq_f64_u64(vld1q_u64(src_ptr + 10)),
+                                    vcvtq_f64_u64(vld1q_u64(src_ptr + 12)), vcvtq_f64_u64(vld1q_u64(src_ptr + 14))}};
 
-    const float32x4x4_t texels =
-    {
-        {
-            vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
-            vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
-            vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
-            vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))
-        }
-    };
+    const float32x4x4_t texels = {{vcombine_f32(vcvt_f32_f64(texels0.val[0]), vcvt_f32_f64(texels0.val[1])),
+                                   vcombine_f32(vcvt_f32_f64(texels0.val[2]), vcvt_f32_f64(texels0.val[3])),
+                                   vcombine_f32(vcvt_f32_f64(texels1.val[0]), vcvt_f32_f64(texels1.val[1])),
+                                   vcombine_f32(vcvt_f32_f64(texels1.val[2]), vcvt_f32_f64(texels1.val[3]))}};
 
     vst1q_f32(dst_ptr, texels.val[0]);
     vst1q_f32(dst_ptr + 4, texels.val[1]);
@@ -287,23 +244,26 @@
 }
 
 template <typename T1, typename T2>
-inline void convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x)
+inline void
+convert64(Iterator &src, Iterator &dst, const Window &win, int window_start_x, int window_end_x, int window_step_x)
 {
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr());
-        const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr());
-        int        x       = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            internal_neon_convert<T1, T2>(src_ptr + x, dst_ptr + x);
-        }
-        for(; x < window_end_x; ++x)
-        {
-            *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x));
-        }
-    },
-    src, dst);
+            const auto src_ptr = reinterpret_cast<const T1 *>(src.ptr());
+            const auto dst_ptr = reinterpret_cast<T2 *>(dst.ptr());
+            int        x       = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                internal_neon_convert<T1, T2>(src_ptr + x, dst_ptr + x);
+            }
+            for (; x < window_end_x; ++x)
+            {
+                *(dst_ptr + x) = static_cast<T2>(*(src_ptr + x));
+            }
+        },
+        src, dst);
 }
 } // namespace
 #endif // __aarch64__
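
Every cast body in this file follows the same shape: execute_window_loop drives the outer
window (DimX collapsed to a single step) and the lambda performs a vectorised main loop
plus a scalar tail for the leftover elements. A self-contained skeleton of that shape,
with the intrinsics replaced by a plain loop (names are local to this sketch):

    #include <cstdint>

    void convert_row(const uint8_t *src_row, int16_t *dst_row,
                     int window_start_x, int window_end_x, int window_step_x)
    {
        int x = window_start_x;
        for (; x <= window_end_x - window_step_x; x += window_step_x)
        {
            // Vector path: converts window_step_x (e.g. 16) elements per iteration.
            for (int i = 0; i < window_step_x; ++i)
                dst_row[x + i] = static_cast<int16_t>(src_row[x + i]);
        }
        for (; x < window_end_x; ++x)
        {
            dst_row[x] = static_cast<int16_t>(src_row[x]); // leftover elements
        }
    }
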
@@ -325,21 +285,22 @@
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     Iterator src(_src, win);
     Iterator dst(_dst, win);
 
     /* The micro-kernel table only covers FP16 paths, so uk may be nullptr here; it is validated just before use. */
-    const auto *uk = CpuCastKernel::get_implementation(CastDataTypeISASelectorData{ _src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa() });
+    const auto *uk = CpuCastKernel::get_implementation(
+        CastDataTypeISASelectorData{_src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa()});
 
-    switch(_src->info()->data_type())
+    switch (_src->info()->data_type())
     {
 #ifdef __aarch64__
         case DataType::U64:
         {
-            switch(_dst->info()->data_type())
+            switch (_dst->info()->data_type())
             {
                 case DataType::F32:
                 {
@@ -353,7 +314,7 @@
         }
         case DataType::S64:
         {
-            switch(_dst->info()->data_type())
+            switch (_dst->info()->data_type())
             {
                 case DataType::F32:
                 {
@@ -369,111 +330,102 @@
 
         case DataType::QASYMM8_SIGNED:
         {
-            switch(_dst->info()->data_type())
+            switch (_dst->info()->data_type())
             {
                 case DataType::S16:
                 {
                     /* Up-conversion QASYMM8_SIGNED -> S16 */
-                    execute_window_loop(win, [&](const Coordinates &)
-                    {
-                        const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
-                        const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-                        int        x       = window_start_x;
-
-                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                    execute_window_loop(
+                        win,
+                        [&](const Coordinates &)
                         {
-                            const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+                            const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+                            const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
+                            int        x       = window_start_x;
 
-                            const int16x8x2_t texels =
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
-                                {
-                                    vmovl_s8(vget_low_s8(texels_s8)),
-                                    vmovl_s8(vget_high_s8(texels_s8))
-                                }
-                            };
+                                const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
 
-                            vst1q_s16(dst_ptr + x, texels.val[0]);
-                            vst1q_s16(dst_ptr + x + 8, texels.val[1]);
-                        }
+                                const int16x8x2_t texels = {
+                                    {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
 
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
-                        }
-                    },
-                    src, dst);
+                                vst1q_s16(dst_ptr + x, texels.val[0]);
+                                vst1q_s16(dst_ptr + x + 8, texels.val[1]);
+                            }
+
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
+                            }
+                        },
+                        src, dst);
                     break;
                 }
                 case DataType::S32:
                 {
                     /* Up-conversion QASYMM8_SIGNED -> S32 */
-                    execute_window_loop(win, [&](const Coordinates &)
-                    {
-                        const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
-                        const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-                        int        x       = window_start_x;
-
-                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                    execute_window_loop(
+                        win,
+                        [&](const Coordinates &)
                         {
-                            const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+                            const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+                            const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+                            int        x       = window_start_x;
 
-                            const int16x8x2_t texels =
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
-                                {
-                                    vmovl_s8(vget_low_s8(texels_s8)),
-                                    vmovl_s8(vget_high_s8(texels_s8))
-                                }
-                            };
+                                const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
 
-                            vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
-                            vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
-                            vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
-                            vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
-                        }
+                                const int16x8x2_t texels = {
+                                    {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
 
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
-                        }
-                    },
-                    src, dst);
+                                vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
+                                vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+                                vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+                                vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+                            }
+
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+                            }
+                        },
+                        src, dst);
                     break;
                 }
                 case DataType::F32:
                 {
                     /* Up-conversion QASYMM8_SIGNED -> F32 */
-                    execute_window_loop(win, [&](const Coordinates &)
-                    {
-                        const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
-                        const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
-                        int x = window_start_x;
-                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                    execute_window_loop(
+                        win,
+                        [&](const Coordinates &)
                         {
-                            const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+                            const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+                            const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
 
-                            const int16x8x2_t texels =
+                            int x = window_start_x;
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
-                                {
-                                    vmovl_s8(vget_low_s8(texels_s8)),
-                                    vmovl_s8(vget_high_s8(texels_s8))
-                                }
-                            };
-                            vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
-                            vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
-                            vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
-                            vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
-                        }
+                                const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
 
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
-                        }
-                    },
-                    src, dst);
+                                const int16x8x2_t texels = {
+                                    {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
+                                vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
+                                vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
+                                vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
+                                vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
+                            }
+
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+                            }
+                        },
+                        src, dst);
                     break;
                 }
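
The F32 path widens in stages: vmovl_s8 (s8 -> s16), vmovl_s16 (s16 -> s32), then
vcvtq_f32_s32 (s32 -> f32). Note this converts the raw stored values, ignoring the
quantization info; per lane it amounts to:

    #include <cstdint>

    float widen_s8_to_f32(int8_t v)
    {
        return static_cast<float>(static_cast<int32_t>(static_cast<int16_t>(v)));
    }
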
                 case DataType::F16:
@@ -492,111 +444,102 @@
         case DataType::QASYMM8:
         case DataType::U8:
         {
-            switch(_dst->info()->data_type())
+            switch (_dst->info()->data_type())
             {
                 case DataType::S16:
                 {
                     /* Up-conversion U8 -> S16 */
-                    execute_window_loop(win, [&](const Coordinates &)
-                    {
-                        const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
-                        const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
-                        int x = window_start_x;
-                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                    execute_window_loop(
+                        win,
+                        [&](const Coordinates &)
                         {
-                            const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+                            const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+                            const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
 
-                            const int16x8x2_t texels =
+                            int x = window_start_x;
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
-                                {
-                                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
-                                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
-                                }
-                            };
+                                const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
 
-                            vst1q_s16(dst_ptr + x, texels.val[0]);
-                            vst1q_s16(dst_ptr + x + 8, texels.val[1]);
-                        }
+                                const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+                                                             vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
 
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
-                        }
-                    },
-                    src, dst);
+                                vst1q_s16(dst_ptr + x, texels.val[0]);
+                                vst1q_s16(dst_ptr + x + 8, texels.val[1]);
+                            }
+
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
+                            }
+                        },
+                        src, dst);
                     break;
                 }
                 case DataType::S32:
                 {
                     /* Up-conversion U8 -> S32 */
-                    execute_window_loop(win, [&](const Coordinates &)
-                    {
-                        const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
-                        const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
-                        int x = window_start_x;
-                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                    execute_window_loop(
+                        win,
+                        [&](const Coordinates &)
                         {
-                            const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+                            const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+                            const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
 
-                            const int16x8x2_t texels =
+                            int x = window_start_x;
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
-                                {
-                                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
-                                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
-                                }
-                            };
+                                const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
 
-                            vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
-                            vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
-                            vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
-                            vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
-                        }
+                                const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+                                                             vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
 
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
-                        }
-                    },
-                    src, dst);
+                                vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
+                                vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
+                                vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
+                                vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
+                            }
+
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+                            }
+                        },
+                        src, dst);
                     break;
                 }
                 case DataType::F32:
                 {
                     /* Up-conversion U8 -> F32 */
-                    execute_window_loop(win, [&](const Coordinates &)
-                    {
-                        const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
-                        const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
-                        int x = window_start_x;
-                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                    execute_window_loop(
+                        win,
+                        [&](const Coordinates &)
                         {
-                            const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+                            const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+                            const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
 
-                            const int16x8x2_t texels =
+                            int x = window_start_x;
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
-                                {
-                                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
-                                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
-                                }
-                            };
-                            vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
-                            vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
-                            vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
-                            vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
-                        }
+                                const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
 
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
-                        }
-                    },
-                    src, dst);
+                                const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+                                                             vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
+                                vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
+                                vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
+                                vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
+                                vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
+                            }
+
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+                            }
+                        },
+                        src, dst);
                     break;
                 }
                 case DataType::F16:
@@ -609,35 +552,32 @@
                 case DataType::U16:
                 {
                     /* Up-conversion U8 -> U16 */
-                    execute_window_loop(win, [&](const Coordinates &)
-                    {
-                        const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
-                        const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
-
-                        int x = window_start_x;
-                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                    execute_window_loop(
+                        win,
+                        [&](const Coordinates &)
                         {
-                            const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+                            const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+                            const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
 
-                            const uint16x8x2_t texels =
+                            int x = window_start_x;
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
-                                {
-                                    vmovl_u8(vget_low_u8(texels_u8)),
-                                    vmovl_u8(vget_high_u8(texels_u8))
-                                }
-                            };
+                                const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
 
-                            vst1q_u16(dst_ptr + x, texels.val[0]);
-                            vst1q_u16(dst_ptr + x + 8, texels.val[1]);
-                        }
+                                const uint16x8x2_t texels = {
+                                    {vmovl_u8(vget_low_u8(texels_u8)), vmovl_u8(vget_high_u8(texels_u8))}};
 
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
-                        }
-                    },
-                    src, dst);
+                                vst1q_u16(dst_ptr + x, texels.val[0]);
+                                vst1q_u16(dst_ptr + x + 8, texels.val[1]);
+                            }
+
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
+                            }
+                        },
+                        src, dst);
                     break;
                 }
                 default:
@@ -647,177 +587,154 @@
         }
         case DataType::S16:
         {
-            switch(_dst->info()->data_type())
+            switch (_dst->info()->data_type())
             {
                 case DataType::QASYMM8_SIGNED:
                 {
                     /* Down-conversion S16 -> QASYMM8_SIGNED */
-                    if(ConvertPolicy::SATURATE == _policy)
+                    if (ConvertPolicy::SATURATE == _policy)
                     {
-                        execute_window_loop(win, [&](const Coordinates &)
-                        {
-                            const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
-                            const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
-                            int x = window_start_x;
-                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                        execute_window_loop(
+                            win,
+                            [&](const Coordinates &)
                             {
-                                const int16x8x2_t texels =
+                                const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+                                const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+                                int x = window_start_x;
+                                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
-                                    {
-                                        vld1q_s16(src_ptr + x),
-                                        vld1q_s16(src_ptr + x + 8)
-                                    }
-                                };
+                                    const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
 
-                                vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
-                            }
+                                    vst1q_s8(dst_ptr + x,
+                                             vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
+                                }
 
-                            // Compute left-over elements
-                            for(; x < window_end_x; ++x)
-                            {
-                                *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
-                            }
-                        },
-                        src, dst);
+                                // Compute left-over elements
+                                for (; x < window_end_x; ++x)
+                                {
+                                    *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+                                }
+                            },
+                            src, dst);
                     }
                     else
                     {
-                        execute_window_loop(win, [&](const Coordinates &)
-                        {
-                            const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
-                            const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
-                            int x = window_start_x;
-                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                        execute_window_loop(
+                            win,
+                            [&](const Coordinates &)
                             {
-                                const int16x8x2_t texels =
+                                const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+                                const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+                                int x = window_start_x;
+                                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
-                                    {
-                                        vld1q_s16(src_ptr + x),
-                                        vld1q_s16(src_ptr + x + 8)
-                                    }
-                                };
+                                    const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
 
-                                vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
-                            }
+                                    vst1q_s8(dst_ptr + x,
+                                             vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
+                                }
 
-                            // Compute left-over elements
-                            for(; x < window_end_x; ++x)
-                            {
-                                *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
-                            }
-                        },
-                        src, dst);
+                                // Compute left-over elements
+                                for (; x < window_end_x; ++x)
+                                {
+                                    *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
+                                }
+                            },
+                            src, dst);
                     }
                     break;
                 }
                 case DataType::U8:
                 {
                     /* Down-conversion S16 -> U8 */
-                    if(ConvertPolicy::SATURATE == _policy)
+                    if (ConvertPolicy::SATURATE == _policy)
                     {
-                        execute_window_loop(win, [&](const Coordinates &)
-                        {
-                            const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
-                            const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
-                            int x = window_start_x;
-                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                        execute_window_loop(
+                            win,
+                            [&](const Coordinates &)
                             {
-                                const int16x8x2_t texels =
+                                const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+                                const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+                                int x = window_start_x;
+                                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
-                                    {
-                                        vld1q_s16(src_ptr + x),
-                                        vld1q_s16(src_ptr + x + 8)
-                                    }
-                                };
+                                    const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
 
-                                vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
-                            }
+                                    vst1q_u8(dst_ptr + x,
+                                             vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
+                                }
 
-                            // Compute left-over elements
-                            for(; x < window_end_x; ++x)
-                            {
-                                *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
-                            }
-                        },
-                        src, dst);
+                                // Compute left-over elements
+                                for (; x < window_end_x; ++x)
+                                {
+                                    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+                                }
+                            },
+                            src, dst);
                     }
                     else
                     {
-                        execute_window_loop(win, [&](const Coordinates &)
-                        {
-                            const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
-                            const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
-                            int x = window_start_x;
-                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                        execute_window_loop(
+                            win,
+                            [&](const Coordinates &)
                             {
-                                const int16x8x2_t texels =
+                                const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+                                const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+                                int x = window_start_x;
+                                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
-                                    {
-                                        vld1q_s16(src_ptr + x),
-                                        vld1q_s16(src_ptr + x + 8)
-                                    }
-                                };
+                                    const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
 
-                                vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
-                                                                  vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
-                            }
+                                    vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
+                                                                      vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
+                                }
 
-                            // Compute left-over elements
-                            for(; x < window_end_x; ++x)
-                            {
-                                *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
-                            }
-                        },
-                        src, dst);
+                                // Compute left-over elements
+                                for (; x < window_end_x; ++x)
+                                {
+                                    *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
+                                }
+                            },
+                            src, dst);
                     }
                     break;
                 }
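
The two S16 -> U8 branches differ only in the narrowing step: vqmovun_s16 saturates
(negatives clamp to 0, values above 255 clamp to 255), while the non-saturating branch's
vmovn_u16 keeps only the low byte. Scalar equivalents:

    #include <cstdint>

    uint8_t narrow_saturate(int16_t v) // ConvertPolicy::SATURATE, cf. vqmovun_s16
    {
        return v < 0 ? 0 : (v > 255 ? 255 : static_cast<uint8_t>(v));
    }

    uint8_t narrow_wrap(int16_t v) // otherwise, cf. vmovn_u16
    {
        return static_cast<uint8_t>(v); // low 8 bits only
    }
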
                 case DataType::S32:
                 {
                     /* Up-conversion S16 -> S32 */
-                    execute_window_loop(win, [&](const Coordinates &)
-                    {
-                        const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
-                        const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
-                        int x = window_start_x;
-                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                    execute_window_loop(
+                        win,
+                        [&](const Coordinates &)
                         {
-                            const int16x8x2_t texels =
+                            const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
+                            const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+                            int x = window_start_x;
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
-                                {
-                                    vld1q_s16(src_ptr + x),
-                                    vld1q_s16(src_ptr + x + 8)
-                                }
-                            };
+                                const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
 
-                            const int32x4x4_t texels_s32 =
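+                                // vmovl_s16 sign-extends the low and high halves of each S16 vector to S32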
+                                const int32x4x4_t texels_s32 = {
+                                    {vmovl_s16(vget_low_s16(texels.val[0])), vmovl_s16(vget_high_s16(texels.val[0])),
+                                     vmovl_s16(vget_low_s16(texels.val[1])), vmovl_s16(vget_high_s16(texels.val[1]))}};
+
+                                vst1q_s32(dst_ptr + x, texels_s32.val[0]);
+                                vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
+                                vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
+                                vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
+                            }
+
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
                             {
-                                {
-                                    vmovl_s16(vget_low_s16(texels.val[0])),
-                                    vmovl_s16(vget_high_s16(texels.val[0])),
-                                    vmovl_s16(vget_low_s16(texels.val[1])),
-                                    vmovl_s16(vget_high_s16(texels.val[1]))
-                                }
-                            };
-
-                            vst1q_s32(dst_ptr + x, texels_s32.val[0]);
-                            vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
-                            vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
-                            vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
-                        }
-
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
-                        }
-                    },
-                    src, dst);
+                                *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+                            }
+                        },
+                        src, dst);
                     break;
                 }
                 default:
@@ -828,104 +745,92 @@
 
         case DataType::U16:
         {
-            switch(_dst->info()->data_type())
+            switch (_dst->info()->data_type())
             {
                 case DataType::U8:
                 {
                     /* Down-conversion U16 -> U8 */
-                    if(ConvertPolicy::SATURATE == _policy)
+                    if (ConvertPolicy::SATURATE == _policy)
                     {
-                        execute_window_loop(win, [&](const Coordinates &)
-                        {
-                            const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
-                            const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
-                            int x = window_start_x;
-                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                        execute_window_loop(
+                            win,
+                            [&](const Coordinates &)
                             {
-                                const uint16x8x2_t texels =
+                                const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
+                                const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+                                int x = window_start_x;
+                                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
-                                    {
-                                        vld1q_u16(src_ptr + x),
-                                        vld1q_u16(src_ptr + x + 8)
-                                    }
-                                };
+                                    const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
 
-                                vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
-                            }
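+                                    // vqmovn_u16 narrows with unsigned saturation, clamping values above 255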
+                                    vst1q_u8(dst_ptr + x,
+                                             vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
+                                }
 
-                            // Compute left-over elements
-                            for(; x < window_end_x; ++x)
-                            {
-                                *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
-                            }
-                        },
-                        src, dst);
+                                // Compute left-over elements
+                                for (; x < window_end_x; ++x)
+                                {
+                                    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+                                }
+                            },
+                            src, dst);
                     }
                     else
                     {
-                        execute_window_loop(win, [&](const Coordinates &)
-                        {
-                            const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
-                            const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
-                            int x = window_start_x;
-                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                        execute_window_loop(
+                            win,
+                            [&](const Coordinates &)
                             {
-                                const uint16x8x2_t texels =
+                                const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
+                                const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+                                int x = window_start_x;
+                                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
-                                    {
-                                        vld1q_u16(src_ptr + x),
-                                        vld1q_u16(src_ptr + x + 8)
-                                    }
-                                };
+                                    const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
 
-                                vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
-                            }
+                                    vst1q_u8(dst_ptr + x,
+                                             vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
+                                }
 
-                            // Compute left-over elements
-                            for(; x < window_end_x; ++x)
-                            {
-                                *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
-                            }
-
-                        },
-                        src, dst);
+                                // Compute left-over elements
+                                for (; x < window_end_x; ++x)
+                                {
+                                    *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
+                                }
+                            },
+                            src, dst);
                     }
                     break;
                 }
                 case DataType::U32:
                 {
                     /* Up-conversion U16 -> U32 */
-                    execute_window_loop(win, [&](const Coordinates &)
-                    {
-                        const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
-                        const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
-
-                        int x = window_start_x;
-                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                    execute_window_loop(
+                        win,
+                        [&](const Coordinates &)
                         {
-                            const uint16x8x2_t texels =
+                            const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
+                            const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
+
+                            int x = window_start_x;
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
-                                {
-                                    vld1q_u16(src_ptr + x),
-                                    vld1q_u16(src_ptr + x + 8)
-                                }
-                            };
+                                const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
 
-                            vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
-                            vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
-                            vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
-                            vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
-                        }
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
-                        }
-
-                    },
-                    src, dst);
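+                                // vmovl_u16 zero-extends each half of the two U16 vectors into four U32 vectors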
+                                vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
+                                vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
+                                vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
+                                vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
+                            }
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
+                            }
+                        },
+                        src, dst);
                     break;
                 }
                 default:
@@ -941,7 +846,7 @@
             break;
         }
         case DataType::F32:
-            switch(_dst->info()->data_type())
+            switch (_dst->info()->data_type())
             {
                 case DataType::F16:
                 {
@@ -953,105 +858,110 @@
                 case DataType::S32:
                 {
                     /* Conversion F32 -> S32 */
-                    execute_window_loop(win, [&](const Coordinates &)
-                    {
-                        const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
-                        const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
-                        int x = window_start_x;
-                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                    execute_window_loop(
+                        win,
+                        [&](const Coordinates &)
                         {
-                            const float32x4x4_t texels =
+                            const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+                            const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+                            int x = window_start_x;
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
-                                {
+                                const float32x4x4_t texels = {{
                                     vld1q_f32(src_ptr + x),
                                     vld1q_f32(src_ptr + x + 4),
                                     vld1q_f32(src_ptr + x + 8),
                                     vld1q_f32(src_ptr + x + 12),
-                                }
-                            };
+                                }};
 
-                            vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
-                            vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
-                            vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
-                            vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
-                        }
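+                                // vcvtq_s32_f32 rounds toward zero, matching a C-style float-to-int cast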
+                                vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
+                                vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
+                                vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
+                                vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
+                            }
 
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
-                        }
-                    },
-                    src, dst);
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+                            }
+                        },
+                        src, dst);
                     break;
                 }
                 case DataType::QASYMM8:
                 case DataType::U8:
                 {
                     /* Down-conversion F32 -> U8 */
-                    execute_window_loop(win, [&](const Coordinates &)
-                    {
-                        const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
-                        const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
-                        int x = window_start_x;
-                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                    execute_window_loop(
+                        win,
+                        [&](const Coordinates &)
                         {
-                            const float32x4x4_t texels =
+                            const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+                            const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+                            int x = window_start_x;
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
-                                {
+                                const float32x4x4_t texels = {{
                                     vld1q_f32(src_ptr + x),
                                     vld1q_f32(src_ptr + x + 4),
                                     vld1q_f32(src_ptr + x + 8),
                                     vld1q_f32(src_ptr + x + 12),
-                                }
-                            };
+                                }};
 
-                            vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
-                            vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
-                        }
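+                                // Convert to S32 first, then vqmovun_s32/vqmovn_u16 saturate the result into [0, 255]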
+                                vst1_u8(dst_ptr + x,
+                                        vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])),
+                                                                vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
+                                vst1_u8(dst_ptr + x + 8,
+                                        vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])),
+                                                                vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
+                            }
 
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
-                        }
-                    },
-                    src, dst);
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+                            }
+                        },
+                        src, dst);
                     break;
                 }
                 case DataType::QASYMM8_SIGNED:
                 {
                     /* Down-conversion F32 -> QASYMM8_SIGNED */
-                    execute_window_loop(win, [&](const Coordinates &)
-                    {
-                        const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
-                        const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
-                        int x = window_start_x;
-                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                    execute_window_loop(
+                        win,
+                        [&](const Coordinates &)
                         {
-                            const float32x4x4_t texels =
+                            const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+                            const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+                            int x = window_start_x;
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
-                                {
+                                const float32x4x4_t texels = {{
                                     vld1q_f32(src_ptr + x),
                                     vld1q_f32(src_ptr + x + 4),
                                     vld1q_f32(src_ptr + x + 8),
                                     vld1q_f32(src_ptr + x + 12),
-                                }
-                            };
+                                }};
 
-                            vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
-                            vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
-                        }
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
-                        }
-                    },
-                    src, dst);
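+                                // vqmovn_s32/vqmovn_s16 narrow with signed saturation into [-128, 127]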
+                                vst1_s8(dst_ptr + x,
+                                        vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])),
+                                                                vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
+                                vst1_s8(dst_ptr + x + 8,
+                                        vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])),
+                                                                vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
+                            }
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+                            }
+                        },
+                        src, dst);
                     break;
                 }
 
@@ -1060,7 +970,7 @@
             }
             break;
         case DataType::S32:
-            switch(_dst->info()->data_type())
+            switch (_dst->info()->data_type())
             {
 #if __aarch64__
                 case DataType::S64:
@@ -1079,104 +989,102 @@
                 case DataType::F32:
                 {
                     /* Conversion S32 -> F32 */
-                    execute_window_loop(win, [&](const Coordinates &)
-                    {
-                        const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
-                        const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
-                        int x = window_start_x;
-                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                    execute_window_loop(
+                        win,
+                        [&](const Coordinates &)
                         {
-                            const int32x4x4_t texels =
+                            const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+                            const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
+
+                            int x = window_start_x;
+                            for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
-                                {
+                                const int32x4x4_t texels = {{
                                     vld1q_s32(src_ptr + x),
                                     vld1q_s32(src_ptr + x + 4),
                                     vld1q_s32(src_ptr + x + 8),
                                     vld1q_s32(src_ptr + x + 12),
-                                }
-                            };
+                                }};
 
-                            vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
-                            vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
-                            vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
-                            vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
-                        }
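+                                // vcvtq_f32_s32 is exact for |x| < 2^24; larger magnitudes round to the nearest float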
+                                vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
+                                vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
+                                vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
+                                vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
+                            }
 
-                        // Compute left-over elements
-                        for(; x < window_end_x; ++x)
-                        {
-                            *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
-                        }
-                    },
-                    src, dst);
+                            // Compute left-over elements
+                            for (; x < window_end_x; ++x)
+                            {
+                                *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+                            }
+                        },
+                        src, dst);
                     break;
                 }
                 case DataType::QASYMM8_SIGNED:
                 {
                     /* Down-conversion S32 -> QASYMM8_SIGNED */
-                    if(ConvertPolicy::SATURATE == _policy)
+                    if (ConvertPolicy::SATURATE == _policy)
                     {
-                        execute_window_loop(win, [&](const Coordinates &)
-                        {
-                            const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
-                            const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
-                            int x = window_start_x;
-                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                        execute_window_loop(
+                            win,
+                            [&](const Coordinates &)
                             {
-                                const int32x4x4_t texels =
+                                const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+                                const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+                                int x = window_start_x;
+                                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
-                                    {
+                                    const int32x4x4_t texels = {{
                                         vld1q_s32(src_ptr + x),
                                         vld1q_s32(src_ptr + x + 4),
                                         vld1q_s32(src_ptr + x + 8),
                                         vld1q_s32(src_ptr + x + 12),
-                                    }
-                                };
-                                vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
-                                vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
-                            }
+                                    }};
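+                                    // Two saturating narrows in sequence: S32 -> S16 -> S8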
+                                    vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]),
+                                                                                 vqmovn_s32(texels.val[1]))));
+                                    vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]),
+                                                                                     vqmovn_s32(texels.val[3]))));
+                                }
 
-                            // Compute left-over elements
-                            for(; x < window_end_x; ++x)
-                            {
-                                *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
-                            }
-                        },
-                        src, dst);
+                                // Compute left-over elements
+                                for (; x < window_end_x; ++x)
+                                {
+                                    *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+                                }
+                            },
+                            src, dst);
                     }
                     else
                     {
-                        execute_window_loop(win, [&](const Coordinates &)
-                        {
-                            const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
-                            const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
-                            int x = window_start_x;
-                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                        execute_window_loop(
+                            win,
+                            [&](const Coordinates &)
                             {
-                                const int32x4x4_t texels =
+                                const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+                                const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+                                int x = window_start_x;
+                                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
-                                    {
-                                        vld1q_s32(src_ptr + x),
-                                        vld1q_s32(src_ptr + x + 4),
-                                        vld1q_s32(src_ptr + x + 8),
-                                        vld1q_s32(src_ptr + x + 12)
-                                    }
-                                };
+                                    const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
+                                                                 vld1q_s32(src_ptr + x + 8),
+                                                                 vld1q_s32(src_ptr + x + 12)}};
 
-                                vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
-                                vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
-                            }
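+                                    // vmovn keeps only the low bits, so out-of-range values wrap rather than clamp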
+                                    vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]),
+                                                                                vmovn_s32(texels.val[1]))));
+                                    vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]),
+                                                                                    vmovn_s32(texels.val[3]))));
+                                }
 
-                            // Compute left-over elements
-                            for(; x < window_end_x; ++x)
-                            {
-                                *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
-                            }
-                        },
-                        src, dst);
+                                // Compute left-over elements
+                                for (; x < window_end_x; ++x)
+                                {
+                                    *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
+                                }
+                            },
+                            src, dst);
                     }
                     break;
                 }
@@ -1184,68 +1092,66 @@
                 case DataType::U8:
                 {
                     /* Down-conversion S32 -> U8 */
-                    if(ConvertPolicy::SATURATE == _policy)
+                    if (ConvertPolicy::SATURATE == _policy)
                     {
-                        execute_window_loop(win, [&](const Coordinates &)
-                        {
-                            const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
-                            const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
-                            int x = window_start_x;
-                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                        execute_window_loop(
+                            win,
+                            [&](const Coordinates &)
                             {
-                                const int32x4x4_t texels =
+                                const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+                                const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+                                int x = window_start_x;
+                                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
-                                    {
-                                        vld1q_s32(src_ptr + x),
-                                        vld1q_s32(src_ptr + x + 4),
-                                        vld1q_s32(src_ptr + x + 8),
-                                        vld1q_s32(src_ptr + x + 12)
-                                    }
-                                };
-                                vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
-                                vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
-                            }
+                                    const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
+                                                                 vld1q_s32(src_ptr + x + 8),
+                                                                 vld1q_s32(src_ptr + x + 12)}};
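+                                    // vqmovun_s32 clamps negatives to zero before the final saturating narrow to U8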
+                                    vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]),
+                                                                                 vqmovun_s32(texels.val[1]))));
+                                    vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]),
+                                                                                     vqmovun_s32(texels.val[3]))));
+                                }
 
-                            // Compute left-over elements
-                            for(; x < window_end_x; ++x)
-                            {
-                                *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
-                            }
-                        },
-                        src, dst);
+                                // Compute left-over elements
+                                for (; x < window_end_x; ++x)
+                                {
+                                    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+                                }
+                            },
+                            src, dst);
                     }
                     else
                     {
-                        execute_window_loop(win, [&](const Coordinates &)
-                        {
-                            const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
-                            const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
-                            int x = window_start_x;
-                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+                        execute_window_loop(
+                            win,
+                            [&](const Coordinates &)
                             {
-                                const int32x4x4_t texels =
+                                const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+                                const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+                                int x = window_start_x;
+                                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
-                                    {
-                                        vld1q_s32(src_ptr + x),
-                                        vld1q_s32(src_ptr + x + 4),
-                                        vld1q_s32(src_ptr + x + 8),
-                                        vld1q_s32(src_ptr + x + 12)
-                                    }
-                                };
+                                    const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
+                                                                 vld1q_s32(src_ptr + x + 8),
+                                                                 vld1q_s32(src_ptr + x + 12)}};
 
-                                vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
-                                vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
-                            }
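+                                    // Reinterpret as U32 and narrow without saturation; values outside U8 wrap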
+                                    vst1_u8(dst_ptr + x,
+                                            vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])),
+                                                                   vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
+                                    vst1_u8(dst_ptr + x + 8,
+                                            vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])),
+                                                                   vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
+                                }
 
-                            // Compute left-over elements
-                            for(; x < window_end_x; ++x)
-                            {
-                                *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
-                            }
-                        },
-                        src, dst);
+                                // Compute left-over elements
+                                for (; x < window_end_x; ++x)
+                                {
+                                    *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
+                                }
+                            },
+                            src, dst);
                     }
                     break;
                 }
diff --git a/src/cpu/kernels/CpuCastKernel.h b/src/cpu/kernels/CpuCastKernel.h
index a7e6417..ddbfe1f 100644
--- a/src/cpu/kernels/CpuCastKernel.h
+++ b/src/cpu/kernels/CpuCastKernel.h
@@ -40,7 +40,8 @@
 class CpuCastKernel : public ICpuKernel<CpuCastKernel>
 {
 private:
-    using CastKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const ThreadInfo &, ConvertPolicy, const Window &)>::type;
+    using CastKernelPtr =
+        std::add_pointer<void(const ITensor *, ITensor *, const ThreadInfo &, ConvertPolicy, const Window &)>::type;
 
 public:
     CpuCastKernel() = default;
@@ -76,7 +77,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     struct CastKernel
@@ -89,7 +90,7 @@
     static const std::vector<CastKernel> &get_available_kernels();
 
 private:
-    ConvertPolicy _policy{ ConvertPolicy::SATURATE };
+    ConvertPolicy _policy{ConvertPolicy::SATURATE};
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuCol2ImKernel.cpp b/src/cpu/kernels/CpuCol2ImKernel.cpp
index bf5a44d..a52a1f5 100644
--- a/src/cpu/kernels/CpuCol2ImKernel.cpp
+++ b/src/cpu/kernels/CpuCol2ImKernel.cpp
@@ -29,8 +29,9 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -49,9 +50,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
 
     // Validate configured output
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, false));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+                                                           compute_col2im_shape(*src, convolved_dims, false));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
     }
@@ -106,13 +108,16 @@
     Iterator in(src, window);
     Iterator out(dst, window_out);
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const int hidx = id.y();
-        const int idx  = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y + (hidx % _convolved_dims.width) * output_stride_x;
-        std::memcpy(out.ptr() + idx, in.ptr(), el_size);
-    },
-    in, out);
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
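+            // Map each column back to its spatial position: hidx splits into row/column of the convolved output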
+            const int hidx = id.y();
+            const int idx  = id.x() * output_stride_z + (hidx / _convolved_dims.width) * output_stride_y +
+                            (hidx % _convolved_dims.width) * output_stride_x;
+            std::memcpy(out.ptr() + idx, in.ptr(), el_size);
+        },
+        in, out);
 }
 
 const char *CpuCol2ImKernel::name() const
@@ -121,4 +126,4 @@
 }
 } // namespace kernels
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuCol2ImKernel.h b/src/cpu/kernels/CpuCol2ImKernel.h
index deafcc1..3e394ac 100644
--- a/src/cpu/kernels/CpuCol2ImKernel.h
+++ b/src/cpu/kernels/CpuCol2ImKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_COL2IM_KERNEL_H
 
 #include "arm_compute/core/Size2D.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -75,7 +76,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp
index 29d40f0..8c29017 100644
--- a/src/cpu/kernels/CpuConcatenateBatchKernel.cpp
+++ b/src/cpu/kernels/CpuConcatenateBatchKernel.cpp
@@ -30,10 +30,11 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
@@ -50,13 +51,14 @@
     uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
 
     // Offset dst
-    uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + batch_offset * dst->info()->strides_in_bytes()[3];
+    uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+                       batch_offset * dst->info()->strides_in_bytes()[3];
 
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
     const int  window_step_x  = 16 / dst->info()->element_size();
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
     win.set(3, Window::Dimension(0, src->info()->tensor_shape()[3], 1));
 
@@ -66,66 +68,74 @@
     const DataType                dt        = src->info()->data_type();
     const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
     const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform();
-    if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+    if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto in_ptr  = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
-            const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
-
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                wrapper::vstore(out_ptr, vquantize(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo));
-            }
+                const auto in_ptr  = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
+                const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
-            }
-        },
-        src_it, dst_it);
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
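+                    // Requantize: undo the source scale/offset, then apply the destination's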
+                    wrapper::vstore(out_ptr + x,
+                                    vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
+                }
+            },
+            src_it, dst_it);
     }
-    else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+    else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto in_ptr  = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
-            const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
-            int        x       = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                wrapper::vstore(out_ptr, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr), src_qinfo), dst_qinfo));
-            }
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
-            }
-        },
-        src_it, dst_it);
+                const auto in_ptr  = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
+                const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
+                int        x       = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    wrapper::vstore(out_ptr + x,
+                                    vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
+                }
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    *(out_ptr + x) =
+                        quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
+                }
+            },
+            src_it, dst_it);
     }
     else
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto in_ptr  = reinterpret_cast<const T *>(src_ptr + src_it.offset());
-            const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
-
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
-            }
+                const auto in_ptr  = reinterpret_cast<const T *>(src_ptr + src_it.offset());
+                const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(out_ptr + x) = *(in_ptr + x);
-            }
-        },
-        src_it, dst_it);
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    *(out_ptr + x) = *(in_ptr + x);
+                }
+            },
+            src_it, dst_it);
     }
 }
 
@@ -154,7 +164,7 @@
     _func         = nullptr;
     _batch_offset = batch_offset;
 
-    switch(src->data_type())
+    switch (src->data_type())
     {
         case DataType::S8:
         case DataType::U8:
@@ -196,9 +206,7 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC),
-             tensors.get_tensor(TensorType::ACL_DST),
-             _batch_offset,
+    (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _batch_offset,
              window);
 }
 
diff --git a/src/cpu/kernels/CpuConcatenateBatchKernel.h b/src/cpu/kernels/CpuConcatenateBatchKernel.h
index 0de68a5..52ea553 100644
--- a/src/cpu/kernels/CpuConcatenateBatchKernel.h
+++ b/src/cpu/kernels/CpuConcatenateBatchKernel.h
@@ -57,15 +57,15 @@
     static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
     using BatchConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &);
 
 private:
-    BatchConcatFunction *_func{ nullptr };
-    unsigned int         _batch_offset{ 0 };
+    BatchConcatFunction *_func{nullptr};
+    unsigned int         _batch_offset{0};
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp
index ebc5322..c75e1e4 100644
--- a/src/cpu/kernels/CpuConcatenateDepthKernel.cpp
+++ b/src/cpu/kernels/CpuConcatenateDepthKernel.cpp
@@ -30,11 +30,12 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
 
 #include <cstdint>
 
@@ -53,13 +54,14 @@
     uint8_t *src_ptr = src->buffer() + src->info()->offset_first_element_in_bytes();
 
     // Offset destination
-    uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + depth_offset * dst->info()->strides_in_bytes()[2];
+    uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+                       depth_offset * dst->info()->strides_in_bytes()[2];
 
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
     const int  window_step_x  = 16 / dst->info()->element_size();
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
     win.set(Window::DimZ, Window::Dimension(0, src->info()->tensor_shape().z(), 1));
 
@@ -69,64 +71,73 @@
     const DataType                dt        = src->info()->data_type();
     const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
     const UniformQuantizationInfo dst_qinfo = dst->info()->quantization_info().uniform();
-    if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+    if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto in_ptr  = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
-            const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
-            int        x       = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                wrapper::vstore(out_ptr + x, vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
-            }
+                const auto in_ptr  = reinterpret_cast<const uint8_t *>(src_ptr + src_it.offset());
+                const auto out_ptr = reinterpret_cast<uint8_t *>(dst_ptr + dst_it.offset());
+                int        x       = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
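+                    // Requantize each lane with the destination's quantization before storing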
+                    wrapper::vstore(out_ptr + x,
+                                    vquantize(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
-            }
-        },
-        src_it, dst_it);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    *(out_ptr + x) = quantize_qasymm8(dequantize_qasymm8(*(in_ptr + x), src_qinfo), dst_qinfo);
+                }
+            },
+            src_it, dst_it);
     }
-    else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+    else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto in_ptr  = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
-            const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
-            int        x       = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                wrapper::vstore(out_ptr + x, vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
-            }
+                const auto in_ptr  = reinterpret_cast<const int8_t *>(src_ptr + src_it.offset());
+                const auto out_ptr = reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset());
+                int        x       = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    wrapper::vstore(out_ptr + x,
+                                    vquantize_signed(vdequantize(wrapper::vloadq(in_ptr + x), src_qinfo), dst_qinfo));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(out_ptr + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
-            }
-        },
-        src_it, dst_it);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    *(out_ptr + x) =
+                        quantize_qasymm8_signed(dequantize_qasymm8_signed(*(in_ptr + x), src_qinfo), dst_qinfo);
+                }
+            },
+            src_it, dst_it);
     }
     else
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto in_ptr  = reinterpret_cast<const T *>(src_ptr + src_it.offset());
-            const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
-            int        x       = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
-            }
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(out_ptr + x) = *(in_ptr + x);
-            }
-        },
-        src_it, dst_it);
+                const auto in_ptr  = reinterpret_cast<const T *>(src_ptr + src_it.offset());
+                const auto out_ptr = reinterpret_cast<T *>(dst_ptr + dst_it.offset());
+                int        x       = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+                }
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    *(out_ptr + x) = *(in_ptr + x);
+                }
+            },
+            src_it, dst_it);
     }
 }
 
@@ -134,7 +145,8 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(Window::DimX) != output->dimension(Window::DimX));
@@ -154,7 +166,7 @@
     _func         = nullptr;
     _depth_offset = depth_offset;
 
-    switch(src->data_type())
+    switch (src->data_type())
     {
         case DataType::QASYMM8:
             _func = &depth_concat<uint8_t>;
@@ -192,9 +204,7 @@
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);
 
-    (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC),
-             tensors.get_tensor(TensorType::ACL_DST),
-             _depth_offset,
+    (*_func)(tensors.get_const_tensor(TensorType::ACL_SRC), tensors.get_tensor(TensorType::ACL_DST), _depth_offset,
              window);
 }
 
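For readers skimming the reformatted hunks above: when source and destination quantization parameters differ, the concatenation kernels requantize each element on the fly by dequantizing with the source scale/offset and quantizing again with the destination's. A minimal scalar sketch of that round trip, using simplified stand-ins for the library's quantization helpers (QInfo here is hypothetical, not arm_compute's UniformQuantizationInfo):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Hypothetical uniform quantization parameters (stand-in for UniformQuantizationInfo).
struct QInfo
{
    float scale;
    int   offset;
};

float dequantize_qasymm8(uint8_t v, const QInfo &qi)
{
    return (static_cast<int>(v) - qi.offset) * qi.scale;
}

uint8_t quantize_qasymm8(float v, const QInfo &qi)
{
    const int q = static_cast<int>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::min(std::max(q, 0), 255)); // saturate to [0, 255]
}

int main()
{
    const QInfo src_qinfo{0.5f, 10};
    const QInfo dst_qinfo{0.25f, 0};
    uint8_t     in[5] = {10, 12, 14, 20, 30};
    uint8_t     out[5];
    for (int x = 0; x < 5; ++x)
    {
        // The same per-element round trip the kernel performs (vectorized in the real code).
        out[x] = quantize_qasymm8(dequantize_qasymm8(in[x], src_qinfo), dst_qinfo);
    }
    std::printf("%u %u\n", static_cast<unsigned>(out[0]), static_cast<unsigned>(out[4])); // 0 40
    return 0;
}
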
diff --git a/src/cpu/kernels/CpuConcatenateDepthKernel.h b/src/cpu/kernels/CpuConcatenateDepthKernel.h
index 5a0edb9..54de9af 100644
--- a/src/cpu/kernels/CpuConcatenateDepthKernel.h
+++ b/src/cpu/kernels/CpuConcatenateDepthKernel.h
@@ -65,15 +65,15 @@
     static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
     using DepthConcatFunction = void(const ITensor *, ITensor *, unsigned int, const Window &);
 
 private:
-    DepthConcatFunction *_func{ nullptr };
-    unsigned int         _depth_offset{ 0 };
+    DepthConcatFunction *_func{nullptr};
+    unsigned int         _depth_offset{0};
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp
index 47a2b44..b6c11d9 100644
--- a/src/cpu/kernels/CpuConcatenateHeightKernel.cpp
+++ b/src/cpu/kernels/CpuConcatenateHeightKernel.cpp
@@ -30,10 +30,11 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <cstdint>
 
@@ -53,7 +54,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
     ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY));
-    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+    for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
     }
@@ -91,13 +92,14 @@
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
 
     // Offset destination pointer to the correct position
-    uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _height_offset * dst->info()->strides_in_bytes()[Window::DimY];
+    uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+                       _height_offset * dst->info()->strides_in_bytes()[Window::DimY];
 
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
     const int  window_step_x  = 16;
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
     win.set(Window::DimY, Window::Dimension(0, src->info()->tensor_shape().y(), 1));
 
@@ -108,64 +110,74 @@
     const DataType                 dt        = src->info()->data_type();
     const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
     const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform();
-    if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+    if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
-            }
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    vst1q_u8(dst_ptr + dst_it.offset() + x,
+                             vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
-            }
-
-        },
-        src_it, dst_it);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    *(dst_ptr + dst_it.offset() + x) =
+                        quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+                }
+            },
+            src_it, dst_it);
     }
-    else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+    else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
-                         vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr()) + x), src_qinfo), dst_qinfo));
-            }
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    vst1q_s8(
+                        reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
+                        vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr()) + x), src_qinfo),
+                                         dst_qinfo));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
-            }
-        },
-        src_it, dst_it);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    *(dst_ptr + dst_it.offset() + x) =
+                        quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+                }
+            },
+            src_it, dst_it);
     }
     else
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto in_ptr  = src_it.ptr();
-            const auto out_ptr = dst_ptr + dst_it.offset();
-
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
-            }
+                const auto in_ptr  = src_it.ptr();
+                const auto out_ptr = dst_ptr + dst_it.offset();
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(out_ptr + x) = *(in_ptr + x);
-            }
-        },
-        src_it, dst_it);
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    *(out_ptr + x) = *(in_ptr + x);
+                }
+            },
+            src_it, dst_it);
     }
 }
 
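One detail worth calling out in the height kernel: the destination pointer is pre-offset once, outside the loop, to base + offset_first_element + height_offset * row_stride, so every source row lands _height_offset rows down in the destination. A toy stand-alone equivalent with plain arrays (sizes here are hypothetical):

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    constexpr int width = 4, src_rows = 2, dst_rows = 5, height_offset = 2;
    uint8_t src[src_rows * width] = {1, 2, 3, 4, 5, 6, 7, 8};
    uint8_t dst[dst_rows * width] = {};

    const size_t row_stride = width * sizeof(uint8_t);
    // Equivalent of: dst->buffer() + offset_first_element_in_bytes() + _height_offset * strides_in_bytes()[DimY]
    uint8_t *dst_ptr = dst + height_offset * row_stride;

    for (int y = 0; y < src_rows; ++y)
    {
        std::memcpy(dst_ptr + y * row_stride, src + y * row_stride, row_stride);
    }
    std::printf("%u\n", static_cast<unsigned>(dst[height_offset * width])); // 1: src row 0 landed on dst row 2
    return 0;
}
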
diff --git a/src/cpu/kernels/CpuConcatenateHeightKernel.h b/src/cpu/kernels/CpuConcatenateHeightKernel.h
index 74d5d0c..df880c4 100644
--- a/src/cpu/kernels/CpuConcatenateHeightKernel.h
+++ b/src/cpu/kernels/CpuConcatenateHeightKernel.h
@@ -58,11 +58,11 @@
     static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
-    unsigned int _height_offset{ 0 };
+    unsigned int _height_offset{0};
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp
index f00b37a..f6100cc 100644
--- a/src/cpu/kernels/CpuConcatenateWidthKernel.cpp
+++ b/src/cpu/kernels/CpuConcatenateWidthKernel.cpp
@@ -24,12 +24,12 @@
 #include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Steps.h"
 #include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Steps.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/helpers/WindowHelpers.h"
 
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
 
 namespace arm_compute
 {
@@ -47,7 +47,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0));
 
-    for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+    for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
     }
@@ -86,13 +86,14 @@
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
 
     // Offset output pointer to the correct position
-    uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() + _width_offset * dst->info()->strides_in_bytes()[0];
+    uint8_t *dst_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes() +
+                       _width_offset * dst->info()->strides_in_bytes()[0];
 
     const auto    window_start_x = static_cast<int>(window.x().start());
     const auto    window_end_x   = static_cast<int>(window.x().end()) * static_cast<int>(dst->info()->element_size());
     constexpr int window_step_x  = 16;
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     // Create iterators
@@ -101,62 +102,73 @@
     const DataType                 dt        = src->info()->data_type();
     const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
     const UniformQuantizationInfo &dst_qinfo = dst->info()->quantization_info().uniform();
-    if(dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
+    if (dt == DataType::QASYMM8 && src_qinfo != dst_qinfo)
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                vst1q_u8(dst_ptr + dst_it.offset() + x, vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
-            }
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    vst1q_u8(dst_ptr + dst_it.offset() + x,
+                             vquantize(vdequantize(vld1q_u8(src_it.ptr() + x), src_qinfo), dst_qinfo));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
-            }
-        },
-        src_it, dst_it);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    *(dst_ptr + dst_it.offset() + x) =
+                        quantize_qasymm8(dequantize_qasymm8(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+                }
+            },
+            src_it, dst_it);
     }
-    else if(dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
+    else if (dt == DataType::QASYMM8_SIGNED && src_qinfo != dst_qinfo)
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                vst1q_s8(reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
-                         vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr() + x)), src_qinfo), dst_qinfo));
-            }
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    vst1q_s8(
+                        reinterpret_cast<int8_t *>(dst_ptr + dst_it.offset() + x),
+                        vquantize_signed(vdequantize(vld1q_s8(reinterpret_cast<int8_t *>(src_it.ptr() + x)), src_qinfo),
+                                         dst_qinfo));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(dst_ptr + dst_it.offset() + x) = quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
-            }
-        },
-        src_it, dst_it);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    *(dst_ptr + dst_it.offset() + x) =
+                        quantize_qasymm8_signed(dequantize_qasymm8_signed(*(src_it.ptr() + x), src_qinfo), dst_qinfo);
+                }
+            },
+            src_it, dst_it);
     }
     else
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto in_ptr  = src_it.ptr();
-            const auto out_ptr = dst_ptr + dst_it.offset();
-            int        x       = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
-            }
+                const auto in_ptr  = src_it.ptr();
+                const auto out_ptr = dst_ptr + dst_it.offset();
+                int        x       = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    wrapper::vstore(out_ptr + x, wrapper::vloadq(in_ptr + x));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                *(out_ptr + x) = *(in_ptr + x);
-            }
-        },
-        src_it, dst_it);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    *(out_ptr + x) = *(in_ptr + x);
+                }
+            },
+            src_it, dst_it);
     }
 }
 
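All three concatenation kernels share the same iteration idiom: the window's X dimension is collapsed to a single step (win.set(Window::DimX, Window::Dimension(0, 1, 1))) and the lambda walks the row itself, 16 bytes per vector iteration plus a scalar tail for the leftovers. A condensed stand-alone version of that loop structure, with a plain byte copy standing in for the NEON load/store:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Copy one row of `len` bytes the way the kernels iterate: bulk steps of 16, then leftovers.
void copy_row(const uint8_t *in_ptr, uint8_t *out_ptr, int len)
{
    constexpr int window_step_x = 16; // one 128-bit NEON register worth of uint8_t
    int           x             = 0;
    for (; x <= len - window_step_x; x += window_step_x)
    {
        std::memcpy(out_ptr + x, in_ptr + x, window_step_x); // stands in for wrapper::vstore(wrapper::vloadq(...))
    }
    // Compute left-over elements
    for (; x < len; ++x)
    {
        out_ptr[x] = in_ptr[x];
    }
}

int main()
{
    uint8_t in[37], out[37] = {};
    for (int i = 0; i < 37; ++i)
    {
        in[i] = static_cast<uint8_t>(i);
    }
    copy_row(in, out, 37); // 2 vector steps cover 32 bytes, tail handles the last 5
    std::printf("%u %u\n", static_cast<unsigned>(out[16]), static_cast<unsigned>(out[36])); // 16 36
    return 0;
}
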
diff --git a/src/cpu/kernels/CpuConcatenateWidthKernel.h b/src/cpu/kernels/CpuConcatenateWidthKernel.h
index 418bc51..560e44e 100644
--- a/src/cpu/kernels/CpuConcatenateWidthKernel.h
+++ b/src/cpu/kernels/CpuConcatenateWidthKernel.h
@@ -58,11 +58,11 @@
     static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
-    unsigned int _width_offset{ 0 };
+    unsigned int _width_offset{0};
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
index 08b39de..87703ec 100644
--- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
+++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -34,8 +35,10 @@
 {
 namespace kernels
 {
-void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape,
-                                                      DataLayout data_layout)
+void CpuConvertFullyConnectedWeightsKernel::configure(const ITensorInfo *src,
+                                                      ITensorInfo       *dst,
+                                                      const TensorShape &original_input_shape,
+                                                      DataLayout         data_layout)
 
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -43,7 +46,8 @@
     // Output tensor auto initialisation if not yet initialized
     auto_init_if_empty(*dst, *src->clone());
 
-    ARM_COMPUTE_ERROR_THROW_ON(CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_input_shape, data_layout));
 
     const DataLayout input_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
 
@@ -62,8 +66,10 @@
     ICpuKernel::configure(win);
 }
 
-Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape,
-                                                       DataLayout data_layout)
+Status CpuConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src,
+                                                       const ITensorInfo *dst,
+                                                       const TensorShape &original_input_shape,
+                                                       DataLayout         data_layout)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
@@ -72,7 +78,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
 
     // Checks performed when dst is configured
-    if((dst != nullptr) && (dst->total_size() != 0))
+    if ((dst != nullptr) && (dst->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -97,11 +103,15 @@
     Iterator input(src, window);
     Iterator output(dst, window);
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        memcpy(output.ptr() + id.x() * dst_stride_x + (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y, input.ptr(), element_size);
-    },
-    input);
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
+            memcpy(output.ptr() + id.x() * dst_stride_x +
+                       (id.y() % _factor1 * _factor2 + id.y() / _factor1) * dst_stride_y,
+                   input.ptr(), element_size);
+        },
+        input);
 }
 
 const char *CpuConvertFullyConnectedWeightsKernel::name() const
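The memcpy in run_op above is the whole NCHW<->NHWC weight conversion: source row y is written to destination row (y % _factor1) * _factor2 + y / _factor1 (note % and * have equal precedence and bind left to right, so the kernel needs no parentheses). A self-contained illustration on a tiny matrix, with hypothetical factor values:

#include <cstdio>

int main()
{
    // Suppose the original input plane had factor1 * factor2 = 6 elements, e.g. 2 x 3.
    constexpr unsigned factor1 = 2, factor2 = 3;
    int src[factor1 * factor2] = {0, 1, 2, 3, 4, 5};
    int dst[factor1 * factor2] = {};

    for (unsigned y = 0; y < factor1 * factor2; ++y)
    {
        // Same index expression as the kernel: (y % factor1) * factor2 + y / factor1
        dst[y % factor1 * factor2 + y / factor1] = src[y];
    }
    for (int v : dst)
    {
        std::printf("%d ", v); // 0 2 4 1 3 5 -- the 2x3 plane re-read as 3x2
    }
    std::printf("\n");
    return 0;
}
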
diff --git a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
index 9a13933..2253889 100644
--- a/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
+++ b/src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h
@@ -53,24 +53,32 @@
      * @param[in] original_input_shape Shape of the original src tensor (the one entering fully connected layer).
      * @param[in] data_layout          The data layout the weights have been trained in.
      */
-    void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout);
+    void configure(const ITensorInfo *src,
+                   ITensorInfo       *dst,
+                   const TensorShape &original_input_shape,
+                   DataLayout         data_layout);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref CpuConvertFullyConnectedWeightsKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_input_shape, DataLayout data_layout);
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *dst,
+                           const TensorShape &original_input_shape,
+                           DataLayout         data_layout);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
-    unsigned int _factor1{ 0 }; /* equals the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */
-    unsigned int _factor2{ 0 }; /* equals the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */
+    unsigned int _factor1{
+        0}; /* equals the number of elements per original src plane if @p data_layout == NCHW; its number of channels otherwise */
+    unsigned int _factor2{
+        0}; /* equals the number of elements per original src plane if @p data_layout == NHWC; its number of channels otherwise */
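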
 };
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */
diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp
index 1005d00..745b156 100644
--- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp
+++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.cpp
@@ -29,9 +29,10 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
@@ -47,7 +48,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
 
     // Validate output if initialized
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape());
@@ -60,11 +61,11 @@
 {
     // Output auto initialisation if not yet initialized
     {
-        const bool                    is_input_signed   = src->data_type() == DataType::QASYMM8_SIGNED;
-        const DataType                dt                = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
-        const UniformQuantizationInfo qinfo             = src->quantization_info().uniform();
+        const bool                    is_input_signed = src->data_type() == DataType::QASYMM8_SIGNED;
+        const DataType                dt              = is_input_signed ? DataType::QASYMM8 : DataType::QASYMM8_SIGNED;
+        const UniformQuantizationInfo qinfo           = src->quantization_info().uniform();
         const int                     offset_correction = is_input_signed ? -128 : 128;
-        const QuantizationInfo        corrected_qinfo   = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
+        const QuantizationInfo        corrected_qinfo = QuantizationInfo(qinfo.scale, qinfo.offset + offset_correction);
 
         auto_init_if_empty(*dst, src->clone()->set_data_type(dt).set_quantization_info(corrected_qinfo));
     }
@@ -110,27 +111,29 @@
     const uint8_t mask  = 128;
     const auto    vmask = wrapper::vdup_n(mask, wrapper::traits::vector_128_tag{});
 
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
-        // Compute S elements per iteration
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            const auto vin = wrapper::vloadq(input_ptr + x);
-            wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask));
-        }
+            const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            const uint8_t in  = *(reinterpret_cast<const uint8_t *>(input_ptr + x));
-            *(output_ptr + x) = in ^ mask;
-        }
-    },
-    input, output);
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto vin = wrapper::vloadq(input_ptr + x);
+                wrapper::vstore(output_ptr + x, wrapper::veor(vin, vmask));
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                const uint8_t in  = *(reinterpret_cast<const uint8_t *>(input_ptr + x));
+                *(output_ptr + x) = in ^ mask;
+            }
+        },
+        input, output);
 }
 
 const char *CpuConvertQuantizedSignednessKernel::name() const
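The signedness kernel's veor with a vector of 128 works because XOR-ing the most significant bit is exactly a shift by 128: on two's-complement hardware, (uint8_t)u ^ 0x80 reinterpreted as int8_t equals u - 128 for every u in [0, 255]. That is why configure() only has to move the quantization offset by 128 (the offset_correction above) to keep the represented real values unchanged. A quick exhaustive check:

#include <cstdint>
#include <cstdio>

int main()
{
    // Assumes two's-complement int8_t (guaranteed on all targets this library supports).
    for (int u = 0; u < 256; ++u)
    {
        const int8_t s = static_cast<int8_t>(static_cast<uint8_t>(u) ^ 0x80); // the kernel's veor with vdup_n(128)
        if (s != u - 128)
        {
            std::printf("mismatch at %d\n", u);
            return 1;
        }
    }
    std::printf("u ^ 0x80 == u - 128 for all 256 values\n");
    return 0;
}
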
diff --git a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
index b5eaf65..e94d3d5 100644
--- a/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
+++ b/src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h
@@ -54,7 +54,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 };
 } // namespace kernels
diff --git a/src/cpu/kernels/CpuCopyKernel.cpp b/src/cpu/kernels/CpuCopyKernel.cpp
index 3f0f3fe..1b693d7 100644
--- a/src/cpu/kernels/CpuCopyKernel.cpp
+++ b/src/cpu/kernels/CpuCopyKernel.cpp
@@ -27,9 +27,10 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -48,9 +49,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON(padding.size() > 4);
 
     // Validate destination if initialized
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            misc::shape_calculator::compute_padded_shape(src->tensor_shape(), padding), dst->tensor_shape());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
     }
 
@@ -64,7 +66,8 @@
     return std::make_pair(Status{}, calculate_max_window(*dst));
 }
 
-std::pair<Status, Window> validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding)
+std::pair<Status, Window>
+validate_and_configure_window_with_padding(const ITensorInfo *src, ITensorInfo *dst, const PaddingList &padding)
 {
     const TensorShape src_shape    = src->tensor_shape();
     const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(src_shape, padding);
@@ -84,7 +87,7 @@
     _padding = padding;
 
     std::pair<Status, Window> win_config;
-    if(padding.empty())
+    if (padding.empty())
     {
         win_config = validate_and_configure_window(src, dst);
     }
@@ -97,17 +100,20 @@
     ICpuKernel::configure(win_config.second);
 }
 
-Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, const PaddingList &padding)
+Status CpuCopyKernel::validate(const arm_compute::ITensorInfo *src,
+                               const arm_compute::ITensorInfo *dst,
+                               const PaddingList              &padding)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, padding));
 
-    if(padding.empty())
+    if (padding.empty())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_and_configure_window_with_padding(src->clone().get(), dst->clone().get(), padding).first);
     }
 
     return Status{};
@@ -122,38 +128,41 @@
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
 
-    if(_padding.empty())
+    if (_padding.empty())
     {
-        Window dst_window{ window };
-        dst_window.set(Window::DimX, Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0)));
+        Window dst_window{window};
+        dst_window.set(Window::DimX,
+                       Window::Dimension(dst_window.x().start(), dst_window.x().end(), src->info()->dimension(0)));
         Window out_slice = dst_window.first_slice_window_1D();
         do
         {
             Iterator src_it(src, out_slice);
             Iterator dst_it(dst, out_slice);
 
-            execute_window_loop(out_slice, [&](const Coordinates &)
-            {
-                memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size());
-            },
-            src_it, dst_it);
-        }
-        while(dst_window.slide_window_slice_1D(out_slice));
+            execute_window_loop(
+                out_slice,
+                [&](const Coordinates &)
+                { memcpy(dst_it.ptr(), src_it.ptr(), dst->info()->dimension(0) * dst->info()->element_size()); },
+                src_it, dst_it);
+        } while (dst_window.slide_window_slice_1D(out_slice));
     }
     else
     {
-        Window src_window{ window };
-        src_window.set(Window::DimX, Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0)));
+        Window src_window{window};
+        src_window.set(Window::DimX,
+                       Window::Dimension(0, window.x().end() - _padding[0].first, src->info()->dimension(0)));
 
         Iterator     src_it(src, src_window);
         Iterator     dst_it(dst, window);
         const size_t row_size_in_bytes = src->info()->dimension(0) * src->info()->element_size();
-        execute_window_loop(window, [&](const Coordinates &)
-        {
-            auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size();
-            std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes);
-        },
-        src_it, dst_it);
+        execute_window_loop(
+            window,
+            [&](const Coordinates &)
+            {
+                auto dst_ptr = dst_it.ptr() + _padding[0].first * dst->info()->element_size();
+                std::memcpy(dst_ptr, src_it.ptr(), row_size_in_bytes);
+            },
+            src_it, dst_it);
     }
 }
 
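In the padded branch of CpuCopyKernel::run_op, each iteration boils down to one memcpy of a source row into the destination row, shifted right by the left padding; the zero-initialized destination supplies the pad values. A minimal equivalent with hypothetical sizes:

#include <cstdint>
#include <cstdio>
#include <cstring>

int main()
{
    constexpr int src_w = 3, rows = 2, pad_left = 2, pad_right = 1;
    constexpr int dst_w = pad_left + src_w + pad_right;
    uint8_t       src[rows * src_w] = {1, 2, 3, 4, 5, 6};
    uint8_t       dst[rows * dst_w] = {}; // zero-filled, so the padding columns stay 0

    const size_t row_size_in_bytes = src_w * sizeof(uint8_t);
    for (int y = 0; y < rows; ++y)
    {
        // Mirrors: dst_ptr = dst_it.ptr() + _padding[0].first * element_size
        std::memcpy(dst + y * dst_w + pad_left, src + y * src_w, row_size_in_bytes);
    }
    std::printf("%u %u\n", static_cast<unsigned>(dst[0]), static_cast<unsigned>(dst[pad_left])); // 0 1
    return 0;
}
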
diff --git a/src/cpu/kernels/CpuCopyKernel.h b/src/cpu/kernels/CpuCopyKernel.h
index c9ef8eb..a05053f 100644
--- a/src/cpu/kernels/CpuCopyKernel.h
+++ b/src/cpu/kernels/CpuCopyKernel.h
@@ -55,7 +55,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PaddingList &padding = PaddingList());
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
index d6c56d2..82e3a5c 100644
--- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
+++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.cpp
@@ -26,11 +26,12 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/traits.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/traits.h"
 #include "src/cpu/kernels/depthwiseconv2d/list.h"
 
 namespace arm_compute
@@ -41,72 +42,53 @@
 {
 namespace
 {
-static const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> available_kernels =
-{
-    {
-        "neon_qu8_deptwiseconv2dnative",
-        [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
-        {
-            return (data.weights_dt == DataType::QASYMM8);
-        },
-        REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative)
-    },
-    {
-        "neon_qs8_deptwiseconv2dnative",
-        [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
-        {
-            return (data.weights_dt == DataType::QASYMM8_SIGNED);
-        },
-        REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative)
-    },
-    {
-        "neon_fp16_deptwiseconv2dnative",
-        [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
-        {
-            return (data.weights_dt == DataType::F16 && data.isa.fp16);
-        },
-        REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative)
-    },
-    {
-        "neon_fp32_deptwiseconv2dnative",
-        [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
-        {
-            return (data.weights_dt == DataType::F32);
-        },
-        REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative)
-    },
-    {
-        "neon_qp8_qu8_deptwiseconv2dnative",
-        [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
-        {
-            return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8);
-        },
-        REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative)
-    },
-    {
-        "neon_qp8_qs8_deptwiseconv2dnative",
-        [](const DepthwiseConv2dNativeDataTypeISASelectorData & data)
-        {
-            return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8);
-        },
-        REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative)
-    },
+static const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> available_kernels = {
+    {"neon_qu8_deptwiseconv2dnative",
+     [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::QASYMM8); },
+     REGISTER_QASYMM8_NEON(neon_qu8_deptwiseconv2dnative)},
+    {"neon_qs8_deptwiseconv2dnative",
+     [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+     { return (data.weights_dt == DataType::QASYMM8_SIGNED); },
+     REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_deptwiseconv2dnative)},
+    {"neon_fp16_deptwiseconv2dnative",
+     [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+     { return (data.weights_dt == DataType::F16 && data.isa.fp16); },
+     REGISTER_FP16_NEON(neon_fp16_deptwiseconv2dnative)},
+    {"neon_fp32_deptwiseconv2dnative",
+     [](const DepthwiseConv2dNativeDataTypeISASelectorData &data) { return (data.weights_dt == DataType::F32); },
+     REGISTER_FP32_NEON(neon_fp32_deptwiseconv2dnative)},
+    {"neon_qp8_qu8_deptwiseconv2dnative",
+     [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+     { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt == DataType::QASYMM8); },
+     REGISTER_QASYMM8_NEON(neon_qp8_qu8_deptwiseconv2dnative)},
+    {"neon_qp8_qs8_deptwiseconv2dnative",
+     [](const DepthwiseConv2dNativeDataTypeISASelectorData &data)
+     { return (data.weights_dt == DataType::QSYMM8_PER_CHANNEL && data.source_dt != DataType::QASYMM8); },
+     REGISTER_QASYMM8_SIGNED_NEON(neon_qp8_qs8_deptwiseconv2dnative)},
 };
 
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status validate_arguments(const ITensorInfo     *src,
+                          const ITensorInfo     *weights,
+                          const ITensorInfo     *biases,
+                          const ITensorInfo     *dst,
+                          const ConvolutionInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
     ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(info.depth_multiplier == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) >
+                                src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) >
+                                src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
     ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(0) * info.depth_multiplier) != weights->dimension(0));
     ARM_COMPUTE_RETURN_ERROR_ON((info.dilation.x() < 1) || (info.dilation.y() < 1));
-    ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) || (info.pad_stride_info.stride().second < 1));
+    ARM_COMPUTE_RETURN_ERROR_ON((info.pad_stride_info.stride().first < 1) ||
+                                (info.pad_stride_info.stride().second < 1));
 
-    if(is_data_type_quantized_per_channel(weights->data_type()))
+    if (is_data_type_quantized_per_channel(weights->data_type()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
         ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
@@ -116,12 +98,12 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
     }
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(0));
 
-        if(is_data_type_quantized_asymmetric(src->data_type()))
+        if (is_data_type_quantized_asymmetric(src->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
@@ -131,9 +113,10 @@
         }
     }
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+        const TensorShape output_shape =
+            misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
     }
@@ -142,7 +125,11 @@
 }
 } // namespace
 
-void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+void CpuDepthwiseConv2dNativeKernel::configure(const ITensorInfo     *src,
+                                               const ITensorInfo     *weights,
+                                               const ITensorInfo     *biases,
+                                               ITensorInfo           *dst,
+                                               const ConvolutionInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, (biases != nullptr) ? biases : nullptr, dst, info));
@@ -151,18 +138,26 @@
     _conv_info  = info;
 
     const auto uk = CpuDepthwiseConv2dNativeKernel::get_implementation(
-                        DepthwiseConv2dNativeDataTypeISASelectorData{ weights->data_type(), src->data_type(), CPUInfo::get().get_isa() });
+        DepthwiseConv2dNativeDataTypeISASelectorData{weights->data_type(), src->data_type(), CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
     _func = uk->ukernel;
 
     const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
-    auto_init_if_empty(*dst, src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(dst->quantization_info()));
+    auto_init_if_empty(*dst, src->clone()
+                                 ->set_is_resizable(true)
+                                 .reset_padding()
+                                 .set_tensor_shape(output_shape)
+                                 .set_quantization_info(dst->quantization_info()));
 
     Window win = calculate_max_window(*dst, Steps());
     ICpuKernel::configure(win);
 }
 
-Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dNativeKernel::validate(const ITensorInfo     *src,
+                                                const ITensorInfo     *weights,
+                                                const ITensorInfo     *biases,
+                                                const ITensorInfo     *dst,
+                                                const ConvolutionInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, info));
     return Status{};
@@ -187,7 +182,8 @@
     return "CpuDepthwiseConv2dNativeKernel";
 }
 
-const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> &CpuDepthwiseConv2dNativeKernel::get_available_kernels()
+const std::vector<CpuDepthwiseConv2dNativeKernel::DepthwiseConv2dNativeKernel> &
+CpuDepthwiseConv2dNativeKernel::get_available_kernels()
 {
     return available_kernels;
 }
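The available_kernels table reflowed above is the library's usual dispatch pattern: each entry pairs a name, a selector predicate over the requested data types and ISA features, and a micro-kernel pointer, and get_implementation() returns the first entry whose predicate matches. A reduced stand-alone sketch of the mechanism (the selector fields here are hypothetical, not the real DepthwiseConv2dNativeDataTypeISASelectorData):

#include <cstdio>
#include <vector>

// Hypothetical selector data and data types, for illustration only.
enum class DataType { QASYMM8, F16, F32 };
struct SelectorData
{
    DataType weights_dt;
    bool     fp16_isa;
};

struct Kernel
{
    const char *name;
    bool (*is_selected)(const SelectorData &);
    void (*ukernel)();
};

void qu8_impl()  { std::printf("running qu8 kernel\n"); }
void fp16_impl() { std::printf("running fp16 kernel\n"); }
void fp32_impl() { std::printf("running fp32 kernel\n"); }

static const std::vector<Kernel> available_kernels = {
    {"qu8", [](const SelectorData &d) { return d.weights_dt == DataType::QASYMM8; }, qu8_impl},
    {"fp16", [](const SelectorData &d) { return d.weights_dt == DataType::F16 && d.fp16_isa; }, fp16_impl},
    {"fp32", [](const SelectorData &d) { return d.weights_dt == DataType::F32; }, fp32_impl},
};

const Kernel *get_implementation(const SelectorData &data)
{
    for (const auto &k : available_kernels)
    {
        if (k.is_selected(data))
        {
            return &k; // first match wins
        }
    }
    return nullptr;
}

int main()
{
    const Kernel *uk = get_implementation({DataType::F16, /* fp16_isa = */ true});
    if (uk != nullptr)
    {
        uk->ukernel(); // prints: running fp16 kernel
    }
    return 0;
}
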
diff --git a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
index 9fabd0b..7e78f52 100644
--- a/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
+++ b/src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/utils/misc/Traits.h"
 #include "arm_compute/function_info/ConvolutionInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 #include "support/AclRequires.h"
@@ -44,8 +45,9 @@
 class CpuDepthwiseConv2dNativeKernel : public ICpuKernel<CpuDepthwiseConv2dNativeKernel>
 {
 private:
-    using DepthwiseConv2dNativeKernelPtr =
-        std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &, bool, const ConvolutionInfo &)>::type;
+    using DepthwiseConv2dNativeKernelPtr = std::add_pointer<void(
+        const ITensor *, const ITensor *, const ITensor *, ITensor *, const Window &, bool, const ConvolutionInfo &)>::
+        type;
 
 public:
     CpuDepthwiseConv2dNativeKernel() = default;
@@ -64,17 +66,25 @@
      * @param[in]  info    Depthwise convolution meta-data.
      *
      */
-    void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+    void configure(const ITensorInfo     *src,
+                   const ITensorInfo     *weights,
+                   const ITensorInfo     *biases,
+                   ITensorInfo           *dst,
+                   const ConvolutionInfo &info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuDepthwiseConv2dNativeKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+    static Status validate(const ITensorInfo     *src,
+                           const ITensorInfo     *weights,
+                           const ITensorInfo     *biases,
+                           const ITensorInfo     *dst,
+                           const ConvolutionInfo &info);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
     struct DepthwiseConv2dNativeKernel
     {
@@ -89,9 +99,9 @@
      *
      * @param[in] window Region on which to execute the kernel.
      */
-    DepthwiseConv2dNativeKernelPtr _func{ nullptr };
+    DepthwiseConv2dNativeKernelPtr _func{nullptr};
     ConvolutionInfo                _conv_info{};
-    bool                           _has_biases{ false };
+    bool                           _has_biases{false};
 };
 } // namespace kernels
 } // namespace cpu
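The reflowed DepthwiseConv2dNativeKernelPtr alias is std::add_pointer applied to a function type, i.e. a pointer-to-function typedef spelled without the (*) syntax. The same pattern in miniature:

#include <cstdio>
#include <type_traits>

// Same pattern as DepthwiseConv2dNativeKernelPtr: a pointer-to-function alias.
using BinOpPtr = std::add_pointer<int(int, int)>::type; // equivalent to: int (*)(int, int)

int add(int a, int b)
{
    return a + b;
}

int main()
{
    BinOpPtr f = add;
    std::printf("%d\n", f(2, 3)); // 5
    return 0;
}
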
diff --git a/src/cpu/kernels/CpuDequantizeKernel.cpp b/src/cpu/kernels/CpuDequantizeKernel.cpp
index a2d24f9..d17128b 100644
--- a/src/cpu/kernels/CpuDequantizeKernel.cpp
+++ b/src/cpu/kernels/CpuDequantizeKernel.cpp
@@ -28,12 +28,13 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NESymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
@@ -48,9 +49,11 @@
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8,
+                                                         DataType::QSYMM16);
 
-    if(dst->tensor_shape().total_size() > 0)
+    if (dst->tensor_shape().total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(dst);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
@@ -124,28 +127,30 @@
     Iterator in(input, win_collapsed);
     Iterator out(output, win_collapsed);
 
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const TIn *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            const auto vin  = wrapper::vloadq(in_ptr + x);
-            const auto vdeq = vdequantize(vin, scale, offset);
+            const auto in_ptr  = reinterpret_cast<const TIn *>(in.ptr());
+            const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
 
-            store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq);
-        }
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto vin  = wrapper::vloadq(in_ptr + x);
+                const auto vdeq = vdequantize(vin, scale, offset);
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            auto val       = *(in_ptr + x);
-            *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo));
-        }
-    },
-    in, out);
+                store_result(reinterpret_cast<TOut *>(out_ptr + x), vdeq);
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                auto val       = *(in_ptr + x);
+                *(out_ptr + x) = static_cast<TOut>(Qasymm8QuantizationHelper<TIn>::dequantize(val, qinfo));
+            }
+        },
+        in, out);
 }
 
 template <typename T>
@@ -165,28 +170,30 @@
     Iterator in(input, win);
     Iterator out(output, win);
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        const auto in_ptr  = reinterpret_cast<const int8_t *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            const auto vin  = wrapper::vloadq(in_ptr + x);
-            const auto vdeq = vdequantize(vin, scale[id.z()]);
+            const auto in_ptr  = reinterpret_cast<const int8_t *>(in.ptr());
+            const auto out_ptr = reinterpret_cast<T *>(out.ptr());
 
-            store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
-        }
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto vin  = wrapper::vloadq(in_ptr + x);
+                const auto vdeq = vdequantize(vin, scale[id.z()]);
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            int8_t val     = *(in_ptr + x);
-            *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()]));
-        }
-    },
-    in, out);
+                store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                int8_t val     = *(in_ptr + x);
+                *(out_ptr + x) = static_cast<T>(dequantize(val, scale[id.z()]));
+            }
+        },
+        in, out);
 }
 
 template <typename T>
@@ -206,37 +213,34 @@
     Iterator in(input, win);
     Iterator out(output, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const int8_t *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            const float32x4x4_t vscale =
+            const auto in_ptr  = reinterpret_cast<const int8_t *>(in.ptr());
+            const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                {
-                    scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3],
-                    scale[x + 4], scale[x + 5], scale[x + 6], scale[x + 7],
-                    scale[x + 8], scale[x + 9], scale[x + 10], scale[x + 11],
-                    scale[x + 12], scale[x + 13], scale[x + 14], scale[x + 15]
-                }
-            };
-            const auto vin  = wrapper::vloadq(in_ptr + x);
-            const auto vdeq = vdequantize(vin, vscale);
+                const float32x4x4_t vscale = {{scale[x + 0], scale[x + 1], scale[x + 2], scale[x + 3], scale[x + 4],
+                                               scale[x + 5], scale[x + 6], scale[x + 7], scale[x + 8], scale[x + 9],
+                                               scale[x + 10], scale[x + 11], scale[x + 12], scale[x + 13],
+                                               scale[x + 14], scale[x + 15]}};
+                const auto          vin    = wrapper::vloadq(in_ptr + x);
+                const auto          vdeq   = vdequantize(vin, vscale);
 
-            store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
-        }
+                store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+            }
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            int8_t val     = *(in_ptr + x);
-            *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x]));
-        }
-    },
-    in, out);
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                int8_t val     = *(in_ptr + x);
+                *(out_ptr + x) = static_cast<T>(dequantize(val, scale[x]));
+            }
+        },
+        in, out);
 }
 
 template <typename T>
@@ -257,28 +261,30 @@
     Iterator in(input, win_collapsed);
     Iterator out(output, win_collapsed);
 
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const int8_t *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            const auto vin  = wrapper::vloadq(in_ptr + x);
-            const auto vdeq = vdequantize(vin, scale);
+            const auto in_ptr  = reinterpret_cast<const int8_t *>(in.ptr());
+            const auto out_ptr = reinterpret_cast<T *>(out.ptr());
 
-            store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
-        }
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto vin  = wrapper::vloadq(in_ptr + x);
+                const auto vdeq = vdequantize(vin, scale);
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            int8_t val     = *(in_ptr + x);
-            *(out_ptr + x) = static_cast<T>(dequantize(val, scale));
-        }
-    },
-    in, out);
+                store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                int8_t val     = *(in_ptr + x);
+                *(out_ptr + x) = static_cast<T>(dequantize(val, scale));
+            }
+        },
+        in, out);
 }
 
 template <typename T>
@@ -299,34 +305,36 @@
     Iterator in(input, win_collapsed);
     Iterator out(output, win_collapsed);
 
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const int16_t *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            const auto vin  = wrapper::vloadq(in_ptr + x);
-            const auto vdeq = vdequantize_int16(vin, scale);
+            const auto in_ptr  = reinterpret_cast<const int16_t *>(in.ptr());
+            const auto out_ptr = reinterpret_cast<T *>(out.ptr());
 
-            store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
-        }
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto vin  = wrapper::vloadq(in_ptr + x);
+                const auto vdeq = vdequantize_int16(vin, scale);
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            int16_t val    = *(in_ptr + x);
-            *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale));
-        }
-    },
-    in, out);
+                store_result<T>(reinterpret_cast<T *>(out_ptr + x), vdeq);
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                int16_t val    = *(in_ptr + x);
+                *(out_ptr + x) = static_cast<T>(dequantize_qsymm16(val, scale));
+            }
+        },
+        in, out);
 }
 
 template <typename T>
 void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window)
 {
-    switch(input->info()->data_type())
+    switch (input->info()->data_type())
     {
         case DataType::QASYMM8:
             run_dequantization_qasymm8<T, uint8_t>(input, output, window);
@@ -335,7 +343,9 @@
             run_dequantization_qasymm8<T, int8_t>(input, output, window);
             break;
         case DataType::QSYMM8_PER_CHANNEL:
-            input->info()->data_layout() == DataLayout::NHWC ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window) : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window);
+            input->info()->data_layout() == DataLayout::NHWC
+                ? run_dequantization_qsymm8_per_channel_nhwc<T>(input, output, window)
+                : run_dequantization_qsymm8_per_channel_nchw<T>(input, output, window);
             break;
         case DataType::QSYMM8:
             run_dequantization_qsymm8<T>(input, output, window);
@@ -377,7 +387,7 @@
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
 
-    switch(dst->info()->data_type())
+    switch (dst->info()->data_type())
     {
         case DataType::F32:
             run_dequantization_core<float>(src, dst, window);
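
All of the dequantization kernels in this file share one traversal scheme: a vectorized main loop that advances by `window_step_x` elements per iteration, followed by a scalar loop for the left-over elements. A minimal, library-independent sketch of that scheme (plain C++; `dequantize_one` is a hypothetical stand-in for the NEON `vdequantize`/`dequantize` pair, not a library function):

    #include <cstdint>

    // Hypothetical scalar stand-in for the NEON vdequantize()/dequantize() pair.
    static float dequantize_one(int8_t v, float scale)
    {
        return static_cast<float>(v) * scale;
    }

    // Vectorized main loop plus scalar tail, mirroring the execute_window_loop
    // bodies above; `step` plays the role of window_step_x.
    void dequantize_row(const int8_t *in, float *out, int n, float scale)
    {
        constexpr int step = 16; // one 128-bit load of int8_t elements
        int           x    = 0;
        for (; x <= n - step; x += step)
        {
            for (int i = 0; i < step; ++i) // stands in for vloadq + vdequantize + store_result
            {
                out[x + i] = dequantize_one(in[x + i], scale);
            }
        }
        // Compute left-over elements
        for (; x < n; ++x)
        {
            out[x] = dequantize_one(in[x], scale);
        }
    }
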
diff --git a/src/cpu/kernels/CpuDequantizeKernel.h b/src/cpu/kernels/CpuDequantizeKernel.h
index cfa991d..6ed5858 100644
--- a/src/cpu/kernels/CpuDequantizeKernel.h
+++ b/src/cpu/kernels/CpuDequantizeKernel.h
@@ -54,7 +54,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 };
 } // namespace kernels
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.cpp b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
index a4cddde..4cb0fb1 100644
--- a/src/cpu/kernels/CpuDirectConv2dKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.cpp
@@ -22,13 +22,14 @@
  * SOFTWARE.
  */
 #include "src/cpu/kernels/CpuDirectConv2dKernel.h"
-#include "src/cpu/kernels/directconv2d/list.h"
 
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/directconv2d/list.h"
 
 using namespace arm_compute::detail;
 
@@ -38,26 +39,25 @@
 {
 namespace kernels
 {
-static const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> available_kernels =
-{
-    {
-        "neon_fp32_nhwc_directconv2d",
-        [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; },
-        REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d)
-    },
-    {
-        "neon_fp32_nchw_directconv2d",
-        [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; },
-        REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d)
-    },
-    {
-        "neon_fp16_nchw_directconv2d",
-        [](const DataTypeDataLayoutISASelectorData & data) { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d)
-    },
+static const std::vector<CpuDirectConv2dKernel::DirectConv2dKernel> available_kernels = {
+    {"neon_fp32_nhwc_directconv2d",
+     [](const DataTypeDataLayoutISASelectorData &data)
+     { return data.dt == DataType::F32 && data.dl == DataLayout::NHWC; },
+     REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nhwc_directconv2d)},
+    {"neon_fp32_nchw_directconv2d",
+     [](const DataTypeDataLayoutISASelectorData &data)
+     { return data.dt == DataType::F32 && data.dl == DataLayout::NCHW; },
+     REGISTER_FP32_NEON(arm_compute::cpu::kernels::neon_fp32_nchw_directconv2d)},
+    {"neon_fp16_nchw_directconv2d",
+     [](const DataTypeDataLayoutISASelectorData &data)
+     { return data.dt == DataType::F16 && data.dl == DataLayout::NCHW && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::kernels::neon_fp16_nchw_directconv2d)},
 };
 
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+Status validate_arguments(const ITensorInfo   *src,
+                          const ITensorInfo   *weights,
+                          const ITensorInfo   *dst,
+                          const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
@@ -76,7 +76,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32);
     ARM_COMPUTE_UNUSED(width_idx);
     // Checks performed when output is configured
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
 
@@ -100,11 +100,15 @@
     // Configure window without any padding
     win = calculate_max_window(*dst, Steps());
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 
-void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
+void CpuDirectConv2dKernel::configure(ITensorInfo         *src,
+                                      ITensorInfo         *weights,
+                                      ITensorInfo         *dst,
+                                      const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
 
@@ -129,12 +133,13 @@
     ICpuKernel::configure(win_config.second);
 }
 
-Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+Status CpuDirectConv2dKernel::validate(const ITensorInfo   *src,
+                                       const ITensorInfo   *weights,
+                                       const ITensorInfo   *dst,
+                                       const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(),
-                                                              dst->clone().get())
-                                .first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
 
     return Status{};
 }
@@ -149,7 +154,8 @@
     auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
     auto dst     = tensors.get_tensor(TensorType::ACL_DST);
 
-    const auto *uk = CpuDirectConv2dKernel::get_implementation(DataTypeDataLayoutISASelectorData{ src->info()->data_type(), _data_layout, CPUInfo::get().get_isa() });
+    const auto *uk = CpuDirectConv2dKernel::get_implementation(
+        DataTypeDataLayoutISASelectorData{src->info()->data_type(), _data_layout, CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr);
 
     uk->ukernel(window, src, weights, dst, _conv_info);
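
`available_kernels` together with `get_implementation` form a first-match dispatch table: each entry pairs a human-readable name and a selector predicate with a micro-kernel pointer, and the first entry whose predicate accepts the runtime selector data wins. A reduced sketch of the idea (all types and names here are illustrative, not the library's):

    #include <functional>
    #include <string>
    #include <vector>

    // Illustrative selector data; the library's DataTypeDataLayoutISASelectorData
    // carries data type, data layout and ISA flags instead.
    struct SelectorData
    {
        bool is_f32;
        bool is_nhwc;
    };

    struct MicroKernel
    {
        std::string                               name;
        std::function<bool(const SelectorData &)> is_selected;
        void (*ukernel)();
    };

    // First entry whose predicate matches wins, so more specific kernels come first.
    const MicroKernel *get_implementation(const std::vector<MicroKernel> &kernels, const SelectorData &data)
    {
        for (const auto &k : kernels)
        {
            if (k.is_selected(data))
            {
                return &k;
            }
        }
        return nullptr;
    }

    void neon_fp32_nhwc_impl() {} // placeholder micro-kernel body

    int main()
    {
        const std::vector<MicroKernel> kernels = {
            {"neon_fp32_nhwc", [](const SelectorData &d) { return d.is_f32 && d.is_nhwc; }, neon_fp32_nhwc_impl}};
        const MicroKernel *uk = get_implementation(kernels, SelectorData{true, true});
        if (uk != nullptr)
        {
            uk->ukernel(); // dispatches to neon_fp32_nhwc_impl
        }
    }
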
diff --git a/src/cpu/kernels/CpuDirectConv2dKernel.h b/src/cpu/kernels/CpuDirectConv2dKernel.h
index b9265dc..ad4caea 100644
--- a/src/cpu/kernels/CpuDirectConv2dKernel.h
+++ b/src/cpu/kernels/CpuDirectConv2dKernel.h
@@ -37,7 +37,8 @@
 class CpuDirectConv2dKernel : public ICpuKernel<CpuDirectConv2dKernel>
 {
 private:
-    using DirectConv2dKernel_Ptr = std::add_pointer<void(const Window &, const ITensor *, const ITensor *, ITensor *, const PadStrideInfo &)>::type;
+    using DirectConv2dKernel_Ptr = std::add_pointer<void(
+        const Window &, const ITensor *, const ITensor *, ITensor *, const PadStrideInfo &)>::type;
 
 public:
     CpuDirectConv2dKernel() = default;
@@ -64,10 +65,13 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info);
+    static Status validate(const ITensorInfo   *src,
+                           const ITensorInfo   *weights,
+                           const ITensorInfo   *dst,
+                           const PadStrideInfo &conv_info);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     struct DirectConv2dKernel
@@ -81,8 +85,8 @@
 
 private:
     PadStrideInfo _conv_info{};
-    unsigned int  _kernel_size{ 0 };
-    DataLayout    _data_layout{ DataLayout::UNKNOWN };
+    unsigned int  _kernel_size{0};
+    DataLayout    _data_layout{DataLayout::UNKNOWN};
 };
 } // namespace kernels
 } // namespace cpu
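
`DirectConv2dKernel_Ptr` names the micro-kernel signature once via `std::add_pointer` over a function type, which yields the corresponding function-pointer type. A standalone example of the same idiom (`my_kernel` and `KernelPtr` are illustrative names):

    #include <type_traits>

    void my_kernel(int a, float b)
    {
        (void)a;
        (void)b;
    }

    // Same idiom as DirectConv2dKernel_Ptr: std::add_pointer over a function type
    // produces the matching function-pointer type, here `void (*)(int, float)`.
    using KernelPtr = std::add_pointer<void(int, float)>::type;

    KernelPtr fn = &my_kernel; // assignable exactly like the kernel table entries
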
diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
index 93ad5e5..d4af8be 100644
--- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.cpp
@@ -27,15 +27,16 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/Traits.h"
+
 #include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 #include <cstddef>
@@ -49,7 +50,9 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+Status validate_arguments(const ITensorInfo                                 *src,
+                          const ITensorInfo                                 *bias,
+                          const ITensorInfo                                 *dst,
                           const DirectConvolutionLayerOutputStageKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
@@ -57,22 +60,23 @@
     ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::S32, DataType::F32);
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
-        ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL)));
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != src->dimension(get_data_layout_dimension_index(
+                                                              src->data_layout(), DataLayoutDimension::CHANNEL)));
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
     }
 
-    if(src->data_type() == DataType::S32)
+    if (src->data_type() == DataType::S32)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst == nullptr, "In-place computation not allowed for quantized output");
     }
 
     // Checks performed when output is configured
-    if((dst != nullptr) && (dst->total_size() != 0))
+    if ((dst != nullptr) && (dst->total_size() != 0))
     {
-        if(is_data_type_float(src->data_type()))
+        if (is_data_type_float(src->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         }
@@ -82,10 +86,11 @@
         }
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
     }
-    else if(src->data_type() == DataType::S32)
+    else if (src->data_type() == DataType::S32)
     {
         // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo
-        ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED));
+        ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) &&
+                                    (info.output_data_type != DataType::QASYMM8_SIGNED));
     }
 
     return Status{};
@@ -93,8 +98,13 @@
 
 template <typename T>
 typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
-                  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+output_stage_nchw(ITensor       *src,
+                  const ITensor *bias,
+                  const Window  &window,
+                  ITensor       *dst,
+                  int            result_fixedpoint_multiplier,
+                  int            result_shift,
+                  int            result_offset_after_shift)
 {
     const bool has_bias = bias != nullptr;
     /** SIMD vector tag type. */
@@ -113,50 +123,57 @@
 
     Iterator in(src, win);
     Iterator out(dst, win);
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            // Get bias and pointer to input
-            const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
-            auto       v_in   = wrapper::vloadq(in_ptr);
-
-            // Accumulate bias
-            if(has_bias)
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                const auto vb = wrapper::vdup_n(*reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
-                v_in          = wrapper::vadd(v_in, vb);
+                // Get bias and pointer to input
+                const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
+                auto       v_in   = wrapper::vloadq(in_ptr);
+
+                // Accumulate bias
+                if (has_bias)
+                {
+                    const auto vb = wrapper::vdup_n(
+                        *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
+                    v_in = wrapper::vadd(v_in, vb);
+                }
+
+                const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
+                wrapper::vstore(out_ptr, v_in);
             }
 
-            const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
-            wrapper::vstore(out_ptr, v_in);
-        }
-
-        // Left-overs loop
-        for(; x < window_end_x; ++x)
-        {
-            // Get bias and pointer to input
-            auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
-
-            // Accumulate bias
-            if(has_bias)
+            // Left-overs loop
+            for (; x < window_end_x; ++x)
             {
-                const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
-                s_in += b;
+                // Get bias and pointer to input
+                auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
+
+                // Accumulate bias
+                if (has_bias)
+                {
+                    const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
+                    s_in += b;
+                }
+
+                *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
             }
-
-            *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
-        }
-
-    },
-    in, out);
+        },
+        in, out);
 }
 
 template <typename T>
 typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
-output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
-                  int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+output_stage_nhwc(ITensor       *src,
+                  const ITensor *bias,
+                  const Window  &window,
+                  ITensor       *dst,
+                  int            result_fixedpoint_multiplier,
+                  int            result_shift,
+                  int            result_offset_after_shift)
 {
     const bool has_bias = bias != nullptr;
     ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
@@ -179,50 +196,59 @@
     Iterator bi(bias, window_bias);
     Iterator out(dst, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            // Get bias and pointer to input
-            const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
-            auto       v_in   = wrapper::vloadq(in_ptr + x);
-
-            // Accumulate bias
-            if(has_bias)
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
-                v_in                = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
+                // Get bias and pointer to input
+                const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
+                auto       v_in   = wrapper::vloadq(in_ptr + x);
+
+                // Accumulate bias
+                if (has_bias)
+                {
+                    const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
+                    v_in                = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
+                }
+
+                const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+                wrapper::vstore(out_ptr + x, v_in);
             }
 
-            const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-            wrapper::vstore(out_ptr + x, v_in);
-        }
-
-        // Left-overs loop
-        for(; x < window_end_x; ++x)
-        {
-            // Get bias and pointer to input
-            auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
-
-            // Accumulate bias
-            if(has_bias)
+            // Left-overs loop
+            for (; x < window_end_x; ++x)
             {
-                const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
-                s_in += *bias_ptr;
-            }
+                // Get bias and pointer to input
+                auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
 
-            const auto out_ptr = reinterpret_cast<T *>(out.ptr());
-            *(out_ptr + x)     = s_in;
-        }
-    },
-    in, bi, out);
+                // Accumulate bias
+                if (has_bias)
+                {
+                    const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
+                    s_in += *bias_ptr;
+                }
+
+                const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+                *(out_ptr + x)     = s_in;
+            }
+        },
+        in, bi, out);
 }
 
 // Quantized case
-template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
-void output_stage_nchw(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
-                       int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+template <
+    typename TOut,
+    typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0>
+void output_stage_nchw(ITensor       *src,
+                       const ITensor *bias,
+                       const Window  &window,
+                       ITensor       *dst,
+                       int            result_fixedpoint_multiplier,
+                       int            result_shift,
+                       int            result_offset_after_shift)
 {
     const bool has_bias = bias != nullptr;
     using VectorType    = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
@@ -242,67 +268,63 @@
     Iterator in(src, win);
     Iterator out(dst, win);
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            // Get bias and pointer to input
-            const auto  in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
-            int32x4x4_t v_in =
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
+                // Get bias and pointer to input
+                const auto  in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+                int32x4x4_t v_in = {{wrapper::vloadq(in_ptr), wrapper::vloadq(in_ptr + 4), wrapper::vloadq(in_ptr + 8),
+                                     wrapper::vloadq(in_ptr + 12)}};
+
+                // Accumulate bias
+                if (has_bias)
                 {
-                    wrapper::vloadq(in_ptr),
-                    wrapper::vloadq(in_ptr + 4),
-                    wrapper::vloadq(in_ptr + 8),
-                    wrapper::vloadq(in_ptr + 12)
+                    const auto vb = wrapper::vdup_n(
+                        *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
+                    v_in = {{wrapper::vadd(v_in.val[0], vb), wrapper::vadd(v_in.val[1], vb),
+                             wrapper::vadd(v_in.val[2], vb), wrapper::vadd(v_in.val[3], vb)}};
                 }
-            };
 
-            // Accumulate bias
-            if(has_bias)
+                const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+                wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift,
+                                                               result_offset_after_shift_s32, min, max, false));
+            }
+
+            // Left-overs loop
+            for (; x < window_end_x; ++x)
             {
-                const auto vb = wrapper::vdup_n(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
-                v_in =
+                // Get bias and pointer to input
+                int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+                // Accumulate bias
+                if (has_bias)
                 {
-                    {
-                        wrapper::vadd(v_in.val[0], vb),
-                        wrapper::vadd(v_in.val[1], vb),
-                        wrapper::vadd(v_in.val[2], vb),
-                        wrapper::vadd(v_in.val[3], vb)
-                    }
-                };
+                    const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
+                    s_in += b;
+                }
+
+                const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+                *out_ptr =
+                    finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
+                                          std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
             }
-
-            const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
-            wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32,
-                                                           min, max, false));
-        }
-
-        // Left-overs loop
-        for(; x < window_end_x; ++x)
-        {
-            // Get bias and pointer to input
-            int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
-            // Accumulate bias
-            if(has_bias)
-            {
-                const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
-                s_in += b;
-            }
-
-            const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
-            *out_ptr           = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
-                                                       std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
-        }
-    },
-    in, out);
+        },
+        in, out);
 }
-template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
-void output_stage_nhwc(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
-                       int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift)
+template <
+    typename TOut,
+    typename std::enable_if<std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int>::type = 0>
+void output_stage_nhwc(ITensor       *src,
+                       const ITensor *bias,
+                       const Window  &window,
+                       ITensor       *dst,
+                       int            result_fixedpoint_multiplier,
+                       int            result_shift,
+                       int            result_offset_after_shift)
 {
     const bool has_bias = bias != nullptr;
     using VectorType    = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
@@ -329,62 +351,65 @@
     Iterator bi(bias, window_bias);
     Iterator out(dst, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            // Get bias and pointer to input
-            const auto  in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
-            int32x4x4_t v_in =
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
+                // Get bias and pointer to input
+                const auto  in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+                int32x4x4_t v_in   = {{
+                      wrapper::vloadq(in_ptr),
+                      wrapper::vloadq(in_ptr + 4),
+                      wrapper::vloadq(in_ptr + 8),
+                      wrapper::vloadq(in_ptr + 12),
+                }};
+
+                // Accumulate bias
+                if (has_bias)
                 {
-                    wrapper::vloadq(in_ptr),
-                    wrapper::vloadq(in_ptr + 4),
-                    wrapper::vloadq(in_ptr + 8),
-                    wrapper::vloadq(in_ptr + 12),
+                    const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
+
+                    v_in.val[0] = wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
+                    v_in.val[1] = wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
+                    v_in.val[2] = wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
+                    v_in.val[3] = wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
                 }
-            };
 
-            // Accumulate bias
-            if(has_bias)
-            {
-                const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
-
-                wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
-                wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
-                wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
-                wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
+                const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+                wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift,
+                                                               result_offset_after_shift_s32, min, max, false));
             }
 
-            const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
-            wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
-        }
-
-        // Left-overs loop
-        for(; x < window_end_x; ++x)
-        {
-            // Get bias and pointer to input
-            const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
-            int32_t    s_in   = *in_ptr;
-
-            // Accumulate bias
-            if(has_bias)
+            // Left-overs loop
+            for (; x < window_end_x; ++x)
             {
-                const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
-                s_in += *bias_ptr;
-            }
+                // Get bias and pointer to input
+                const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+                int32_t    s_in   = *in_ptr;
 
-            const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
-            *out_ptr           = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
-                                                       std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
-        }
-    },
-    in, bi, out);
+                // Accumulate bias
+                if (has_bias)
+                {
+                    const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
+                    s_in += *bias_ptr;
+                }
+
+                const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+                *out_ptr =
+                    finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
+                                          std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
+            }
+        },
+        in, bi, out);
 }
 } // namespace
 
-void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+void CpuDirectConv2dOutputStageKernel::configure(ITensorInfo                                       *src,
+                                                 const ITensorInfo                                 *bias,
+                                                 ITensorInfo                                       *dst,
                                                  const DirectConvolutionLayerOutputStageKernelInfo &info)
 {
     ARM_COMPUTE_UNUSED(bias);
@@ -398,7 +423,7 @@
     _result_offset_after_shift    = info.result_offset_after_shift;
 
     // Auto-initialize output if required
-    if(dst != nullptr)
+    if (dst != nullptr)
     {
         // Work out expected output data type
         const DataType output_dt = (src->data_type() == DataType::S32) ? info.output_data_type : DataType::S32;
@@ -410,16 +435,17 @@
 
     ICpuKernel::configure(win);
 
-    const bool is_qasymm8_signed = (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false;
+    const bool is_qasymm8_signed =
+        (dst != nullptr) ? is_data_type_quantized_asymmetric_signed(dst->data_type()) : false;
 
     // Set appropriate function
-    if(src->data_layout() == DataLayout::NCHW)
+    if (src->data_layout() == DataLayout::NCHW)
     {
-        switch(src->data_type())
+        switch (src->data_type())
         {
             case DataType::S32:
             {
-                if(is_qasymm8_signed)
+                if (is_qasymm8_signed)
                 {
                     _func = &output_stage_nchw<int8_t>;
                 }
@@ -449,11 +475,11 @@
     }
     else
     {
-        switch(src->data_type())
+        switch (src->data_type())
         {
             case DataType::S32:
             {
-                if(is_qasymm8_signed)
+                if (is_qasymm8_signed)
                 {
                     _func = &output_stage_nhwc<int8_t>;
                 }
@@ -483,7 +509,9 @@
     }
 }
 
-Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+Status CpuDirectConv2dOutputStageKernel::validate(const ITensorInfo                                 *src,
+                                                  const ITensorInfo                                 *bias,
+                                                  const ITensorInfo                                 *dst,
                                                   const DirectConvolutionLayerOutputStageKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info));
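
In the quantized paths above, `finalize_quantization` requantizes an S32 accumulator back to 8 bits: a fixed-point multiply by `result_fixedpoint_multiplier`, a right shift by `result_shift`, adding `result_offset_after_shift`, then clamping to the output type's range. A scalar approximation of that computation, assuming a Q0.31 multiplier and a non-negative shift (the real kernel performs the multiply with saturating rounding, vqrdmulh-style, omitted here for readability):

    #include <algorithm>
    #include <cstdint>
    #include <limits>

    // Simplified stand-in for finalize_quantization(): `multiplier` is treated as
    // a Q0.31 fixed-point value and `shift` as non-negative; rounding and
    // saturation of the intermediate multiply are omitted.
    template <typename TOut>
    TOut requantize(int32_t acc, int32_t multiplier, int shift, int32_t offset)
    {
        int64_t v = (static_cast<int64_t>(acc) * multiplier) >> (31 + shift);
        v += offset;
        v = std::max<int64_t>(v, std::numeric_limits<TOut>::lowest());
        v = std::min<int64_t>(v, std::numeric_limits<TOut>::max());
        return static_cast<TOut>(v);
    }

    // e.g. requantize<uint8_t>(...) clamps to [0, 255], requantize<int8_t>(...)
    // to [-128, 127], matching the lowest()/max() bounds used above.
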
diff --git a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
index d3ef17b..ce84f49 100644
--- a/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
+++ b/src/cpu/kernels/CpuDirectConv2dOutputStageKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_DIRECT_CONV2D_OUTPUT_STAGE_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -55,29 +56,40 @@
      *                      Data type supported: F16/F32 or QASYMM8/QASYMM8_SIGNED if @p src is S32
      * @param[in]      info (Optional) DirectConvolutionLayerOutputStageKernel descriptor metadata
      */
-    void configure(ITensorInfo *src, const ITensorInfo *bias = nullptr, ITensorInfo *dst = nullptr,
-                   const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
+    void
+    configure(ITensorInfo                                       *src,
+              const ITensorInfo                                 *bias = nullptr,
+              ITensorInfo                                       *dst  = nullptr,
+              const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuDirectConv2dOutputStageKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *bias = nullptr, const ITensorInfo *dst = nullptr,
-                           const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
+    static Status
+    validate(const ITensorInfo                                 *src,
+             const ITensorInfo                                 *bias = nullptr,
+             const ITensorInfo                                 *dst  = nullptr,
+             const DirectConvolutionLayerOutputStageKernelInfo &info = DirectConvolutionLayerOutputStageKernelInfo());
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
-    using OutputStageKernel = void(ITensor *src, const ITensor *bias, const Window &window, ITensor *dst,
-                                   int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift);
+    using OutputStageKernel = void(ITensor       *src,
+                                   const ITensor *bias,
+                                   const Window  &window,
+                                   ITensor       *dst,
+                                   int            result_fixedpoint_multiplier,
+                                   int            result_shift,
+                                   int            result_offset_after_shift);
 
-    OutputStageKernel *_func{ nullptr };
-    int                _result_fixedpoint_multiplier{ 0 };
-    int                _result_shift{ 0 };
-    int                _result_offset_after_shift{ 0 };
+    OutputStageKernel *_func{nullptr};
+    int                _result_fixedpoint_multiplier{0};
+    int                _result_shift{0};
+    int                _result_offset_after_shift{0};
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.cpp b/src/cpu/kernels/CpuDirectConv3dKernel.cpp
index 22c60cd..b5b2aed 100644
--- a/src/cpu/kernels/CpuDirectConv3dKernel.cpp
+++ b/src/cpu/kernels/CpuDirectConv3dKernel.cpp
@@ -29,12 +29,13 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/kernels/conv3d/neon/list.h"
 
 #include <algorithm>
@@ -49,43 +50,37 @@
 {
 namespace
 {
-static const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> available_kernels =
-{
+static const std::vector<CpuDirectConv3dKernel::DirectConv3dKernel> available_kernels = {
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    {
-        "neon_fp16_directconv3d",
-        [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float16_t>)
-    },
+    {"neon_fp16_directconv3d",
+     [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float16_t>)},
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) */
-    {
-        "neon_fp32_directconv3d",
-        [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float>)
-    },
-    {
-        "neon_qasymm8_directconv3d",
-        [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<uint8_t>)
-    },
-    {
-        "neon_qasymm8_signed_directconv3d",
-        [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<int8_t>)
-    }
-};
+    {"neon_fp32_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+     REGISTER_FP32_NEON(arm_compute::cpu::directconv3d_float_neon_ndhwc<float>)},
+    {"neon_qasymm8_directconv3d", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<uint8_t>)},
+    {"neon_qasymm8_signed_directconv3d",
+     [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::directconv3d_quantized_neon_ndhwc<int8_t>)}};
 
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info)
+Status validate_arguments(const ITensorInfo *src0,
+                          const ITensorInfo *src1,
+                          const ITensorInfo *src2,
+                          const ITensorInfo *dst,
+                          const Conv3dInfo  &conv_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(src0->data_layout() != DataLayout::NDHWC);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation != Size3D(1U, 1U, 1U));
 
-    const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() });
+    const auto *uk =
+        CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()});
 
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
@@ -96,9 +91,9 @@
     ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 5);
     ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != src0->dimension(channel_idx));
 
-    if(src2 != nullptr)
+    if (src2 != nullptr)
     {
-        if(is_data_type_quantized(src0->data_type()))
+        if (is_data_type_quantized(src0->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32);
         }
@@ -106,14 +101,16 @@
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
         }
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), "Biases size and number of dst feature maps should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0),
+                                        "Biases size and number of dst feature maps should match");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional");
     }
 
     // Checks performed when output is configured
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        TensorShape output_shape = misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
+        TensorShape output_shape =
+            misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
 
         DataType data_type = src0->data_type();
 
@@ -125,12 +122,17 @@
 }
 } // namespace
 
-void CpuDirectConv3dKernel::configure(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv_info)
+void CpuDirectConv3dKernel::configure(const ITensorInfo *src0,
+                                      const ITensorInfo *src1,
+                                      const ITensorInfo *src2,
+                                      ITensorInfo       *dst,
+                                      const Conv3dInfo  &conv_info)
 {
     ARM_COMPUTE_UNUSED(src2);
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
 
-    const auto *uk = CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa() });
+    const auto *uk =
+        CpuDirectConv3dKernel::get_implementation(DataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa()});
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
 
@@ -139,7 +141,8 @@
     _name       = std::string("CpuDirectConv3dKernel").append("/").append(uk->name);
 
     // Get convolved dimensions
-    TensorShape output_shape = misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
+    TensorShape output_shape =
+        misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv_info);
 
     DataType data_type = src0->data_type();
 
@@ -154,7 +157,11 @@
     ICpuKernel::configure(win);
 }
 
-Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info)
+Status CpuDirectConv3dKernel::validate(const ITensorInfo *src0,
+                                       const ITensorInfo *src1,
+                                       const ITensorInfo *src2,
+                                       const ITensorInfo *dst,
+                                       const Conv3dInfo  &conv_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv_info));
 
@@ -188,4 +195,4 @@
 
 } // namespace kernels
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
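
The `compute_conv3d_shape` calls above derive each spatial output dimension from standard convolution arithmetic; since `validate_arguments` rejects any dilation other than `Size3D(1U, 1U, 1U)`, the per-axis extent reduces to the familiar formula. A one-line sketch (hypothetical helper, not the library's signature):

    // Output extent along one spatial axis for a strided, padded convolution with
    // dilation == 1 (the only case validate_arguments() accepts above).
    // e.g. in = 16, kernel = 3, pads 1 + 1, stride 2 -> (16 + 2 - 3) / 2 + 1 = 8
    constexpr int conv_out_dim(int in, int kernel, int pad_begin, int pad_end, int stride)
    {
        return (in + pad_begin + pad_end - kernel) / stride + 1;
    }
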
diff --git a/src/cpu/kernels/CpuDirectConv3dKernel.h b/src/cpu/kernels/CpuDirectConv3dKernel.h
index 688f368..8e6f564 100644
--- a/src/cpu/kernels/CpuDirectConv3dKernel.h
+++ b/src/cpu/kernels/CpuDirectConv3dKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_DIRECT_CONV3D_KERNEL_H
 
 #include "arm_compute/runtime/FunctionDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -39,7 +40,8 @@
 {
 private:
     /* Template function for convolution 3d NDHWC */
-    using DirectConv3dKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, const ITensor *, ITensor *, const Conv3dInfo &, const Window &)>::type;
+    using DirectConv3dKernelPtr = std::add_pointer<void(
+        const ITensor *, const ITensor *, const ITensor *, ITensor *, const Conv3dInfo &, const Window &)>::type;
 
 public:
     CpuDirectConv3dKernel() = default;
@@ -63,17 +65,25 @@
      * @param[in]      conv_info Contains padding, stride, activation information.
      *
      */
-    void configure(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv_info);
+    void configure(const ITensorInfo *src0,
+                   const ITensorInfo *src1,
+                   const ITensorInfo *src2,
+                   ITensorInfo       *dst,
+                   const Conv3dInfo  &conv_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuDirectConv3dKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv_info);
+    static Status validate(const ITensorInfo *src0,
+                           const ITensorInfo *src1,
+                           const ITensorInfo *src2,
+                           const ITensorInfo *dst,
+                           const Conv3dInfo  &conv_info);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     struct DirectConv3dKernel
@@ -87,7 +97,7 @@
 
 private:
     Conv3dInfo            _conv_info{};
-    DirectConv3dKernelPtr _run_method{ nullptr };
+    DirectConv3dKernelPtr _run_method{nullptr};
     std::string           _name{};
 };
 
diff --git a/src/cpu/kernels/CpuElementwiseKernel.cpp b/src/cpu/kernels/CpuElementwiseKernel.cpp
index a045855..57a3f39 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseKernel.cpp
@@ -24,8 +24,9 @@
 #include "src/cpu/kernels/CpuElementwiseKernel.h"
 
 #include "arm_compute/core/Helpers.h"
-#include "src/core/CPP/Validate.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/elementwise_binary/list.h"
@@ -35,11 +36,11 @@
 #if defined(ENABLE_FP32_KERNELS)
 namespace
 {
-    static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308;
-    static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772;
-    static constexpr size_t default_div_mws_N1_fp32_neon = 19043;
-    static constexpr size_t default_div_mws_V1_fp32_neon = 25511;
-}
+static constexpr size_t default_min_max_mws_N1_fp32_neon = 25308;
+static constexpr size_t default_min_max_mws_V1_fp32_neon = 34772;
+static constexpr size_t default_div_mws_N1_fp32_neon     = 19043;
+static constexpr size_t default_div_mws_V1_fp32_neon     = 25511;
+} // namespace
 #endif /* ENABLE_FP32_KERNELS */
 
 namespace arm_compute
@@ -50,255 +51,178 @@
 {
 namespace
 {
-template <ArithmeticOperation                                                   op>
-const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels_arithmetic =
-{
-    {
-        "sve2_qu8_arithmetic",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op;
-        },
-        REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary<op>)
-    },
-    {
-        "sve2_qs8_arithmetic",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op;
-        },
-        REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary<op>)
-    },
-    {
-        "sve_fp32_arithmetic",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::F32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op;
-        },
-        REGISTER_FP32_SVE(sve_fp32_elementwise_binary<op>)
-    },
-    {
-        "sve_s32_arithmetic",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::S32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op;
-        },
-        REGISTER_INTEGER_SVE(sve_s32_elementwise_binary<op>)
-    },
-    {
-        "sve_s16_arithmetic",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::S16 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op;
-        },
-        REGISTER_INTEGER_SVE(sve_s16_elementwise_binary<op>)
-    },
-    {
-        "sve_fp16_arithmetic",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op;
-        },
-        REGISTER_FP16_SVE(sve_fp16_elementwise_binary<op>)
-    },
-    {
-        "neon_fp32_arithmetic",
+template <ArithmeticOperation op>
+const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels_arithmetic = {
+    {"sve2_qu8_arithmetic",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op; },
+     REGISTER_QASYMM8_SVE2(sve2_qasymm8_elementwise_binary<op>)},
+    {"sve2_qs8_arithmetic",
+     [](const ElementwiseDataTypeISASelectorData &data) {
+         return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ArithmeticOperation>(data.op) == op;
+     },
+     REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_elementwise_binary<op>)},
+    {"sve_fp32_arithmetic",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::F32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; },
+     REGISTER_FP32_SVE(sve_fp32_elementwise_binary<op>)},
+    {"sve_s32_arithmetic",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::S32 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; },
+     REGISTER_INTEGER_SVE(sve_s32_elementwise_binary<op>)},
+    {"sve_s16_arithmetic",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::S16 && data.isa.sve && static_cast<ArithmeticOperation>(data.op) == op; },
+     REGISTER_INTEGER_SVE(sve_s16_elementwise_binary<op>)},
+    {"sve_fp16_arithmetic",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     {
+         return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+                static_cast<ArithmeticOperation>(data.op) == op;
+     },
+     REGISTER_FP16_SVE(sve_fp16_elementwise_binary<op>)},
+    {"neon_fp32_arithmetic",
 
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::F32 && static_cast<ArithmeticOperation>(data.op) == op;
-        },
-        REGISTER_FP32_NEON(neon_fp32_elementwise_binary<op>)
-    },
-    {
-        "neon_s32_arithmetic",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::S32 && static_cast<ArithmeticOperation>(data.op) == op;
-        },
-        REGISTER_INTEGER_NEON(neon_s32_elementwise_binary<op>)
-    },
-    {
-        "neon_fp16_arithmetic",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op;
-        },
-        REGISTER_FP16_NEON(neon_fp16_elementwise_binary<op>)
-    },
-    {
-        "neon_s16_arithmetic",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::S16 && static_cast<ArithmeticOperation>(data.op) == op;
-        },
-        REGISTER_INTEGER_NEON(neon_s16_elementwise_binary<op>)
-    },
-    {
-        "neon_qu8_arithmetic",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::QASYMM8 && static_cast<ArithmeticOperation>(data.op) == op;
-        },
-        REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary<op>)
-    },
-    {
-        "neon_qs8_arithmetic",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::QASYMM8_SIGNED && static_cast<ArithmeticOperation>(data.op) == op;
-        },
-        REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary<op>)
-    },
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::F32 && static_cast<ArithmeticOperation>(data.op) == op; },
+     REGISTER_FP32_NEON(neon_fp32_elementwise_binary<op>)},
+    {"neon_s32_arithmetic",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::S32 && static_cast<ArithmeticOperation>(data.op) == op; },
+     REGISTER_INTEGER_NEON(neon_s32_elementwise_binary<op>)},
+    {"neon_fp16_arithmetic",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ArithmeticOperation>(data.op) == op; },
+     REGISTER_FP16_NEON(neon_fp16_elementwise_binary<op>)},
+    {"neon_s16_arithmetic",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::S16 && static_cast<ArithmeticOperation>(data.op) == op; },
+     REGISTER_INTEGER_NEON(neon_s16_elementwise_binary<op>)},
+    {"neon_qu8_arithmetic",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::QASYMM8 && static_cast<ArithmeticOperation>(data.op) == op; },
+     REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_binary<op>)},
+    {"neon_qs8_arithmetic",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::QASYMM8_SIGNED && static_cast<ArithmeticOperation>(data.op) == op; },
+     REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_binary<op>)},
 };
-template <ComparisonOperation                                                   op>
-const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels_comperison =
-{
-    {
-        "sve2_qu8_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary<op>)
-    },
-    {
-        "sve2_qs8_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary<op>)
-    },
-    {
-        "sve_u8_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::U8 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary<op>)
-    },
-    {
-        "sve_fp32_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::F32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary<op>)
-    },
-    {
-        "sve_s16_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::S16 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary<op>)
-    },
-    {
-        "sve_s32_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::S32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary<op>)
-    },
-    {
-        "sve_fp16_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary<op>)
-    },
-    {
-        "neon_u8_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::U8 && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary<op>)
-    },
-    {
-        "neon_fp32_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::F32 && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary<op>)
-    },
-    {
-        "neon_s16_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::S16 && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary<op>)
-    },
-    {
-        "neon_s32_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::S32 && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary<op>)
-    },
-    {
-        "neon_qu8_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::QASYMM8 && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary<op>)
-    },
-    {
-        "neon_qs8_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::QASYMM8_SIGNED && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary<op>)
-    },
-    {
-        "neon_fp16_comparison",
-        [](const ElementwiseDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op;
-        },
-        REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary<op>)
-    },
+template <ComparisonOperation op>
+const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels_comperison = {
+    {"sve2_qu8_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::QASYMM8 && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op; },
+     REGISTER_QASYMM8_SVE2(sve2_qasymm8_comparison_elementwise_binary<op>)},
+    {"sve2_qs8_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data) {
+         return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve2 && static_cast<ComparisonOperation>(data.op) == op;
+     },
+     REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_comparison_elementwise_binary<op>)},
+    {"sve_u8_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::U8 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+     REGISTER_INTEGER_SVE(sve_u8_comparison_elementwise_binary<op>)},
+    {"sve_fp32_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::F32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+     REGISTER_FP32_SVE(sve_fp32_comparison_elementwise_binary<op>)},
+    {"sve_s16_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::S16 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+     REGISTER_INTEGER_SVE(sve_s16_comparison_elementwise_binary<op>)},
+    {"sve_s32_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::S32 && data.isa.sve && static_cast<ComparisonOperation>(data.op) == op; },
+     REGISTER_INTEGER_SVE(sve_s32_comparison_elementwise_binary<op>)},
+    {"sve_fp16_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     {
+         return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+                static_cast<ComparisonOperation>(data.op) == op;
+     },
+     REGISTER_FP16_SVE(sve_fp16_comparison_elementwise_binary<op>)},
+    {"neon_u8_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::U8 && static_cast<ComparisonOperation>(data.op) == op; },
+     REGISTER_INTEGER_NEON(neon_u8_comparison_elementwise_binary<op>)},
+    {"neon_fp32_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::F32 && static_cast<ComparisonOperation>(data.op) == op; },
+     REGISTER_FP32_NEON(neon_fp32_comparison_elementwise_binary<op>)},
+    {"neon_s16_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::S16 && static_cast<ComparisonOperation>(data.op) == op; },
+     REGISTER_INTEGER_NEON(neon_s16_comparison_elementwise_binary<op>)},
+    {"neon_s32_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::S32 && static_cast<ComparisonOperation>(data.op) == op; },
+     REGISTER_INTEGER_NEON(neon_s32_comparison_elementwise_binary<op>)},
+    {"neon_qu8_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::QASYMM8 && static_cast<ComparisonOperation>(data.op) == op; },
+     REGISTER_QASYMM8_NEON(neon_qasymm8_comparison_elementwise_binary<op>)},
+    {"neon_qs8_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::QASYMM8_SIGNED && static_cast<ComparisonOperation>(data.op) == op; },
+     REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_comparison_elementwise_binary<op>)},
+    {"neon_fp16_comparison",
+     [](const ElementwiseDataTypeISASelectorData &data)
+     { return data.dt == DataType::F16 && data.isa.fp16 && static_cast<ComparisonOperation>(data.op) == op; },
+     REGISTER_FP16_NEON(neon_fp16_comparison_elementwise_binary<op>)},
 };
 } // namespace
 
-const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &CpuArithmeticKernel::get_available_kernels()
+const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &
+CpuArithmeticKernel::get_available_kernels()
 {
     static std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> available_kernels;
-    std::move(available_kernels_arithmetic<ArithmeticOperation::ADD>.begin(), available_kernels_arithmetic<ArithmeticOperation::ADD>.end(), std::back_inserter(available_kernels));
-    std::move(available_kernels_arithmetic<ArithmeticOperation::SUB>.begin(), available_kernels_arithmetic<ArithmeticOperation::SUB>.end(), std::back_inserter(available_kernels));
-    std::move(available_kernels_arithmetic<ArithmeticOperation::DIV>.begin(), available_kernels_arithmetic<ArithmeticOperation::DIV>.end(), std::back_inserter(available_kernels));
-    std::move(available_kernels_arithmetic<ArithmeticOperation::MIN>.begin(), available_kernels_arithmetic<ArithmeticOperation::MIN>.end(), std::back_inserter(available_kernels));
-    std::move(available_kernels_arithmetic<ArithmeticOperation::MAX>.begin(), available_kernels_arithmetic<ArithmeticOperation::MAX>.end(), std::back_inserter(available_kernels));
-    std::move(available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.begin(), available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.end(), std::back_inserter(available_kernels));
-    std::move(available_kernels_arithmetic<ArithmeticOperation::POWER>.begin(), available_kernels_arithmetic<ArithmeticOperation::POWER>.end(), std::back_inserter(available_kernels));
-    std::move(available_kernels_arithmetic<ArithmeticOperation::PRELU>.begin(), available_kernels_arithmetic<ArithmeticOperation::PRELU>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_arithmetic<ArithmeticOperation::ADD>.begin(),
+              available_kernels_arithmetic<ArithmeticOperation::ADD>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_arithmetic<ArithmeticOperation::SUB>.begin(),
+              available_kernels_arithmetic<ArithmeticOperation::SUB>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_arithmetic<ArithmeticOperation::DIV>.begin(),
+              available_kernels_arithmetic<ArithmeticOperation::DIV>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_arithmetic<ArithmeticOperation::MIN>.begin(),
+              available_kernels_arithmetic<ArithmeticOperation::MIN>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_arithmetic<ArithmeticOperation::MAX>.begin(),
+              available_kernels_arithmetic<ArithmeticOperation::MAX>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.begin(),
+              available_kernels_arithmetic<ArithmeticOperation::SQUARED_DIFF>.end(),
+              std::back_inserter(available_kernels));
+    std::move(available_kernels_arithmetic<ArithmeticOperation::POWER>.begin(),
+              available_kernels_arithmetic<ArithmeticOperation::POWER>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_arithmetic<ArithmeticOperation::PRELU>.begin(),
+              available_kernels_arithmetic<ArithmeticOperation::PRELU>.end(), std::back_inserter(available_kernels));
 
     return available_kernels;
 }
 
-const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &CpuComparisonKernel::get_available_kernels()
+const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &
+CpuComparisonKernel::get_available_kernels()
 {
     static std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> available_kernels;
-    std::move(available_kernels_comperison<ComparisonOperation::Equal>.begin(), available_kernels_comperison<ComparisonOperation::Equal>.end(), std::back_inserter(available_kernels));
-    std::move(available_kernels_comperison<ComparisonOperation::NotEqual>.begin(), available_kernels_comperison<ComparisonOperation::NotEqual>.end(), std::back_inserter(available_kernels));
-    std::move(available_kernels_comperison<ComparisonOperation::Greater>.begin(), available_kernels_comperison<ComparisonOperation::Greater>.end(), std::back_inserter(available_kernels));
-    std::move(available_kernels_comperison<ComparisonOperation::GreaterEqual>.begin(), available_kernels_comperison<ComparisonOperation::GreaterEqual>.end(), std::back_inserter(available_kernels));
-    std::move(available_kernels_comperison<ComparisonOperation::Less>.begin(), available_kernels_comperison<ComparisonOperation::Less>.end(), std::back_inserter(available_kernels));
-    std::move(available_kernels_comperison<ComparisonOperation::LessEqual>.begin(), available_kernels_comperison<ComparisonOperation::LessEqual>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_comperison<ComparisonOperation::Equal>.begin(),
+              available_kernels_comperison<ComparisonOperation::Equal>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_comperison<ComparisonOperation::NotEqual>.begin(),
+              available_kernels_comperison<ComparisonOperation::NotEqual>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_comperison<ComparisonOperation::Greater>.begin(),
+              available_kernels_comperison<ComparisonOperation::Greater>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_comperison<ComparisonOperation::GreaterEqual>.begin(),
+              available_kernels_comperison<ComparisonOperation::GreaterEqual>.end(),
+              std::back_inserter(available_kernels));
+    std::move(available_kernels_comperison<ComparisonOperation::Less>.begin(),
+              available_kernels_comperison<ComparisonOperation::Less>.end(), std::back_inserter(available_kernels));
+    std::move(available_kernels_comperison<ComparisonOperation::LessEqual>.begin(),
+              available_kernels_comperison<ComparisonOperation::LessEqual>.end(),
+              std::back_inserter(available_kernels));
 
     return available_kernels;
 }
 
 template <class Derived>
-Status CpuElementwiseKernel<Derived>::validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
+Status CpuElementwiseKernel<Derived>::validate_arguments_common(const ITensorInfo &src0,
+                                                                const ITensorInfo &src1,
+                                                                const ITensorInfo &dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
@@ -308,7 +232,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
     // Validate in case of configured dst
-    if(dst.total_size() > 0)
+    if (dst.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
                                         "Wrong shape for output");
@@ -321,7 +245,8 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
 
-    const auto *uk = CpuArithmeticKernel::get_implementation(ElementwiseDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op) });
+    const auto *uk = CpuArithmeticKernel::get_implementation(
+        ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op)});
 
     ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
@@ -329,7 +254,7 @@
     _name       = std::string("CpuArithmeticKernel").append("/").append(uk->name);
 
     // If any of the shapes is dynamic, expect a configured window and dst at run-time.
-    if(src0->is_dynamic() || src1->is_dynamic())
+    if (src0->is_dynamic() || src1->is_dynamic())
     {
         return;
     }
@@ -343,7 +268,8 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
 
-    const auto *uk = CpuComparisonKernel::get_implementation(ElementwiseDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op) });
+    const auto *uk = CpuComparisonKernel::get_implementation(
+        ElementwiseDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), static_cast<int>(_op)});
 
     ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
@@ -351,7 +277,7 @@
     _name       = std::string("CpuComparisonKernel").append("/").append(uk->name);
 
     // If any of the shapes is dynamic, expect a configured window and dst at run-time.
-    if(src0->is_dynamic() || src1->is_dynamic())
+    if (src0->is_dynamic() || src1->is_dynamic())
     {
         return;
     }
@@ -373,8 +299,10 @@
 
     _run_method(src0, src1, dst, window);
 }
-template void CpuElementwiseKernel<CpuArithmeticKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
-template void CpuElementwiseKernel<CpuComparisonKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
+template void
+CpuElementwiseKernel<CpuArithmeticKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
+template void
+CpuElementwiseKernel<CpuComparisonKernel>::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info);
 
 template <class Derived>
 const char *CpuElementwiseKernel<Derived>::name() const
@@ -385,7 +313,10 @@
 template const char *CpuElementwiseKernel<CpuComparisonKernel>::name() const;
 
 /** Arithmetic operators (min, max, squared_diff) */
-void CpuArithmeticKernel::configure(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+void CpuArithmeticKernel::configure(ArithmeticOperation op,
+                                    const ITensorInfo  *src0,
+                                    const ITensorInfo  *src1,
+                                    ITensorInfo        *dst)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
     _op = op;
@@ -394,16 +325,20 @@
 
 Status CpuArithmeticKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::S16, DataType::F16, DataType::S32, DataType::F32);
     // Validate in case of configured dst
-    if(dst.total_size() > 0)
+    if (dst.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
     }
     return validate_arguments_common(src0, src1, dst);
 }
 
-Status CpuArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+Status CpuArithmeticKernel::validate(ArithmeticOperation op,
+                                     const ITensorInfo  *src0,
+                                     const ITensorInfo  *src1,
+                                     const ITensorInfo  *dst)
 {
     ARM_COMPUTE_UNUSED(op);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
@@ -416,15 +351,15 @@
     ARM_COMPUTE_UNUSED(thread_count);
 
 #if defined(ENABLE_FP32_KERNELS)
-    if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN>
-    || this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>)
+    if (this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MIN> ||
+        this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::MAX>)
     {
         size_t mws = ICPPKernel::default_mws;
-        if(platform.get_cpu_model() == CPUModel::N1)
+        if (platform.get_cpu_model() == CPUModel::N1)
         {
             mws = default_min_max_mws_N1_fp32_neon;
         }
-        else if(platform.get_cpu_model() == CPUModel::V1)
+        else if (platform.get_cpu_model() == CPUModel::V1)
         {
             mws = default_min_max_mws_V1_fp32_neon;
         }
@@ -434,7 +369,7 @@
         }
 
         // tensor is 1D or was re-interpreted as 1D
-        if(this->window().shape().num_dimensions() == 1)
+        if (this->window().shape().num_dimensions() == 1)
         {
             return mws;
         }
@@ -447,7 +382,7 @@
             return std::max(static_cast<size_t>(1), mws);
         }
     }
-#else /* ENABLE_FP32_KERNELS */
+#else  /* ENABLE_FP32_KERNELS */
     ARM_COMPUTE_UNUSED(platform);
 #endif /* ENABLE_FP32_KERNELS */
     return ICPPKernel::default_mws;
@@ -467,14 +402,14 @@
     ARM_COMPUTE_UNUSED(thread_count);
 
 #if defined(ENABLE_FP32_KERNELS)
-    if(this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>)
+    if (this->_run_method == &neon_fp32_elementwise_binary<ArithmeticOperation::DIV>)
     {
         size_t mws = ICPPKernel::default_mws;
-        if(platform.get_cpu_model() == CPUModel::N1)
+        if (platform.get_cpu_model() == CPUModel::N1)
         {
             mws = default_div_mws_N1_fp32_neon;
         }
-        else if(platform.get_cpu_model() == CPUModel::V1)
+        else if (platform.get_cpu_model() == CPUModel::V1)
         {
             mws = default_div_mws_V1_fp32_neon;
         }
@@ -484,7 +419,7 @@
         }
 
         // tensor is 1D or was re-interpreted as 1D
-        if(this->window().shape().num_dimensions() == 1)
+        if (this->window().shape().num_dimensions() == 1)
         {
             return mws;
         }
@@ -497,7 +432,7 @@
             return std::max(static_cast<size_t>(1), mws);
         }
     }
-#else /* ENABLE_FP32_KERNELS */
+#else  /* ENABLE_FP32_KERNELS */
     ARM_COMPUTE_UNUSED(platform);
 #endif /* ENABLE_FP32_KERNELS */
     return ICPPKernel::default_mws;
@@ -538,7 +473,10 @@
 }
 
 /** Comparison operators (equal, not equal, less than, greater than, less than or equal, greater than or equal) */
-void CpuComparisonKernel::configure(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst)
+void CpuComparisonKernel::configure(ComparisonOperation op,
+                                    const ITensorInfo  *src0,
+                                    const ITensorInfo  *src1,
+                                    ITensorInfo        *dst)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src0, *src1, *dst));
     _op = op;
@@ -547,16 +485,21 @@
 
 Status CpuComparisonKernel::validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S16, DataType::F16,
+                                                         DataType::S32, DataType::F32);
     // Validate in case of configured dst
-    if(dst.total_size() > 0)
+    if (dst.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::U8);
     }
     return validate_arguments_common(src0, src1, dst);
 }
 
-Status CpuComparisonKernel::validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+Status CpuComparisonKernel::validate(ComparisonOperation op,
+                                     const ITensorInfo  *src0,
+                                     const ITensorInfo  *src1,
+                                     const ITensorInfo  *dst)
 {
     ARM_COMPUTE_UNUSED(op);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
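
For context on the hunks above: the available_kernels_* tables implement table-driven kernel dispatch, where each entry pairs a name, a selection predicate, and a micro-kernel pointer, and the first entry whose predicate accepts the runtime data wins (ordering the table SVE2 before SVE before NEON is what encodes implementation priority). Below is a minimal, self-contained sketch of that pattern; SelectorData, KernelEntry, and pick_kernel are illustrative stand-ins, not ACL API.

#include <cstdio>
#include <vector>

// Illustrative stand-ins for ACL's selector data and micro-kernel signature.
struct SelectorData
{
    bool is_fp32;
    bool has_sve;
};
using KernelFn = void (*)(const float *, const float *, float *, int);

struct KernelEntry
{
    const char *name;
    bool (*matches)(const SelectorData &); // selection predicate, like the lambdas above
    KernelFn    ukernel;                   // implementation to run when it matches
};

void add_scalar(const float *a, const float *b, float *dst, int n)
{
    for (int i = 0; i < n; ++i)
    {
        dst[i] = a[i] + b[i];
    }
}

// First entry whose predicate accepts the runtime data wins.
const KernelEntry *pick_kernel(const std::vector<KernelEntry> &table, const SelectorData &data)
{
    for (const auto &entry : table)
    {
        if (entry.matches(data))
        {
            return &entry;
        }
    }
    return nullptr;
}

int main()
{
    const std::vector<KernelEntry> table = {
        {"scalar_fp32_add", [](const SelectorData &d) { return d.is_fp32; }, add_scalar},
    };

    float a[4] = {1, 2, 3, 4}, b[4] = {4, 3, 2, 1}, dst[4];
    if (const KernelEntry *uk = pick_kernel(table, SelectorData{true, false}))
    {
        uk->ukernel(a, b, dst, 4);
        std::printf("%s -> %g %g %g %g\n", uk->name, dst[0], dst[1], dst[2], dst[3]);
    }
    return 0;
}

In the file above, get_available_kernels() then flattens the per-operation template instantiations (available_kernels_arithmetic<ADD>, <SUB>, ...) into one such table via std::move and std::back_inserter; the lookup itself is the same first-match scan.
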
diff --git a/src/cpu/kernels/CpuElementwiseKernel.h b/src/cpu/kernels/CpuElementwiseKernel.h
index 634e38b..1f3e613 100644
--- a/src/cpu/kernels/CpuElementwiseKernel.h
+++ b/src/cpu/kernels/CpuElementwiseKernel.h
@@ -43,7 +43,8 @@
 class CpuElementwiseKernel : public ICpuKernel<Derived>
 {
 private:
-    using ElementwiseKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
+    using ElementwiseKernelPtr =
+        std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &)>::type;
 
 public:
     CpuElementwiseKernel() = default;
@@ -72,7 +73,7 @@
     static Status validate_arguments_common(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst);
 
 protected:
-    ElementwiseKernelPtr _run_method{ nullptr };
+    ElementwiseKernelPtr _run_method{nullptr};
     std::string          _name{};
 };
 
@@ -96,7 +97,8 @@
      *
      * @return a status
      */
-    static Status validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+    static Status
+    validate(ArithmeticOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 
     static const std::vector<CpuElementwiseKernel<CpuArithmeticKernel>::ElementwiseKernel> &get_available_kernels();
 
@@ -200,7 +202,8 @@
      *
      * @return a status
      */
-    static Status validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
+    static Status
+    validate(ComparisonOperation op, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 
     static const std::vector<CpuElementwiseKernel<CpuComparisonKernel>::ElementwiseKernel> &get_available_kernels();
 
@@ -226,4 +229,4 @@
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_KERNEL_H */
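
A side note on the private alias reformatted above: ElementwiseKernelPtr is spelled with std::add_pointer<Signature>::type, which names exactly the same type as the hand-written function-pointer form. This tiny sketch (illustrative names only) checks the equivalence at compile time.

#include <type_traits>

// std::add_pointer<Sig>::type, as used for ElementwiseKernelPtr above, names
// the same type as the hand-written function-pointer spelling.
using FnPtrA = std::add_pointer<void(const int *, int *)>::type;
using FnPtrB = void (*)(const int *, int *);
static_assert(std::is_same<FnPtrA, FnPtrB>::value, "identical function-pointer types");

int main()
{
    return 0;
}
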
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
index 04a7f15..88545ee 100644
--- a/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.cpp
@@ -28,8 +28,9 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/elementwise_unary/list.h"
@@ -59,12 +60,13 @@
     const auto dst_min_fp = (((is_signed) ? -128 : 0) - dst_qi.offset) * dst_qi.scale;
     const auto dst_max_fp = (((is_signed) ? 127 : 255) - dst_qi.offset) * dst_qi.scale;
 
-    for(int i = 0; i < 256; ++i)
+    for (int i = 0; i < 256; ++i)
     {
-        const auto in     = (is_signed) ? dequantize_qasymm8_signed(static_cast<int8_t>(i), src_qi) : dequantize_qasymm8(i, src_qi);
-        float      result = 0;
+        const auto in =
+            (is_signed) ? dequantize_qasymm8_signed(static_cast<int8_t>(i), src_qi) : dequantize_qasymm8(i, src_qi);
+        float result = 0;
 
-        switch(op)
+        switch (op)
         {
             case ElementWiseUnary::RSQRT:
                 result = 1 / sqrt(in);
@@ -100,7 +102,8 @@
 
         result = utility::clamp(result, dst_min_fp, dst_max_fp);
 
-        const auto out = (is_signed) ? static_cast<uint8_t>(quantize_qasymm8_signed(result, dst_qi)) : quantize_qasymm8(result, dst_qi);
+        const auto out = (is_signed) ? static_cast<uint8_t>(quantize_qasymm8_signed(result, dst_qi))
+                                     : quantize_qasymm8(result, dst_qi);
         lut[i]         = out;
     }
 
@@ -109,97 +112,68 @@
 
 #endif // __aarch64__
 
-static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> available_kernels =
-{
+static const std::vector<CpuElementwiseUnaryKernel::ElementwiseUnaryKernel> available_kernels = {
     {
         "sve_fp32_elementwise_unary",
-        [](const DataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::F32 && data.isa.sve);
-        },
+        [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32 && data.isa.sve); },
         REGISTER_FP32_SVE(sve_fp32_elementwise_unary),
         nullptr,
     },
     {
         "sve_fp16_elementwise_unary",
-        [](const DataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16);
-        },
+        [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.sve && data.isa.fp16); },
         REGISTER_FP16_SVE(sve_fp16_elementwise_unary),
         nullptr,
     },
     {
         "sve_s32_elementwise_unary",
-        [](const DataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::S32 && data.isa.sve);
-        },
+        [](const DataTypeISASelectorData &data) { return (data.dt == DataType::S32 && data.isa.sve); },
         REGISTER_INTEGER_SVE(sve_s32_elementwise_unary),
         nullptr,
     },
     {
         "neon_fp32_elementwise_unary",
-        [](const DataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::F32;
-        },
+        [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
         REGISTER_FP32_NEON(neon_fp32_elementwise_unary),
         nullptr,
     },
     {
         "neon_fp16_elementwise_unary",
-        [](const DataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::F16 && data.isa.fp16;
-        },
+        [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
         REGISTER_FP16_NEON(neon_fp16_elementwise_unary),
         nullptr,
     },
     {
         "neon_s32_elementwise_unary",
-        [](const DataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::S32;
-        },
+        [](const DataTypeISASelectorData &data) { return data.dt == DataType::S32; },
         REGISTER_INTEGER_NEON(neon_s32_elementwise_unary),
         nullptr,
     },
 #ifdef __aarch64__
     {
         "sve2_q8_elementwise_unary",
-        [](const DataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2;
-        },
+        [](const DataTypeISASelectorData &data)
+        { return (data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
         REGISTER_QASYMM8_SVE2(sve2_q8_elementwise_unary),
         &q8_prepare_lut,
     },
     {
         "neon_q8_elementwise_unary",
-        [](const DataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED;
-        },
+        [](const DataTypeISASelectorData &data)
+        { return data.dt == DataType::QASYMM8 || data.dt == DataType::QASYMM8_SIGNED; },
         REGISTER_QASYMM8_NEON(neon_q8_elementwise_unary),
         &q8_prepare_lut,
     },
 #else  // __aarch64__
     {
         "neon_qasymm8_signed_elementwise_unary",
-        [](const DataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::QASYMM8_SIGNED;
-        },
+        [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
         REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_signed_elementwise_unary),
         nullptr,
     },
     {
         "neon_qasymm8_elementwise_unary",
-        [](const DataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::QASYMM8;
-        },
+        [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
         REGISTER_QASYMM8_NEON(neon_qasymm8_elementwise_unary),
         nullptr,
     },
@@ -211,7 +185,8 @@
 void CpuElementwiseUnaryKernel::configure(ElementWiseUnary op, const ITensorInfo &src, ITensorInfo &dst)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate(op, src, dst));
-    const auto uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() });
+    const auto uk = CpuElementwiseUnaryKernel::get_implementation(
+        DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     _op         = op;
@@ -219,12 +194,12 @@
     _name       = std::string("CpuElementwiseUnaryKernel").append("/").append(uk->name);
 
     // If the input shape is dynamic, expect a configured window and dst at run-time.
-    if(src.is_dynamic())
+    if (src.is_dynamic())
     {
         return;
     }
 
-    if(uk->prepare_func != nullptr)
+    if (uk->prepare_func != nullptr)
     {
         _lut = uk->prepare_func(op, &src, &dst);
     }
@@ -238,28 +213,31 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
 
-    const auto *uk = CpuElementwiseUnaryKernel::get_implementation(DataTypeISASelectorData{ src.data_type(), CPUInfo::get().get_isa() });
+    const auto *uk = CpuElementwiseUnaryKernel::get_implementation(
+        DataTypeISASelectorData{src.data_type(), CPUInfo::get().get_isa()});
 
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
-    switch(op)
+    switch (op)
     {
         case ElementWiseUnary::EXP:
         case ElementWiseUnary::RSQRT:
         case ElementWiseUnary::LOG:
         case ElementWiseUnary::ROUND:
         case ElementWiseUnary::SIN:
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32,
+                                                                 DataType::QASYMM8, DataType::QASYMM8_SIGNED);
             break;
         case ElementWiseUnary::NEG:
         case ElementWiseUnary::ABS:
-            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32,
+                                                                 DataType::QASYMM8, DataType::QASYMM8_SIGNED);
             break;
         default:
             ARM_COMPUTE_ERROR("ElementWiseUnary operation not supported");
     }
     // Validate in case of configured dst
-    if(dst.total_size() > 0)
+    if (dst.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
     }
diff --git a/src/cpu/kernels/CpuElementwiseUnaryKernel.h b/src/cpu/kernels/CpuElementwiseUnaryKernel.h
index 00188f0..2499098 100644
--- a/src/cpu/kernels/CpuElementwiseUnaryKernel.h
+++ b/src/cpu/kernels/CpuElementwiseUnaryKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_KERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -42,8 +43,10 @@
 class CpuElementwiseUnaryKernel : public ICpuKernel<CpuElementwiseUnaryKernel>
 {
 private:
-    using ElementwiseUnaryUkernelPtr = std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary, const uint8_t *)>::type;
-    using ElementwiseUnaryPreparePtr = std::add_pointer<std::unique_ptr<uint8_t[]>(ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type;
+    using ElementwiseUnaryUkernelPtr =
+        std::add_pointer<void(const ITensor *, ITensor *, const Window &, ElementWiseUnary, const uint8_t *)>::type;
+    using ElementwiseUnaryPreparePtr = std::add_pointer<std::unique_ptr<uint8_t[]>(
+        ElementWiseUnary op, const ITensorInfo *, const ITensorInfo *)>::type;
 
 public:
     CpuElementwiseUnaryKernel() = default;
@@ -65,7 +68,7 @@
     static Status validate(ElementWiseUnary op, const ITensorInfo &src, const ITensorInfo &dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     struct ElementwiseUnaryKernel
@@ -80,7 +83,7 @@
 
 private:
     ElementWiseUnary           _op{};
-    ElementwiseUnaryUkernelPtr _run_method{ nullptr };
+    ElementwiseUnaryUkernelPtr _run_method{nullptr};
     std::string                _name{};
     std::unique_ptr<uint8_t[]> _lut{};
 };
diff --git a/src/cpu/kernels/CpuFillKernel.cpp b/src/cpu/kernels/CpuFillKernel.cpp
index f69de00..754da97 100644
--- a/src/cpu/kernels/CpuFillKernel.cpp
+++ b/src/cpu/kernels/CpuFillKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -68,17 +69,18 @@
     collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     Iterator tensor_it(inout, collapsed);
-    execute_window_loop(collapsed, [&](const Coordinates &)
-    {
-        uint8_t *base_addr = start_valid_region + tensor_it.offset();
-        // Set memory
-        for(int i = 0; i < window_width; ++i)
+    execute_window_loop(
+        collapsed,
+        [&](const Coordinates &)
         {
-            std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size);
-        }
-
-    },
-    tensor_it);
+            uint8_t *base_addr = start_valid_region + tensor_it.offset();
+            // Set memory
+            for (int i = 0; i < window_width; ++i)
+            {
+                std::memcpy(base_addr + i * element_size, &_constant_value.value, element_size);
+            }
+        },
+        tensor_it);
 }
 
 const char *CpuFillKernel::name() const
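
The fill kernel's reformatted lambda above copies one constant element into every slot of a window row via std::memcpy. Stripped of ACL's Iterator/Window machinery, the per-row work amounts to this sketch (fill_row and the raw-buffer layout are illustrative assumptions):

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstring>

// Per-row body of the fill lambda above, without ACL's Iterator/Window
// machinery: copy one constant element into every slot of the row.
void fill_row(uint8_t *base_addr, const void *constant_value, std::size_t element_size, int window_width)
{
    for (int i = 0; i < window_width; ++i)
    {
        std::memcpy(base_addr + i * element_size, constant_value, element_size);
    }
}

int main()
{
    float       row[8];
    const float value = 3.5f;
    fill_row(reinterpret_cast<uint8_t *>(row), &value, sizeof(float), 8);
    std::printf("%g .. %g\n", row[0], row[7]); // 3.5 .. 3.5
    return 0;
}
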
diff --git a/src/cpu/kernels/CpuFillKernel.h b/src/cpu/kernels/CpuFillKernel.h
index ce41afc..7c200c9 100644
--- a/src/cpu/kernels/CpuFillKernel.h
+++ b/src/cpu/kernels/CpuFillKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_FILL_KERNEL_H
 
 #include "arm_compute/core/PixelValue.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -48,7 +49,7 @@
     void configure(const ITensorInfo *tensor, const PixelValue &constant_value);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
diff --git a/src/cpu/kernels/CpuFloorKernel.cpp b/src/cpu/kernels/CpuFloorKernel.cpp
index 65e390a..df7e6aa 100644
--- a/src/cpu/kernels/CpuFloorKernel.cpp
+++ b/src/cpu/kernels/CpuFloorKernel.cpp
@@ -27,11 +27,11 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
+
+#include "src/core/common/Registrars.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
 #include "src/cpu/kernels/floor/list.h"
 
 namespace arm_compute
@@ -42,29 +42,22 @@
 {
 namespace
 {
-static const std::vector<CpuFloorKernel::FloorKernel> available_kernels =
-{
-    {
-        "neon_fp16_floor",
-        [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)
-    },
-    {
-        "neon_fp32_floor",
-        [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)
-    }
-};
+static const std::vector<CpuFloorKernel::FloorKernel> available_kernels = {
+    {"neon_fp16_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_floor)},
+    {"neon_fp32_floor", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+     REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_floor)}};
 
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
 
-    const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+    const auto *uk =
+        CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     // Validate in case of configured output
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -81,7 +74,8 @@
 
     auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type());
 
-    const auto *uk = CpuFloorKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+    const auto *uk =
+        CpuFloorKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
 
     _run_method = uk->ukernel;
@@ -122,17 +116,14 @@
     ITensor       *dst = tensors.get_tensor(TensorType::ACL_DST);
     const auto     len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     Iterator src_it(src, win);
     Iterator dst_it(dst, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        _run_method(src_it.ptr(), dst_it.ptr(), len);
-    },
-    src_it, dst_it);
+    execute_window_loop(
+        win, [&](const Coordinates &) { _run_method(src_it.ptr(), dst_it.ptr(), len); }, src_it, dst_it);
 }
 
 const char *CpuFloorKernel::name() const
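
run_op above collapses DimX so the window loop fires once per row, then forwards each contiguous row of len elements to the stored _run_method pointer. A plain-buffer sketch of that driver shape, with a hypothetical RowKernel type standing in for FloorKernelPtr:

#include <cmath>
#include <cstdio>

// The driver collapses the window to one invocation per row, then hands each
// contiguous row of `len` elements to the stored row kernel.
using RowKernel = void (*)(const float *, float *, int);

void floor_row(const float *src, float *dst, int len)
{
    for (int i = 0; i < len; ++i)
    {
        dst[i] = std::floor(src[i]);
    }
}

void run(const float *src, float *dst, int rows, int len, RowKernel run_method)
{
    for (int y = 0; y < rows; ++y) // one lambda invocation per row in ACL
    {
        run_method(src + y * len, dst + y * len, len);
    }
}

int main()
{
    const float src[2][3] = {{1.5f, -0.5f, 2.9f}, {0.1f, -1.1f, 3.0f}};
    float       dst[2][3];
    run(&src[0][0], &dst[0][0], 2, 3, floor_row);
    std::printf("%g %g %g\n", dst[0][0], dst[0][1], dst[1][1]); // 1 -1 -2
    return 0;
}
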
diff --git a/src/cpu/kernels/CpuFloorKernel.h b/src/cpu/kernels/CpuFloorKernel.h
index 35ab534..57107d0 100644
--- a/src/cpu/kernels/CpuFloorKernel.h
+++ b/src/cpu/kernels/CpuFloorKernel.h
@@ -65,7 +65,7 @@
     Window infer_window(const ITensorInfo *src, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     struct FloorKernel
@@ -78,7 +78,7 @@
     static const std::vector<FloorKernel> &get_available_kernels();
 
 private:
-    FloorKernelPtr _run_method{ nullptr };
+    FloorKernelPtr _run_method{nullptr};
     std::string    _name{};
 };
 } // namespace kernels
diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
index 9fbf2d5..db433c9 100644
--- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
+++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.cpp
@@ -24,9 +24,10 @@
 #include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
 
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -60,7 +61,7 @@
     // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
     ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorShape dst_shape = compute_interleaved_shape(*src);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
@@ -111,35 +112,42 @@
     Iterator in(src, win);
     Iterator out(dst, win_out);
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        if(id.y() + 4 <= static_cast<int>(in_height))
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            for(size_t x = window_start_x; x < window_end_x; ++x)
+            if (id.y() + 4 <= static_cast<int>(in_height))
             {
-                std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size, element_size);
-                std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size, element_size);
-                std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size, element_size);
-                std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size, element_size);
-            }
-        }
-        else
-        {
-            for(size_t x = window_start_x; x < window_end_x; ++x)
-            {
-                size_t y = 0;
-                for(; y < partial_y; ++y)
+                for (size_t x = window_start_x; x < window_end_x; ++x)
                 {
-                    std::memcpy(out.ptr() + (x * 4 + y) * element_size, (in.ptr() + y * in_stride) + x * element_size, element_size);
-                }
-                for(; y < 4; ++y)
-                {
-                    std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size);
+                    std::memcpy(out.ptr() + (x * 4 + 0) * element_size, (in.ptr() + 0 * in_stride) + x * element_size,
+                                element_size);
+                    std::memcpy(out.ptr() + (x * 4 + 1) * element_size, (in.ptr() + 1 * in_stride) + x * element_size,
+                                element_size);
+                    std::memcpy(out.ptr() + (x * 4 + 2) * element_size, (in.ptr() + 2 * in_stride) + x * element_size,
+                                element_size);
+                    std::memcpy(out.ptr() + (x * 4 + 3) * element_size, (in.ptr() + 3 * in_stride) + x * element_size,
+                                element_size);
                 }
             }
-        }
-    },
-    in, out);
+            else
+            {
+                for (size_t x = window_start_x; x < window_end_x; ++x)
+                {
+                    size_t y = 0;
+                    for (; y < partial_y; ++y)
+                    {
+                        std::memcpy(out.ptr() + (x * 4 + y) * element_size,
+                                    (in.ptr() + y * in_stride) + x * element_size, element_size);
+                    }
+                    for (; y < 4; ++y)
+                    {
+                        std::memset(out.ptr() + (x * 4 + y) * element_size, 0, element_size);
+                    }
+                }
+            }
+        },
+        in, out);
 }
 
 const char *CpuGemmInterleave4x4Kernel::name() const
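
The interleave hunk above reflows the 4x4 interleaving loops: groups of four source rows are woven column by column into one output row (hence the (x * 4 + y) * element_size offsets), and rows past the end of the input are zero-padded. A minimal sketch of that layout on an int matrix, with illustrative names:

#include <cstdio>
#include <vector>

// GEMM interleaving as in the kernel above: groups of 4 source rows are
// woven column-by-column into one output row, so the matrix-multiply inner
// loop can read 4 values of a column contiguously. Rows past the end of the
// input are padded with zeros.
std::vector<int> interleave_4x4(const std::vector<int> &src, int rows, int cols)
{
    const int        groups = (rows + 3) / 4;
    std::vector<int> dst(static_cast<size_t>(groups) * 4 * cols, 0);
    for (int g = 0; g < groups; ++g)
    {
        for (int x = 0; x < cols; ++x)
        {
            for (int y = 0; y < 4; ++y)
            {
                const int row = g * 4 + y;
                if (row < rows)
                {
                    dst[(g * cols + x) * 4 + y] = src[row * cols + x];
                }
            }
        }
    }
    return dst;
}

int main()
{
    // 2x3 input, so the single group of 4 rows is half padding.
    const std::vector<int> src = {1, 2, 3,
                                  4, 5, 6};
    const std::vector<int> out = interleave_4x4(src, 2, 3);
    for (int v : out)
    {
        std::printf("%d ", v); // 1 4 0 0 2 5 0 0 3 6 0 0
    }
    std::printf("\n");
    return 0;
}
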
diff --git a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
index 4fb6a52..2ce34bc 100644
--- a/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
+++ b/src/cpu/kernels/CpuGemmInterleave4x4Kernel.h
@@ -71,7 +71,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 };
 } // namespace kernels
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
index f8bef64..a3ed2cd 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -44,646 +45,494 @@
 {
 namespace
 {
-void inline vector_matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
+void inline vector_matrix_multiply_u8(Iterator     &ina,
+                                      Iterator     &inb,
+                                      Iterator     &out,
+                                      int           width_a,
+                                      int           width_b,
+                                      int           width_out,
+                                      size_t        stride_b,
+                                      const Window &window)
 {
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        if(id.x() > width_b)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            return;
-        }
-
-        // Note: Since the input are all positives, we can use uint32_t
-        // Accumulators for the block 0
-        uint32x4x4_t c0 =
-        {
+            if (id.x() > width_b)
             {
-                vdupq_n_u32(0),
-                vdupq_n_u32(0),
-                vdupq_n_u32(0),
-                vdupq_n_u32(0)
+                return;
             }
-        };
 
-        auto vec_a          = reinterpret_cast<const uint8_t *>(ina.ptr());
-        auto matrix_b       = reinterpret_cast<const uint8_t *>(inb.ptr());
-        auto vec_a_end_addr = vec_a + width_a;
+            // Note: Since the input are all positives, we can use uint32_t
+            // Accumulators for the block 0
+            uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
 
-        // This for loop performs 8 accumulations
-        for(; vec_a <= (vec_a_end_addr - 8);)
-        {
-            const uint8x8_t  a00_u8 = vld1_u8(vec_a);
-            const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b);
-            const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b);
-            const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b);
-            const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b);
-            const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b);
-            const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b);
-            const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b);
-            const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b);
+            auto vec_a          = reinterpret_cast<const uint8_t *>(ina.ptr());
+            auto matrix_b       = reinterpret_cast<const uint8_t *>(inb.ptr());
+            auto vec_a_end_addr = vec_a + width_a;
 
-            // Convert a00_u8 to uint16_t and get the lower part
-            const uint16x4x2_t a00_u16 =
+            // This for loop performs 8 accumulations
+            for (; vec_a <= (vec_a_end_addr - 8);)
             {
-                {
-                    vget_low_u16(vmovl_u8(a00_u8)),
-                    vget_high_u16(vmovl_u8(a00_u8))
-                }
-            };
+                const uint8x8_t  a00_u8 = vld1_u8(vec_a);
+                const uint8x16_t b00_u8 = vld1q_u8(matrix_b + 0 * stride_b);
+                const uint8x16_t b10_u8 = vld1q_u8(matrix_b + 1 * stride_b);
+                const uint8x16_t b20_u8 = vld1q_u8(matrix_b + 2 * stride_b);
+                const uint8x16_t b30_u8 = vld1q_u8(matrix_b + 3 * stride_b);
+                const uint8x16_t b40_u8 = vld1q_u8(matrix_b + 4 * stride_b);
+                const uint8x16_t b50_u8 = vld1q_u8(matrix_b + 5 * stride_b);
+                const uint8x16_t b60_u8 = vld1q_u8(matrix_b + 6 * stride_b);
+                const uint8x16_t b70_u8 = vld1q_u8(matrix_b + 7 * stride_b);
 
-            const uint16x4x4_t b00_u16 =
+                // Convert a00_u8 to uint16_t and get the lower part
+                const uint16x4x2_t a00_u16 = {{vget_low_u16(vmovl_u8(a00_u8)), vget_high_u16(vmovl_u8(a00_u8))}};
+
+                const uint16x4x4_t b00_u16 = {
+                    {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+                     vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}};
+
+                const uint16x4x4_t b10_u16 = {
+                    {vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))),
+                     vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))}};
+
+                const uint16x4x4_t b20_u16 = {
+                    {vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))),
+                     vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))}};
+
+                const uint16x4x4_t b30_u16 = {
+                    {vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))),
+                     vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))}};
+
+                const uint16x4x4_t b40_u16 = {
+                    {vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))),
+                     vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))}};
+
+                const uint16x4x4_t b50_u16 = {
+                    {vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))),
+                     vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))}};
+
+                const uint16x4x4_t b60_u16 = {
+                    {vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))),
+                     vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))}};
+
+                const uint16x4x4_t b70_u16 = {
+                    {vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))),
+                     vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))}};
+
+                // Accumulate 0:
+                c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0);
+                c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0);
+                c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0);
+                c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0);
+
+                // Accumulate 1:
+                c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1);
+                c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1);
+                c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1);
+                c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1);
+
+                // Accumulate 2:
+                c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2);
+                c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2);
+                c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2);
+                c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2);
+
+                // Accumulate 3:
+                c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3);
+                c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3);
+                c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3);
+                c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3);
+
+                // Accumulate 4:
+                c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0);
+                c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0);
+                c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0);
+                c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0);
+
+                // Accumulate 5:
+                c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1);
+                c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1);
+                c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1);
+                c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1);
+
+                // Accumulate 6:
+                c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2);
+                c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2);
+                c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2);
+                c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2);
+
+                // Accumulate 7:
+                c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3);
+                c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3);
+                c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3);
+                c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3);
+
+                vec_a += 8;
+                matrix_b += 8 * stride_b;
+            }
+
+            // This for loop performs the left-over accumulations
+            for (; vec_a < vec_a_end_addr;)
             {
-                {
-                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
-                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
-                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
-                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
-                }
-            };
+                const uint8x8_t  a00_u8 = vld1_dup_u8(vec_a);
+                const uint8x16_t b00_u8 = vld1q_u8(matrix_b);
 
-            const uint16x4x4_t b10_u16 =
+                const uint16x4x4_t b00_u16 = {
+                    {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+                     vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}};
+
+                // Convert a00_u8 to uint16_t and get the lower part
+                const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
+
+                // Accumulate 0:
+                c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
+                c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
+                c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
+                c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
+
+                vec_a += 1;
+                matrix_b += stride_b;
+            }
+
+            auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+            if (id.x() < (width_out - 16))
             {
-                {
-                    vget_low_u16(vmovl_u8(vget_low_u8(b10_u8))),
-                    vget_high_u16(vmovl_u8(vget_low_u8(b10_u8))),
-                    vget_low_u16(vmovl_u8(vget_high_u8(b10_u8))),
-                    vget_high_u16(vmovl_u8(vget_high_u8(b10_u8)))
-                }
-            };
-
-            const uint16x4x4_t b20_u16 =
+                vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
+                vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
+                vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
+                vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
+            }
+            else
             {
+                auto left_over = width_out - id.x();
+                for (auto k = 0; k < 4 && left_over; ++k)
                 {
-                    vget_low_u16(vmovl_u8(vget_low_u8(b20_u8))),
-                    vget_high_u16(vmovl_u8(vget_low_u8(b20_u8))),
-                    vget_low_u16(vmovl_u8(vget_high_u8(b20_u8))),
-                    vget_high_u16(vmovl_u8(vget_high_u8(b20_u8)))
-                }
-            };
-
-            const uint16x4x4_t b30_u16 =
-            {
-                {
-                    vget_low_u16(vmovl_u8(vget_low_u8(b30_u8))),
-                    vget_high_u16(vmovl_u8(vget_low_u8(b30_u8))),
-                    vget_low_u16(vmovl_u8(vget_high_u8(b30_u8))),
-                    vget_high_u16(vmovl_u8(vget_high_u8(b30_u8)))
-                }
-            };
-
-            const uint16x4x4_t b40_u16 =
-            {
-                {
-                    vget_low_u16(vmovl_u8(vget_low_u8(b40_u8))),
-                    vget_high_u16(vmovl_u8(vget_low_u8(b40_u8))),
-                    vget_low_u16(vmovl_u8(vget_high_u8(b40_u8))),
-                    vget_high_u16(vmovl_u8(vget_high_u8(b40_u8)))
-                }
-            };
-
-            const uint16x4x4_t b50_u16 =
-            {
-                {
-                    vget_low_u16(vmovl_u8(vget_low_u8(b50_u8))),
-                    vget_high_u16(vmovl_u8(vget_low_u8(b50_u8))),
-                    vget_low_u16(vmovl_u8(vget_high_u8(b50_u8))),
-                    vget_high_u16(vmovl_u8(vget_high_u8(b50_u8)))
-                }
-            };
-
-            const uint16x4x4_t b60_u16 =
-            {
-                {
-                    vget_low_u16(vmovl_u8(vget_low_u8(b60_u8))),
-                    vget_high_u16(vmovl_u8(vget_low_u8(b60_u8))),
-                    vget_low_u16(vmovl_u8(vget_high_u8(b60_u8))),
-                    vget_high_u16(vmovl_u8(vget_high_u8(b60_u8)))
-                }
-            };
-
-            const uint16x4x4_t b70_u16 =
-            {
-                {
-                    vget_low_u16(vmovl_u8(vget_low_u8(b70_u8))),
-                    vget_high_u16(vmovl_u8(vget_low_u8(b70_u8))),
-                    vget_low_u16(vmovl_u8(vget_high_u8(b70_u8))),
-                    vget_high_u16(vmovl_u8(vget_high_u8(b70_u8)))
-                }
-            };
-
-            // Accumulate 0:
-            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16.val[0], 0);
-            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16.val[0], 0);
-            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16.val[0], 0);
-            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16.val[0], 0);
-
-            // Accumulate 1:
-            c0.val[0] = vmlal_lane_u16(c0.val[0], b10_u16.val[0], a00_u16.val[0], 1);
-            c0.val[1] = vmlal_lane_u16(c0.val[1], b10_u16.val[1], a00_u16.val[0], 1);
-            c0.val[2] = vmlal_lane_u16(c0.val[2], b10_u16.val[2], a00_u16.val[0], 1);
-            c0.val[3] = vmlal_lane_u16(c0.val[3], b10_u16.val[3], a00_u16.val[0], 1);
-
-            // Accumulate 2:
-            c0.val[0] = vmlal_lane_u16(c0.val[0], b20_u16.val[0], a00_u16.val[0], 2);
-            c0.val[1] = vmlal_lane_u16(c0.val[1], b20_u16.val[1], a00_u16.val[0], 2);
-            c0.val[2] = vmlal_lane_u16(c0.val[2], b20_u16.val[2], a00_u16.val[0], 2);
-            c0.val[3] = vmlal_lane_u16(c0.val[3], b20_u16.val[3], a00_u16.val[0], 2);
-
-            // Accumulate 3:
-            c0.val[0] = vmlal_lane_u16(c0.val[0], b30_u16.val[0], a00_u16.val[0], 3);
-            c0.val[1] = vmlal_lane_u16(c0.val[1], b30_u16.val[1], a00_u16.val[0], 3);
-            c0.val[2] = vmlal_lane_u16(c0.val[2], b30_u16.val[2], a00_u16.val[0], 3);
-            c0.val[3] = vmlal_lane_u16(c0.val[3], b30_u16.val[3], a00_u16.val[0], 3);
-
-            // Accumulate 4:
-            c0.val[0] = vmlal_lane_u16(c0.val[0], b40_u16.val[0], a00_u16.val[1], 0);
-            c0.val[1] = vmlal_lane_u16(c0.val[1], b40_u16.val[1], a00_u16.val[1], 0);
-            c0.val[2] = vmlal_lane_u16(c0.val[2], b40_u16.val[2], a00_u16.val[1], 0);
-            c0.val[3] = vmlal_lane_u16(c0.val[3], b40_u16.val[3], a00_u16.val[1], 0);
-
-            // Accumulate 5:
-            c0.val[0] = vmlal_lane_u16(c0.val[0], b50_u16.val[0], a00_u16.val[1], 1);
-            c0.val[1] = vmlal_lane_u16(c0.val[1], b50_u16.val[1], a00_u16.val[1], 1);
-            c0.val[2] = vmlal_lane_u16(c0.val[2], b50_u16.val[2], a00_u16.val[1], 1);
-            c0.val[3] = vmlal_lane_u16(c0.val[3], b50_u16.val[3], a00_u16.val[1], 1);
-
-            // Accumulate 6:
-            c0.val[0] = vmlal_lane_u16(c0.val[0], b60_u16.val[0], a00_u16.val[1], 2);
-            c0.val[1] = vmlal_lane_u16(c0.val[1], b60_u16.val[1], a00_u16.val[1], 2);
-            c0.val[2] = vmlal_lane_u16(c0.val[2], b60_u16.val[2], a00_u16.val[1], 2);
-            c0.val[3] = vmlal_lane_u16(c0.val[3], b60_u16.val[3], a00_u16.val[1], 2);
-
-            // Accumulate 7:
-            c0.val[0] = vmlal_lane_u16(c0.val[0], b70_u16.val[0], a00_u16.val[1], 3);
-            c0.val[1] = vmlal_lane_u16(c0.val[1], b70_u16.val[1], a00_u16.val[1], 3);
-            c0.val[2] = vmlal_lane_u16(c0.val[2], b70_u16.val[2], a00_u16.val[1], 3);
-            c0.val[3] = vmlal_lane_u16(c0.val[3], b70_u16.val[3], a00_u16.val[1], 3);
-
-            vec_a += 8;
-            matrix_b += 8 * stride_b;
-        }
-
-        // This for loop performs the left-over accumulations
-        for(; vec_a < vec_a_end_addr;)
-        {
-            const uint8x8_t  a00_u8 = vld1_dup_u8(vec_a);
-            const uint8x16_t b00_u8 = vld1q_u8(matrix_b);
-
-            const uint16x4x4_t b00_u16 =
-            {
-                {
-                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
-                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
-                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
-                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
-                }
-            };
-
-            // Convert a00_u8 to uint16_t and get the lower part
-            const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
-
-            // Accumulate 0:
-            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
-            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
-            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
-            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
-
-            vec_a += 1;
-            matrix_b += stride_b;
-        }
-
-        auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
-        if(id.x() < (width_out - 16))
-        {
-            vst1q_s32(vec_out + 0, vreinterpretq_s32_u32(c0.val[0]));
-            vst1q_s32(vec_out + 4, vreinterpretq_s32_u32(c0.val[1]));
-            vst1q_s32(vec_out + 8, vreinterpretq_s32_u32(c0.val[2]));
-            vst1q_s32(vec_out + 12, vreinterpretq_s32_u32(c0.val[3]));
-        }
-        else
-        {
-            auto left_over = width_out - id.x();
-            for(auto k = 0; k < 4 && left_over; ++k)
-            {
-                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
-                {
-                    *(vec_out + k * 4 + j) = c0.val[k][j];
+                    for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+                    {
+                        *(vec_out + k * 4 + j) = c0.val[k][j];
+                    }
                 }
             }
-        }
-    },
-    ina, inb, out);
+        },
+        ina, inb, out);
 }
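
For readers tracing the intrinsics, the vmlal_lane_u16 chains above amount to sixteen widening
dot products per window position: out[j] accumulates a[i] * b[i][j] over the A vector. A minimal
scalar sketch of that contract (the free function and its name are illustrative, not part of
this patch):

    #include <cstddef>
    #include <cstdint>

    // Scalar reference for one window position of the u8 vector-matrix path:
    // sixteen columns of B, each a widening dot product with the A vector.
    void vector_matrix_multiply_u8_ref(
        const uint8_t *vec_a, const uint8_t *matrix_b, int32_t *out, int width_a, size_t stride_b)
    {
        for (int j = 0; j < 16; ++j)
        {
            uint32_t acc = 0; // mirrors one lane of the c0 accumulators
            for (int i = 0; i < width_a; ++i)
            {
                acc += static_cast<uint32_t>(vec_a[i]) * static_cast<uint32_t>(matrix_b[i * stride_b + j]);
            }
            out[j] = static_cast<int32_t>(acc); // same bit pattern as vreinterpretq_s32_u32
        }
    }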
 
-void inline vector_matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_a, int width_b, int width_out, size_t stride_b, const Window &window)
+void inline vector_matrix_multiply_s8(Iterator     &ina,
+                                      Iterator     &inb,
+                                      Iterator     &out,
+                                      int           width_a,
+                                      int           width_b,
+                                      int           width_out,
+                                      size_t        stride_b,
+                                      const Window &window)
 {
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        if(id.x() > width_b)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            return;
-        }
-
-        // Accumulators for the block 0
-        int32x4x4_t c0 =
-        {
+            if (id.x() > width_b)
             {
-                vdupq_n_s32(0),
-                vdupq_n_s32(0),
-                vdupq_n_s32(0),
-                vdupq_n_s32(0)
+                return;
             }
-        };
 
-        auto vec_a          = reinterpret_cast<const int8_t *>(ina.ptr());
-        auto matrix_b       = reinterpret_cast<const int8_t *>(inb.ptr());
-        auto vec_a_end_addr = vec_a + width_a;
+            // Accumulators for the block 0
+            int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
 
-        // This for loop performs 8 accumulations
-        for(; vec_a <= (vec_a_end_addr - 8);)
-        {
-            const int8x8_t  a00_s8 = vld1_s8(vec_a);
-            const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b);
-            const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b);
-            const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b);
-            const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b);
-            const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b);
-            const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b);
-            const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b);
-            const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b);
+            auto vec_a          = reinterpret_cast<const int8_t *>(ina.ptr());
+            auto matrix_b       = reinterpret_cast<const int8_t *>(inb.ptr());
+            auto vec_a_end_addr = vec_a + width_a;
 
-            // Convert a00_s8 to int16_t and get the lower part
-            const int16x4x2_t a00_s16 =
+            // This for loop performs 8 accumulations per iteration
+            for (; vec_a <= (vec_a_end_addr - 8);)
             {
-                {
-                    vget_low_s16(vmovl_s8(a00_s8)),
-                    vget_high_s16(vmovl_s8(a00_s8))
-                }
-            };
+                const int8x8_t  a00_s8 = vld1_s8(vec_a);
+                const int8x16_t b00_s8 = vld1q_s8(matrix_b + 0 * stride_b);
+                const int8x16_t b10_s8 = vld1q_s8(matrix_b + 1 * stride_b);
+                const int8x16_t b20_s8 = vld1q_s8(matrix_b + 2 * stride_b);
+                const int8x16_t b30_s8 = vld1q_s8(matrix_b + 3 * stride_b);
+                const int8x16_t b40_s8 = vld1q_s8(matrix_b + 4 * stride_b);
+                const int8x16_t b50_s8 = vld1q_s8(matrix_b + 5 * stride_b);
+                const int8x16_t b60_s8 = vld1q_s8(matrix_b + 6 * stride_b);
+                const int8x16_t b70_s8 = vld1q_s8(matrix_b + 7 * stride_b);
 
-            const int16x4x4_t b00_s16 =
+                // Convert a00_s8 to int16_t and get the lower part
+                const int16x4x2_t a00_s16 = {{vget_low_s16(vmovl_s8(a00_s8)), vget_high_s16(vmovl_s8(a00_s8))}};
+
+                const int16x4x4_t b00_s16 = {
+                    {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+                     vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}};
+
+                const int16x4x4_t b10_s16 = {
+                    {vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))),
+                     vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))}};
+
+                const int16x4x4_t b20_s16 = {
+                    {vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))),
+                     vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))}};
+
+                const int16x4x4_t b30_s16 = {
+                    {vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))),
+                     vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))}};
+
+                const int16x4x4_t b40_s16 = {
+                    {vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))),
+                     vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))}};
+
+                const int16x4x4_t b50_s16 = {
+                    {vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))),
+                     vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))}};
+
+                const int16x4x4_t b60_s16 = {
+                    {vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))),
+                     vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))}};
+
+                const int16x4x4_t b70_s16 = {
+                    {vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))),
+                     vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))}};
+
+                // Accumulate 0:
+                c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0);
+                c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0);
+                c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0);
+                c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0);
+
+                // Accumulate 1:
+                c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1);
+                c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1);
+                c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1);
+                c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1);
+
+                // Accumulate 2:
+                c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2);
+                c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2);
+                c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2);
+                c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2);
+
+                // Accumulate 3:
+                c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3);
+                c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3);
+                c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3);
+                c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3);
+
+                // Accumulate 4:
+                c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0);
+                c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0);
+                c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0);
+                c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0);
+
+                // Accumulate 5:
+                c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1);
+                c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1);
+                c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1);
+                c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1);
+
+                // Accumulate 6:
+                c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2);
+                c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2);
+                c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2);
+                c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2);
+
+                // Accumulate 7:
+                c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3);
+                c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3);
+                c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3);
+                c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3);
+
+                vec_a += 8;
+                matrix_b += 8 * stride_b;
+            }
+
+            // This for loop performs the left-over accumulations
+            for (; vec_a < vec_a_end_addr;)
             {
-                {
-                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
-                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
-                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
-                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
-                }
-            };
+                const int8x8_t  a00_s8 = vld1_dup_s8(vec_a);
+                const int8x16_t b00_s8 = vld1q_s8(matrix_b);
 
-            const int16x4x4_t b10_s16 =
+                const int16x4x4_t b00_s16 = {
+                    {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+                     vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}};
+
+                // Convert a00_s8 to int16_t and get the lower part
+                const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
+
+                // Accumulate 0:
+                c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
+                c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
+                c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
+                c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
+
+                vec_a += 1;
+                matrix_b += stride_b;
+            }
+
+            auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
+            if (id.x() < (width_out - 16))
             {
-                {
-                    vget_low_s16(vmovl_s8(vget_low_s8(b10_s8))),
-                    vget_high_s16(vmovl_s8(vget_low_s8(b10_s8))),
-                    vget_low_s16(vmovl_s8(vget_high_s8(b10_s8))),
-                    vget_high_s16(vmovl_s8(vget_high_s8(b10_s8)))
-                }
-            };
-
-            const int16x4x4_t b20_s16 =
+                vst1q_s32(vec_out + 0, c0.val[0]);
+                vst1q_s32(vec_out + 4, c0.val[1]);
+                vst1q_s32(vec_out + 8, c0.val[2]);
+                vst1q_s32(vec_out + 12, c0.val[3]);
+            }
+            else
             {
+                auto left_over = width_out - id.x();
+                for (auto k = 0; k < 4 && left_over; ++k)
                 {
-                    vget_low_s16(vmovl_s8(vget_low_s8(b20_s8))),
-                    vget_high_s16(vmovl_s8(vget_low_s8(b20_s8))),
-                    vget_low_s16(vmovl_s8(vget_high_s8(b20_s8))),
-                    vget_high_s16(vmovl_s8(vget_high_s8(b20_s8)))
-                }
-            };
-
-            const int16x4x4_t b30_s16 =
-            {
-                {
-                    vget_low_s16(vmovl_s8(vget_low_s8(b30_s8))),
-                    vget_high_s16(vmovl_s8(vget_low_s8(b30_s8))),
-                    vget_low_s16(vmovl_s8(vget_high_s8(b30_s8))),
-                    vget_high_s16(vmovl_s8(vget_high_s8(b30_s8)))
-                }
-            };
-
-            const int16x4x4_t b40_s16 =
-            {
-                {
-                    vget_low_s16(vmovl_s8(vget_low_s8(b40_s8))),
-                    vget_high_s16(vmovl_s8(vget_low_s8(b40_s8))),
-                    vget_low_s16(vmovl_s8(vget_high_s8(b40_s8))),
-                    vget_high_s16(vmovl_s8(vget_high_s8(b40_s8)))
-                }
-            };
-
-            const int16x4x4_t b50_s16 =
-            {
-                {
-                    vget_low_s16(vmovl_s8(vget_low_s8(b50_s8))),
-                    vget_high_s16(vmovl_s8(vget_low_s8(b50_s8))),
-                    vget_low_s16(vmovl_s8(vget_high_s8(b50_s8))),
-                    vget_high_s16(vmovl_s8(vget_high_s8(b50_s8)))
-                }
-            };
-
-            const int16x4x4_t b60_s16 =
-            {
-                {
-                    vget_low_s16(vmovl_s8(vget_low_s8(b60_s8))),
-                    vget_high_s16(vmovl_s8(vget_low_s8(b60_s8))),
-                    vget_low_s16(vmovl_s8(vget_high_s8(b60_s8))),
-                    vget_high_s16(vmovl_s8(vget_high_s8(b60_s8)))
-                }
-            };
-
-            const int16x4x4_t b70_s16 =
-            {
-                {
-                    vget_low_s16(vmovl_s8(vget_low_s8(b70_s8))),
-                    vget_high_s16(vmovl_s8(vget_low_s8(b70_s8))),
-                    vget_low_s16(vmovl_s8(vget_high_s8(b70_s8))),
-                    vget_high_s16(vmovl_s8(vget_high_s8(b70_s8)))
-                }
-            };
-
-            // Accumulate 0:
-            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16.val[0], 0);
-            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16.val[0], 0);
-            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16.val[0], 0);
-            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16.val[0], 0);
-
-            // Accumulate 1:
-            c0.val[0] = vmlal_lane_s16(c0.val[0], b10_s16.val[0], a00_s16.val[0], 1);
-            c0.val[1] = vmlal_lane_s16(c0.val[1], b10_s16.val[1], a00_s16.val[0], 1);
-            c0.val[2] = vmlal_lane_s16(c0.val[2], b10_s16.val[2], a00_s16.val[0], 1);
-            c0.val[3] = vmlal_lane_s16(c0.val[3], b10_s16.val[3], a00_s16.val[0], 1);
-
-            // Accumulate 2:
-            c0.val[0] = vmlal_lane_s16(c0.val[0], b20_s16.val[0], a00_s16.val[0], 2);
-            c0.val[1] = vmlal_lane_s16(c0.val[1], b20_s16.val[1], a00_s16.val[0], 2);
-            c0.val[2] = vmlal_lane_s16(c0.val[2], b20_s16.val[2], a00_s16.val[0], 2);
-            c0.val[3] = vmlal_lane_s16(c0.val[3], b20_s16.val[3], a00_s16.val[0], 2);
-
-            // Accumulate 3:
-            c0.val[0] = vmlal_lane_s16(c0.val[0], b30_s16.val[0], a00_s16.val[0], 3);
-            c0.val[1] = vmlal_lane_s16(c0.val[1], b30_s16.val[1], a00_s16.val[0], 3);
-            c0.val[2] = vmlal_lane_s16(c0.val[2], b30_s16.val[2], a00_s16.val[0], 3);
-            c0.val[3] = vmlal_lane_s16(c0.val[3], b30_s16.val[3], a00_s16.val[0], 3);
-
-            // Accumulate 4:
-            c0.val[0] = vmlal_lane_s16(c0.val[0], b40_s16.val[0], a00_s16.val[1], 0);
-            c0.val[1] = vmlal_lane_s16(c0.val[1], b40_s16.val[1], a00_s16.val[1], 0);
-            c0.val[2] = vmlal_lane_s16(c0.val[2], b40_s16.val[2], a00_s16.val[1], 0);
-            c0.val[3] = vmlal_lane_s16(c0.val[3], b40_s16.val[3], a00_s16.val[1], 0);
-
-            // Accumulate 5:
-            c0.val[0] = vmlal_lane_s16(c0.val[0], b50_s16.val[0], a00_s16.val[1], 1);
-            c0.val[1] = vmlal_lane_s16(c0.val[1], b50_s16.val[1], a00_s16.val[1], 1);
-            c0.val[2] = vmlal_lane_s16(c0.val[2], b50_s16.val[2], a00_s16.val[1], 1);
-            c0.val[3] = vmlal_lane_s16(c0.val[3], b50_s16.val[3], a00_s16.val[1], 1);
-
-            // Accumulate 6:
-            c0.val[0] = vmlal_lane_s16(c0.val[0], b60_s16.val[0], a00_s16.val[1], 2);
-            c0.val[1] = vmlal_lane_s16(c0.val[1], b60_s16.val[1], a00_s16.val[1], 2);
-            c0.val[2] = vmlal_lane_s16(c0.val[2], b60_s16.val[2], a00_s16.val[1], 2);
-            c0.val[3] = vmlal_lane_s16(c0.val[3], b60_s16.val[3], a00_s16.val[1], 2);
-
-            // Accumulate 7:
-            c0.val[0] = vmlal_lane_s16(c0.val[0], b70_s16.val[0], a00_s16.val[1], 3);
-            c0.val[1] = vmlal_lane_s16(c0.val[1], b70_s16.val[1], a00_s16.val[1], 3);
-            c0.val[2] = vmlal_lane_s16(c0.val[2], b70_s16.val[2], a00_s16.val[1], 3);
-            c0.val[3] = vmlal_lane_s16(c0.val[3], b70_s16.val[3], a00_s16.val[1], 3);
-
-            vec_a += 8;
-            matrix_b += 8 * stride_b;
-        }
-
-        // This for loop performs the left-over accumulations
-        for(; vec_a < vec_a_end_addr;)
-        {
-            const int8x8_t  a00_s8 = vld1_dup_s8(vec_a);
-            const int8x16_t b00_s8 = vld1q_s8(matrix_b);
-
-            const int16x4x4_t b00_s16 =
-            {
-                {
-                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
-                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
-                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
-                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
-                }
-            };
-
-            // Convert a00_s8 to uint16_t and get the lower part
-            const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
-
-            // Accumulate 0:
-            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
-            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
-            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
-            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
-
-            vec_a += 1;
-            matrix_b += stride_b;
-        }
-
-        auto vec_out = reinterpret_cast<int32_t *>(out.ptr());
-        if(id.x() < (width_out - 16))
-        {
-            vst1q_s32(vec_out + 0, c0.val[0]);
-            vst1q_s32(vec_out + 4, c0.val[1]);
-            vst1q_s32(vec_out + 8, c0.val[2]);
-            vst1q_s32(vec_out + 12, c0.val[3]);
-        }
-        else
-        {
-            auto left_over = width_out - id.x();
-            for(auto k = 0; k < 4 && left_over; ++k)
-            {
-                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
-                {
-                    *(vec_out + k * 4 + j) = c0.val[k][j];
+                    for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+                    {
+                        *(vec_out + k * 4 + j) = c0.val[k][j];
+                    }
                 }
             }
-        }
-    },
-    ina, inb, out);
+        },
+        ina, inb, out);
 }
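
Both vector paths lean on the same headroom argument behind the uint32_t note: a u8 product is
at most 255 * 255 = 65025, so one 32-bit lane can absorb floor((2^32 - 1) / 65025) = 66051 such
terms before overflow, while an s8 product is bounded in magnitude by 128 * 128 = 16384,
allowing floor((2^31 - 1) / 16384) = 131071 terms. A compile-time restatement of those bounds
(illustrative only; it assumes width_a stays below them):

    // Headroom of the widening multiply-accumulate, one lane per bound.
    static_assert(4294967295u / (255u * 255u) == 66051u, "u8 path: safe accumulation count");
    static_assert(2147483647 / (128 * 128) == 131071, "s8 path: safe accumulation count");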
 
-void inline matrix_multiply_u8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
+void inline matrix_multiply_u8(
+    Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
 {
     const auto   width_out  = static_cast<int>(out_info.dimension(0));
     const auto   height_out = static_cast<int>(out_info.dimension(1));
     const size_t out_stride = out_info.strides_in_bytes()[1] / out_info.element_size();
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const uint8_t *mtx_a0 = ina.ptr();
-        const uint8_t *mtx_b0 = inb.ptr();
-
-        // Note: Since the input are all positives, we can use uint32_t
-        // Accumulators for the block 0
-        uint32x4x4_t c0 =
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
+            const uint8_t *mtx_a0 = ina.ptr();
+            const uint8_t *mtx_b0 = inb.ptr();
+
+            // Note: Since the inputs are all positive, we can use uint32_t
+            // Accumulators for the block 0
+            uint32x4x4_t c0 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+            // Accumulators for the block 1
+            uint32x4x4_t c1 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+            // Accumulators for the block 2
+            uint32x4x4_t c2 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+            // Accumulators for the block 3
+            uint32x4x4_t c3 = {{vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0), vdupq_n_u32(0)}};
+
+            for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
             {
-                vdupq_n_u32(0),
-                vdupq_n_u32(0),
-                vdupq_n_u32(0),
-                vdupq_n_u32(0)
+                const uint8x8_t  a00_u8 = vld1_u8(mtx_a0);
+                const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);
+
+                // Convert a00_u8 to uint16_t and get the lower part
+                const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
+
+                // Convert b00_u8 to uint16_t
+                const uint16x4x4_t b00_u16 = {
+                    {vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
+                     vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))), vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))}};
+
+                // 4x4 block 0
+                c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
+                c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
+                c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
+                c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
+
+                // 4x4 block 1
+                c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1);
+                c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1);
+                c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1);
+                c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1);
+
+                // 4x4 block 2
+                c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2);
+                c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2);
+                c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2);
+                c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2);
+
+                // 4x4 block 3
+                c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3);
+                c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3);
+                c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3);
+                c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3);
             }
-        };
 
-        // Accumulators for the block 1
-        uint32x4x4_t c1 =
-        {
+            auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
+
+            if (id.y() < height_out && id.x() < (width_out - 16))
             {
-                vdupq_n_u32(0),
-                vdupq_n_u32(0),
-                vdupq_n_u32(0),
-                vdupq_n_u32(0)
-            }
-        };
-
-        // Accumulators for the block 2
-        uint32x4x4_t c2 =
-        {
-            {
-                vdupq_n_u32(0),
-                vdupq_n_u32(0),
-                vdupq_n_u32(0),
-                vdupq_n_u32(0)
-            }
-        };
-
-        // Accumulators for the block 3
-        uint32x4x4_t c3 =
-        {
-            {
-                vdupq_n_u32(0),
-                vdupq_n_u32(0),
-                vdupq_n_u32(0),
-                vdupq_n_u32(0)
-            }
-        };
-
-        for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
-        {
-            const uint8x8_t  a00_u8 = vld1_u8(mtx_a0);
-            const uint8x16_t b00_u8 = vld1q_u8(mtx_b0);
-
-            // Convert a00_u8 to uint16_t and get the lower part
-            const uint16x4_t a00_u16 = vget_low_u16(vmovl_u8(a00_u8));
-
-            // Convert b00_s8 to uint16_t
-            const uint16x4x4_t b00_u16 =
-            {
+                vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0]));
+                vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1]));
+                vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2]));
+                vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3]));
+                if (id.y() + 1 < height_out)
                 {
-                    vget_low_u16(vmovl_u8(vget_low_u8(b00_u8))),
-                    vget_high_u16(vmovl_u8(vget_low_u8(b00_u8))),
-                    vget_low_u16(vmovl_u8(vget_high_u8(b00_u8))),
-                    vget_high_u16(vmovl_u8(vget_high_u8(b00_u8)))
-                }
-            };
-
-            // 4x4 block 0
-            c0.val[0] = vmlal_lane_u16(c0.val[0], b00_u16.val[0], a00_u16, 0);
-            c0.val[1] = vmlal_lane_u16(c0.val[1], b00_u16.val[1], a00_u16, 0);
-            c0.val[2] = vmlal_lane_u16(c0.val[2], b00_u16.val[2], a00_u16, 0);
-            c0.val[3] = vmlal_lane_u16(c0.val[3], b00_u16.val[3], a00_u16, 0);
-
-            // 4x4 block 1
-            c1.val[0] = vmlal_lane_u16(c1.val[0], b00_u16.val[0], a00_u16, 1);
-            c1.val[1] = vmlal_lane_u16(c1.val[1], b00_u16.val[1], a00_u16, 1);
-            c1.val[2] = vmlal_lane_u16(c1.val[2], b00_u16.val[2], a00_u16, 1);
-            c1.val[3] = vmlal_lane_u16(c1.val[3], b00_u16.val[3], a00_u16, 1);
-
-            // 4x4 block 2
-            c2.val[0] = vmlal_lane_u16(c2.val[0], b00_u16.val[0], a00_u16, 2);
-            c2.val[1] = vmlal_lane_u16(c2.val[1], b00_u16.val[1], a00_u16, 2);
-            c2.val[2] = vmlal_lane_u16(c2.val[2], b00_u16.val[2], a00_u16, 2);
-            c2.val[3] = vmlal_lane_u16(c2.val[3], b00_u16.val[3], a00_u16, 2);
-
-            // 4x4 block 3
-            c3.val[0] = vmlal_lane_u16(c3.val[0], b00_u16.val[0], a00_u16, 3);
-            c3.val[1] = vmlal_lane_u16(c3.val[1], b00_u16.val[1], a00_u16, 3);
-            c3.val[2] = vmlal_lane_u16(c3.val[2], b00_u16.val[2], a00_u16, 3);
-            c3.val[3] = vmlal_lane_u16(c3.val[3], b00_u16.val[3], a00_u16, 3);
-        }
-
-        auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
-
-        if(id.y() < height_out && id.x() < (width_out - 16))
-        {
-            vst1q_s32(mtx_out + 0 * out_stride + 0, vreinterpretq_s32_u32(c0.val[0]));
-            vst1q_s32(mtx_out + 0 * out_stride + 4, vreinterpretq_s32_u32(c0.val[1]));
-            vst1q_s32(mtx_out + 0 * out_stride + 8, vreinterpretq_s32_u32(c0.val[2]));
-            vst1q_s32(mtx_out + 0 * out_stride + 12, vreinterpretq_s32_u32(c0.val[3]));
-            if(id.y() + 1 < height_out)
-            {
-                vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0]));
-                vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1]));
-                vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2]));
-                vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3]));
-                if(id.y() + 2 < height_out)
-                {
-                    vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0]));
-                    vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1]));
-                    vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2]));
-                    vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3]));
-                    if(id.y() + 3 < height_out)
+                    vst1q_s32(mtx_out + 1 * out_stride + 0, vreinterpretq_s32_u32(c1.val[0]));
+                    vst1q_s32(mtx_out + 1 * out_stride + 4, vreinterpretq_s32_u32(c1.val[1]));
+                    vst1q_s32(mtx_out + 1 * out_stride + 8, vreinterpretq_s32_u32(c1.val[2]));
+                    vst1q_s32(mtx_out + 1 * out_stride + 12, vreinterpretq_s32_u32(c1.val[3]));
+                    if (id.y() + 2 < height_out)
                     {
-                        vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0]));
-                        vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1]));
-                        vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2]));
-                        vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3]));
-                    }
-                }
-            }
-        }
-        else
-        {
-            const auto left_over_value = width_out - id.x();
-            auto       left_over       = left_over_value;
-            for(auto k = 0; k < 4 && left_over; ++k)
-            {
-                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
-                {
-                    *(mtx_out + k * 4 + j) = c0.val[k][j];
-                }
-            }
-            if(id.y() + 1 < height_out)
-            {
-                left_over = left_over_value;
-                for(auto k = 0; k < 4 && left_over; ++k)
-                {
-                    for(auto j = 0; j < 4 && left_over; ++j, --left_over)
-                    {
-                        *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
-                    }
-                }
-                if(id.y() + 2 < height_out)
-                {
-                    left_over = left_over_value;
-                    for(auto k = 0; k < 4 && left_over; ++k)
-                    {
-                        for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+                        vst1q_s32(mtx_out + 2 * out_stride + 0, vreinterpretq_s32_u32(c2.val[0]));
+                        vst1q_s32(mtx_out + 2 * out_stride + 4, vreinterpretq_s32_u32(c2.val[1]));
+                        vst1q_s32(mtx_out + 2 * out_stride + 8, vreinterpretq_s32_u32(c2.val[2]));
+                        vst1q_s32(mtx_out + 2 * out_stride + 12, vreinterpretq_s32_u32(c2.val[3]));
+                        if (id.y() + 3 < height_out)
                         {
-                            *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+                            vst1q_s32(mtx_out + 3 * out_stride + 0, vreinterpretq_s32_u32(c3.val[0]));
+                            vst1q_s32(mtx_out + 3 * out_stride + 4, vreinterpretq_s32_u32(c3.val[1]));
+                            vst1q_s32(mtx_out + 3 * out_stride + 8, vreinterpretq_s32_u32(c3.val[2]));
+                            vst1q_s32(mtx_out + 3 * out_stride + 12, vreinterpretq_s32_u32(c3.val[3]));
                         }
                     }
-                    if(id.y() + 3 < height_out)
+                }
+            }
+            else
+            {
+                const auto left_over_value = width_out - id.x();
+                auto       left_over       = left_over_value;
+                for (auto k = 0; k < 4 && left_over; ++k)
+                {
+                    for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+                    {
+                        *(mtx_out + k * 4 + j) = c0.val[k][j];
+                    }
+                }
+                if (id.y() + 1 < height_out)
+                {
+                    left_over = left_over_value;
+                    for (auto k = 0; k < 4 && left_over; ++k)
+                    {
+                        for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+                        {
+                            *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
+                        }
+                    }
+                    if (id.y() + 2 < height_out)
                     {
                         left_over = left_over_value;
-                        for(auto k = 0; k < 4 && left_over; ++k)
+                        for (auto k = 0; k < 4 && left_over; ++k)
                         {
-                            for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+                            for (auto j = 0; j < 4 && left_over; ++j, --left_over)
                             {
-                                *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+                                *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+                            }
+                        }
+                        if (id.y() + 3 < height_out)
+                        {
+                            left_over = left_over_value;
+                            for (auto k = 0; k < 4 && left_over; ++k)
+                            {
+                                for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+                                {
+                                    *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+                                }
                             }
                         }
                     }
                 }
             }
-        }
-    },
-    ina, inb, out);
+        },
+        ina, inb, out);
 }
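
The boundary branch above scatters accumulator lanes one at a time; with the paired k/j
counters, element k * 4 + j is simply the n-th of the sixteen lanes, so the loop writes the
first left_over lanes in order. A scalar restatement of that store (hypothetical helper, not
part of this patch):

    #include <cstdint>

    // Write the first `left_over` of sixteen accumulator lanes; lane j of
    // vector k lands at output element n = k * 4 + j.
    inline void store_partial(int32_t *dst, const int32_t (&lanes)[16], int left_over)
    {
        for (int n = 0; n < 16 && n < left_over; ++n)
        {
            dst[n] = lanes[n];
        }
    }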
 
-void inline matrix_multiply_s8(Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
+void inline matrix_multiply_s8(
+    Iterator &ina, Iterator &inb, Iterator &out, int width_b, const TensorInfo &out_info, const Window &window)
 {
     const auto   width_out  = static_cast<int>(out_info.dimension(0));
     const auto   height_out = static_cast<int>(out_info.dimension(1));
@@ -691,182 +540,148 @@
     // The implementation assumes that matrix A and matrix B have been reshaped with CpuGemmInterleave4x4 and CpuGemmTranspose1xW respectively
     // The reshaping of the matrices gives a cache-friendly implementation and avoids the data rearrangements needed for computing 16x4 elements per iteration
     // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
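
As a rough sketch of the layout those reshapes are assumed to produce (the helper below is
hypothetical, handles only row counts divisible by 4, and is not part of this patch): A's rows
are woven column-by-column so each "mtx_a0 += 4" step reads one column of a 4-row block, while
CpuGemmTranspose1xW lays 16-byte row strips of B out consecutively for the "mtx_b0 += 16" steps.

    #include <cstdint>
    #include <vector>

    // Interleave each 4-row block of A column-by-column:
    // a00 a10 a20 a30 a01 a11 a21 a31 ...
    std::vector<int8_t> interleave4x4(const int8_t *a, int rows, int cols)
    {
        std::vector<int8_t> out;
        out.reserve(static_cast<size_t>(rows) * cols);
        for (int r0 = 0; r0 < rows; r0 += 4)   // one 4-row block at a time
        {
            for (int c = 0; c < cols; ++c)     // walk columns within the block
            {
                for (int r = r0; r < r0 + 4; ++r)
                {
                    out.push_back(a[r * cols + c]);
                }
            }
        }
        return out;
    }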
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
-        auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
-
-        // Note: Since the input are all positives, we can use uint32_t
-        // Accumulators for the block 0
-        int32x4x4_t c0 =
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
+            auto *mtx_a0 = reinterpret_cast<const int8_t *>(ina.ptr());
+            auto *mtx_b0 = reinterpret_cast<const int8_t *>(inb.ptr());
+
+            // Accumulators for the block 0
+            int32x4x4_t c0 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+            // Accumulators for the block 1
+            int32x4x4_t c1 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+            // Accumulators for the block 2
+            int32x4x4_t c2 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+            // Accumulators for the block 3
+            int32x4x4_t c3 = {{vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0), vdupq_n_s32(0)}};
+
+            for (int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
             {
-                vdupq_n_s32(0),
-                vdupq_n_s32(0),
-                vdupq_n_s32(0),
-                vdupq_n_s32(0)
+                const int8x8_t  a00_s8 = vld1_s8(mtx_a0);
+                const int8x16_t b00_s8 = vld1q_s8(mtx_b0);
+
+                // Convert a00_s8 to int16_t and get the lower part
+                const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
+
+                // Convert b00_s8 to int16_t
+                const int16x4x4_t b00_s16 = {
+                    {vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
+                     vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))), vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))}};
+
+                // 4x4 block 0
+                c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
+                c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
+                c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
+                c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
+
+                // 4x4 block 1
+                c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1);
+                c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1);
+                c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1);
+                c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1);
+
+                // 4x4 block 2
+                c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2);
+                c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2);
+                c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2);
+                c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2);
+
+                // 4x4 block 3
+                c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3);
+                c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3);
+                c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3);
+                c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3);
             }
-        };
-
-        // Accumulators for the block 1
-        int32x4x4_t c1 =
-        {
+            auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
+            if (id.y() < height_out && id.x() < (width_out - 16))
             {
-                vdupq_n_s32(0),
-                vdupq_n_s32(0),
-                vdupq_n_s32(0),
-                vdupq_n_s32(0)
-            }
-        };
-
-        // Accumulators for the block 2
-        int32x4x4_t c2 =
-        {
-            {
-                vdupq_n_s32(0),
-                vdupq_n_s32(0),
-                vdupq_n_s32(0),
-                vdupq_n_s32(0)
-            }
-        };
-
-        // Accumulators for the block 3
-        int32x4x4_t c3 =
-        {
-            {
-                vdupq_n_s32(0),
-                vdupq_n_s32(0),
-                vdupq_n_s32(0),
-                vdupq_n_s32(0)
-            }
-        };
-
-        for(int k = 0; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16)
-        {
-            const int8x8_t  a00_s8 = vld1_s8(mtx_a0);
-            const int8x16_t b00_s8 = vld1q_s8(mtx_b0);
-
-            // Convert a00_s8 to uint16_t and get the lower part
-            const int16x4_t a00_s16 = vget_low_s16(vmovl_s8(a00_s8));
-
-            // Convert b00_s8 to int16_t
-            const int16x4x4_t b00_s16 =
-            {
+                vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]);
+                vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]);
+                vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]);
+                vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]);
+                if (id.y() + 1 < height_out)
                 {
-                    vget_low_s16(vmovl_s8(vget_low_s8(b00_s8))),
-                    vget_high_s16(vmovl_s8(vget_low_s8(b00_s8))),
-                    vget_low_s16(vmovl_s8(vget_high_s8(b00_s8))),
-                    vget_high_s16(vmovl_s8(vget_high_s8(b00_s8)))
-                }
-            };
-
-            // 4x4 block 0
-            c0.val[0] = vmlal_lane_s16(c0.val[0], b00_s16.val[0], a00_s16, 0);
-            c0.val[1] = vmlal_lane_s16(c0.val[1], b00_s16.val[1], a00_s16, 0);
-            c0.val[2] = vmlal_lane_s16(c0.val[2], b00_s16.val[2], a00_s16, 0);
-            c0.val[3] = vmlal_lane_s16(c0.val[3], b00_s16.val[3], a00_s16, 0);
-
-            // 4x4 block 1
-            c1.val[0] = vmlal_lane_s16(c1.val[0], b00_s16.val[0], a00_s16, 1);
-            c1.val[1] = vmlal_lane_s16(c1.val[1], b00_s16.val[1], a00_s16, 1);
-            c1.val[2] = vmlal_lane_s16(c1.val[2], b00_s16.val[2], a00_s16, 1);
-            c1.val[3] = vmlal_lane_s16(c1.val[3], b00_s16.val[3], a00_s16, 1);
-
-            // 4x4 block 2
-            c2.val[0] = vmlal_lane_s16(c2.val[0], b00_s16.val[0], a00_s16, 2);
-            c2.val[1] = vmlal_lane_s16(c2.val[1], b00_s16.val[1], a00_s16, 2);
-            c2.val[2] = vmlal_lane_s16(c2.val[2], b00_s16.val[2], a00_s16, 2);
-            c2.val[3] = vmlal_lane_s16(c2.val[3], b00_s16.val[3], a00_s16, 2);
-
-            // 4x4 block 3
-            c3.val[0] = vmlal_lane_s16(c3.val[0], b00_s16.val[0], a00_s16, 3);
-            c3.val[1] = vmlal_lane_s16(c3.val[1], b00_s16.val[1], a00_s16, 3);
-            c3.val[2] = vmlal_lane_s16(c3.val[2], b00_s16.val[2], a00_s16, 3);
-            c3.val[3] = vmlal_lane_s16(c3.val[3], b00_s16.val[3], a00_s16, 3);
-        }
-        auto mtx_out = reinterpret_cast<int32_t *>(out.ptr());
-        if(id.y() < height_out && id.x() < (width_out - 16))
-        {
-            vst1q_s32(mtx_out + 0 * out_stride + 0, c0.val[0]);
-            vst1q_s32(mtx_out + 0 * out_stride + 4, c0.val[1]);
-            vst1q_s32(mtx_out + 0 * out_stride + 8, c0.val[2]);
-            vst1q_s32(mtx_out + 0 * out_stride + 12, c0.val[3]);
-            if(id.y() + 1 < height_out)
-            {
-                vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]);
-                vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]);
-                vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]);
-                vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]);
-                if(id.y() + 2 < height_out)
-                {
-                    vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]);
-                    vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]);
-                    vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]);
-                    vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]);
-                    if(id.y() + 3 < height_out)
+                    vst1q_s32(mtx_out + 1 * out_stride + 0, c1.val[0]);
+                    vst1q_s32(mtx_out + 1 * out_stride + 4, c1.val[1]);
+                    vst1q_s32(mtx_out + 1 * out_stride + 8, c1.val[2]);
+                    vst1q_s32(mtx_out + 1 * out_stride + 12, c1.val[3]);
+                    if (id.y() + 2 < height_out)
                     {
-                        vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);
-                        vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]);
-                        vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]);
-                        vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]);
-                    }
-                }
-            }
-        }
-        else if(id.y() < height_out)
-        {
-            const auto left_over_value = width_out - id.x();
-            auto       left_over       = left_over_value;
-            for(auto k = 0; k < 4 && left_over; ++k)
-            {
-                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
-                {
-                    *(mtx_out + k * 4 + j) = c0.val[k][j];
-                }
-            }
-            if(id.y() + 1 < height_out)
-            {
-                left_over = left_over_value;
-                for(auto k = 0; k < 4 && left_over; ++k)
-                {
-                    for(auto j = 0; j < 4 && left_over; ++j, --left_over)
-                    {
-                        *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
-                    }
-                }
-                if(id.y() + 2 < height_out)
-                {
-                    left_over = left_over_value;
-                    for(auto k = 0; k < 4 && left_over; ++k)
-                    {
-                        for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+                        vst1q_s32(mtx_out + 2 * out_stride + 0, c2.val[0]);
+                        vst1q_s32(mtx_out + 2 * out_stride + 4, c2.val[1]);
+                        vst1q_s32(mtx_out + 2 * out_stride + 8, c2.val[2]);
+                        vst1q_s32(mtx_out + 2 * out_stride + 12, c2.val[3]);
+                        if (id.y() + 3 < height_out)
                         {
-                            *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+                            vst1q_s32(mtx_out + 3 * out_stride + 0, c3.val[0]);
+                            vst1q_s32(mtx_out + 3 * out_stride + 4, c3.val[1]);
+                            vst1q_s32(mtx_out + 3 * out_stride + 8, c3.val[2]);
+                            vst1q_s32(mtx_out + 3 * out_stride + 12, c3.val[3]);
                         }
                     }
-                    if(id.y() + 3 < height_out)
+                }
+            }
+            else if (id.y() < height_out)
+            {
+                const auto left_over_value = width_out - id.x();
+                auto       left_over       = left_over_value;
+                for (auto k = 0; k < 4 && left_over; ++k)
+                {
+                    for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+                    {
+                        *(mtx_out + k * 4 + j) = c0.val[k][j];
+                    }
+                }
+                if (id.y() + 1 < height_out)
+                {
+                    left_over = left_over_value;
+                    for (auto k = 0; k < 4 && left_over; ++k)
+                    {
+                        for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+                        {
+                            *(mtx_out + out_stride + k * 4 + j) = c1.val[k][j];
+                        }
+                    }
+                    if (id.y() + 2 < height_out)
                     {
                         left_over = left_over_value;
-                        for(auto k = 0; k < 4 && left_over; ++k)
+                        for (auto k = 0; k < 4 && left_over; ++k)
                         {
-                            for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+                            for (auto j = 0; j < 4 && left_over; ++j, --left_over)
                             {
-                                *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+                                *(mtx_out + out_stride * 2 + k * 4 + j) = c2.val[k][j];
+                            }
+                        }
+                        if (id.y() + 3 < height_out)
+                        {
+                            left_over = left_over_value;
+                            for (auto k = 0; k < 4 && left_over; ++k)
+                            {
+                                for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+                                {
+                                    *(mtx_out + out_stride * 3 + k * 4 + j) = c3.val[k][j];
+                                }
                             }
                         }
                     }
                 }
             }
-        }
-
-    },
-    ina, inb, out);
+        },
+        ina, inb, out);
 }
 
 Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8, DataType::U8);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::S8, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+                                                         DataType::U8);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
 
     TensorShape in0_shape = src0->tensor_shape();
@@ -874,9 +689,10 @@
     TensorShape out_shape = dst->tensor_shape();
 
     // Check vector-by-matrix case
-    if(out_shape[1] == 1)
+    if (out_shape[1] == 1)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1], "The number of input0's columns must be equal to input1's rows");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[0] != in1_shape[1],
+                                        "The number of input0's columns must be equal to input1's rows");
     }
     else
     {
@@ -884,8 +700,11 @@
         in1_shape.collapse(2);
         out_shape.collapse(2);
 
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2], "Output tensor must have the same number of batches of input0 tensor");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[2] != 1 && in0_shape[2] != in1_shape[2], "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(in0_shape[2] != out_shape[2],
+                                        "Output tensor must have the same number of batches of input0 tensor");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            in1_shape[2] != 1 && in0_shape[2] != in1_shape[2],
+            "Input1 tensor must have the same number of batches of input0 or the number of batches must be set to 1");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_shape[0] % 16, "Input1's width must be a multiple of 16");
     }
 
@@ -909,20 +728,22 @@
 
     Window win;
     // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication
-    if((dst->dimension(1) == 1))
+    if ((dst->dimension(1) == 1))
     {
         // Configure kernel window
         win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x));
     }
     else
     {
-        win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+        win =
+            calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
     }
 
     ICpuKernel::configure(win);
 }
 
-Status CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+Status
+CpuGemmLowpMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst));
     return Status{};
@@ -939,12 +760,13 @@
     auto dst  = tensors.get_tensor(TensorType::ACL_DST);
 
     // Check if the output tensor is a vector. If so, the kernel runs the vector-matrix multiplication path
-    if((dst->info()->dimension(1) == 1))
+    if ((dst->info()->dimension(1) == 1))
     {
         const auto width_matrix_a = static_cast<int>(src0->info()->dimension(0));
         const auto width_matrix_b = static_cast<int>(src1->info()->dimension(0));
         const auto width_out      = static_cast<int>(dst->info()->dimension(0));
-        const auto in_b_stride    = static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type()));
+        const auto in_b_stride =
+            static_cast<int>(src1->info()->strides_in_bytes()[1] / data_size_from_type(src1->info()->data_type()));
 
         // The implementation computes 16 elements per iteration
         const int window_start_x = 16 * info.thread_id;
@@ -963,7 +785,7 @@
         Window win_b;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(src1->info()->num_dimensions() >= 3)
+        if (src1->info()->num_dimensions() >= 3)
         {
             win_b = window;
         }
@@ -974,18 +796,20 @@
         Iterator inb(src1, win_b);
         Iterator out(dst, win_out);
 
-        switch(src0->info()->data_type())
+        switch (src0->info()->data_type())
         {
             case DataType::S8:
             case DataType::QASYMM8_SIGNED:
             {
-                vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
+                vector_matrix_multiply_s8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride,
+                                          window);
                 break;
             }
             case DataType::U8:
             case DataType::QASYMM8:
             {
-                vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride, window);
+                vector_matrix_multiply_u8(ina, inb, out, width_matrix_a, width_matrix_b, width_out, in_b_stride,
+                                          window);
                 break;
             }
             default:
@@ -1009,7 +833,7 @@
         Window win_b;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(_slide_matrix_b)
+        if (_slide_matrix_b)
         {
             win_b = window;
         }
@@ -1021,7 +845,7 @@
         Iterator inb(src1, win_b);
         Iterator out(dst, window);
 
-        switch(src0->info()->data_type())
+        switch (src0->info()->data_type())
         {
             case DataType::S8:
             case DataType::QASYMM8_SIGNED:
@@ -1050,4 +874,4 @@
 }
 } // namespace kernels
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
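Note: the ragged-edge handling reindented above is a standard NEON tail
pattern: four full 16-lane stores when a whole 4x16 block fits the output
row, per-lane scalar copies otherwise. A minimal, self-contained sketch of
that pattern (store_block, acc, width_out and x are illustrative names, not
the kernel's API):

    #include <arm_neon.h>
    #include <cstdint>

    static void store_block(int32_t *out, int x, int width_out, const int32x4x4_t &acc)
    {
        if (x + 16 <= width_out)
        {
            // Fast path: all 16 accumulated lanes fit in the output row.
            vst1q_s32(out + 0, acc.val[0]);
            vst1q_s32(out + 4, acc.val[1]);
            vst1q_s32(out + 8, acc.val[2]);
            vst1q_s32(out + 12, acc.val[3]);
            return;
        }
        // Tail path: spill the vectors to scratch, copy only the in-bounds lanes.
        int32_t tmp[16];
        vst1q_s32(tmp + 0, acc.val[0]);
        vst1q_s32(tmp + 4, acc.val[1]);
        vst1q_s32(tmp + 8, acc.val[2]);
        vst1q_s32(tmp + 12, acc.val[3]);
        for (int j = 0; j < width_out - x; ++j)
        {
            out[j] = tmp[j];
        }
    }
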
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
index 2cc789d..439ada1 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h
@@ -68,11 +68,11 @@
     static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
-    bool _slide_matrix_b{ true };
+    bool _slide_matrix_b{true};
 };
 } // namespace kernels
 } // namespace cpu
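Note: _slide_matrix_b above records whether matrix B owns a batch dimension.
When it does not (B is a plain 2-D matrix feeding a batched output, as in the
convolution case), the kernel pins B's window so every batch re-reads the
same data. A hedged sketch of that window setup using the library's Window
API; make_win_b is an illustrative free function, not the kernel's code:

    #include "arm_compute/core/Window.h"

    static arm_compute::Window make_win_b(const arm_compute::Window &window, bool slide_matrix_b)
    {
        arm_compute::Window win_b;
        if (slide_matrix_b)
        {
            // B is batched: iterate it in lockstep with the output window.
            win_b = window;
        }
        else
        {
            // B is shared across batches: a null range keeps its iterator fixed.
            win_b.set(arm_compute::Window::DimX, arm_compute::Window::Dimension(0, 0, 0));
            win_b.set(arm_compute::Window::DimY, arm_compute::Window::Dimension(0, 0, 0));
        }
        return win_b;
    }
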
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
index 534076b..9bd1eae 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.cpp
@@ -26,9 +26,10 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
@@ -38,37 +39,49 @@
 {
 namespace
 {
-Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status validate_arguments_matrix_a_reduction(const ITensorInfo                 *src,
+                                             const ITensorInfo                 *dst,
+                                             const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
 
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->dimension(0) != src->dimension(1),
+            "Output vector must have length equal to the number of rows of the input matrix");
     }
     return Status{};
 }
-Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status validate_arguments_matrix_b_reduction(const ITensorInfo                 *src,
+                                             const ITensorInfo                 *dst,
+                                             const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_ON_MSG(info.is_reshaped == true, "Not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
 
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->dimension(0) != src->dimension(0),
+            "Output vector must have length equal to the number of columns of the input matrix");
     }
     return Status{};
 }
 } // namespace
 
-void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+void CpuGemmLowpMatrixAReductionKernel::configure(const ITensorInfo                 *src,
+                                                  ITensorInfo                       *dst,
+                                                  const GEMMLowpReductionKernelInfo &info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -77,7 +90,7 @@
     _scalar        = info.scalar;
     _mul_by_scalar = info.mul_by_scalar;
 
-    switch(src->data_type())
+    switch (src->data_type())
     {
         case DataType::QASYMM8:
             _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<uint8_t>;
@@ -98,14 +111,18 @@
     ICpuKernel::configure(win);
 }
 
-Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status CpuGemmLowpMatrixAReductionKernel::validate(const ITensorInfo                 *src,
+                                                   const ITensorInfo                 *dst,
+                                                   const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(src, dst, info));
     return Status{};
 }
 
 template <typename T>
-void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor *src, ITensor *dst, const arm_compute::Window &window)
+void CpuGemmLowpMatrixAReductionKernel::run_internal(const ITensor             *src,
+                                                     ITensor                   *dst,
+                                                     const arm_compute::Window &window)
 {
     // Intermediate and final accumulator types
     using TIAcc = wrapper::traits::promote_t<T>;
@@ -121,55 +138,58 @@
     Iterator in(src, win_input);
     Iterator out(dst, collapsed_window);
 
-    execute_window_loop(collapsed_window, [&](const Coordinates & id)
-    {
-        auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
-        TAcc sum_row  = 0;
+    execute_window_loop(
+        collapsed_window,
+        [&](const Coordinates &id)
+        {
+            auto vsum_row = wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{});
+            TAcc sum_row  = 0;
 
-        const T *matrix_a = reinterpret_cast<const T *>((in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2]));
+            const T *matrix_a = reinterpret_cast<const T *>(
+                (in.ptr() + id.x() * src->info()->strides_in_bytes()[1] + id.y() * src->info()->strides_in_bytes()[2]));
 
 #if __arm__
-        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
+            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_a));
 #endif /* __arm__ */
 
-        int i = 0;
-        // This for loop performs 16 accumulations
-        for(; i <= (_k - 16); i += 16)
-        {
-            const auto a0_d8 = wrapper::vloadq(matrix_a + i);
+            int i = 0;
+            // This for loop performs 16 accumulations
+            for (; i <= (_k - 16); i += 16)
+            {
+                const auto a0_d8 = wrapper::vloadq(matrix_a + i);
 
-            // Partial accumulations in U16
-            const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
+                // Partial accumulations in U16
+                const auto tmp_sum0 = wrapper::vaddl(wrapper::vgetlow(a0_d8), wrapper::vgethigh(a0_d8));
 
-            // Accumulate to U32
-            vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
-        }
+                // Accumulate to U32
+                vsum_row = wrapper::vadd(vsum_row, wrapper::vpaddl(tmp_sum0));
+            }
 
-        // This for loop performs the leftover accumulations
-        for(; i < _k; ++i)
-        {
-            sum_row += static_cast<TAcc>(matrix_a[i]);
-        }
+            // This for loop performs the leftover accumulations
+            for (; i < _k; ++i)
+            {
+                sum_row += static_cast<TAcc>(matrix_a[i]);
+            }
 
 #if defined(__aarch64__)
-        // Reduction operation available on 64 bit architectures only
-        sum_row += wrapper::vaddv(vsum_row);
+            // Reduction operation available on 64 bit architectures only
+            sum_row += wrapper::vaddv(vsum_row);
 #else  // __aarch64__
-        auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
-        tmp      = wrapper::vpadd(tmp, tmp);
+            auto tmp = wrapper::vpadd(wrapper::vgethigh(vsum_row), wrapper::vgetlow(vsum_row));
+            tmp      = wrapper::vpadd(tmp, tmp);
 
-        sum_row += wrapper::vgetlane(tmp, 0);
+            sum_row += wrapper::vgetlane(tmp, 0);
 #endif // __aarch64__
 
-        // Multiply by scalar if necessary
-        if(_mul_by_scalar)
-        {
-            sum_row *= _scalar;
-        }
+            // Multiply by scalar if necessary
+            if (_mul_by_scalar)
+            {
+                sum_row *= _scalar;
+            }
 
-        *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
-    },
-    in, out);
+            *(reinterpret_cast<int *>(out.ptr())) = static_cast<int32_t>(sum_row);
+        },
+        in, out);
 }
 
 void CpuGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
@@ -189,7 +209,9 @@
     return "CpuGemmLowpMatrixAReductionKernel";
 }
 
-void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+void CpuGemmLowpMatrixBReductionKernel::configure(const ITensorInfo                 *src,
+                                                  ITensorInfo                       *dst,
+                                                  const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(src, dst, info));
@@ -201,7 +223,7 @@
     // Configure kernel window
     constexpr unsigned int num_elems_processed_per_iteration = 16;
 
-    switch(src->data_type())
+    switch (src->data_type())
     {
         case DataType::QASYMM8:
             _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<uint8_t>;
@@ -223,14 +245,19 @@
     ICpuKernel::configure(win);
 }
 
-Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
+Status CpuGemmLowpMatrixBReductionKernel::validate(const ITensorInfo                 *src,
+                                                   const ITensorInfo                 *dst,
+                                                   const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(src, dst, info));
     return Status{};
 }
 
 template <typename T>
-void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpMatrixBReductionKernel::run_internal(const ITensor    *src,
+                                                     ITensor          *dst,
+                                                     const Window     &window,
+                                                     const ThreadInfo &info)
 {
     // Intermediate and final accumulator types
     using TIAcc = wrapper::traits::promote_t<T>;
@@ -258,121 +285,116 @@
     Iterator inb(src, win_in);
     Iterator out(dst, win_out);
 
-    execute_window_loop(win_out, [&](const Coordinates & id)
-    {
-        if(id.x() > width_matrix_b)
+    execute_window_loop(
+        win_out,
+        [&](const Coordinates &id)
         {
-            return;
-        }
+            if (id.x() > width_matrix_b)
+            {
+                return;
+            }
 
-        // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
-        typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] =
-        {
-            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
-            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
-            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
-            wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})
-        };
+            // Note: Since the input is unsigned char, we can safely use unsigned int for the accumulation
+            typename wrapper::traits::neon_bitvector<TAcc, wrapper::traits::BitWidth::W128>::type sum_col[4] = {
+                wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+                wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+                wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{}),
+                wrapper::vdup_n(static_cast<TAcc>(0), wrapper::traits::vector_128_tag{})};
 
-        const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]);
+            const auto *matrix_b = reinterpret_cast<const T *>(inb.ptr() + id.y() * src->info()->strides_in_bytes()[2]);
 
 #if __arm__
-        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
-        asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
+            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b));
+            asm volatile("PLD [%0, #128*4]" ::"r"(matrix_b + in_b_stride));
 #endif /* __arm__ */
 
-        int i = 0;
-        // This for loop performs 4 accumulations
-        for(; i <= (_k - 4); i += 4)
-        {
-            const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
-            const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
-            const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
-            const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
+            int i = 0;
+            // This for loop performs 4 accumulations
+            for (; i <= (_k - 4); i += 4)
+            {
+                const auto b0_u8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
+                const auto b1_u8 = wrapper::vloadq(matrix_b + 1 * in_b_stride);
+                const auto b2_u8 = wrapper::vloadq(matrix_b + 2 * in_b_stride);
+                const auto b3_u8 = wrapper::vloadq(matrix_b + 3 * in_b_stride);
 
 #if __arm__
-            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
-            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
-            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
-            asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
+                asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 1 * in_b_stride));
+                asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 2 * in_b_stride));
+                asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 3 * in_b_stride));
+                asm volatile("PLD [%0, #128*1]" ::"r"(matrix_b + 4 * in_b_stride));
 #endif /* __arm__ */
 
-            // Partial accumulation in 16bit
-            typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] =
+                // Partial accumulation in 16bit
+                typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type tmp_sum[2] = {
+                    wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
+                    wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})};
+
+                tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
+                tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
+                tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
+                tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
+                tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
+                tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
+                tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
+                tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
+
+                // Accumulate to 32bit
+                sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
+                sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
+                sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
+                sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
+
+                matrix_b += 4 * in_b_stride;
+            }
+
+            // This for loop performs the leftover accumulations
+            for (; i < _k; ++i)
             {
-                wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{}),
-                wrapper::vdup_n(static_cast<TIAcc>(0), wrapper::traits::vector_128_tag{})
-            };
+                const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
 
-            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b1_u8));
-            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b0_u8));
-            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b2_u8));
-            tmp_sum[0] = wrapper::vaddw(tmp_sum[0], wrapper::vgetlow(b3_u8));
-            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b0_u8));
-            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b1_u8));
-            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b2_u8));
-            tmp_sum[1] = wrapper::vaddw(tmp_sum[1], wrapper::vgethigh(b3_u8));
+                // Convert S8 to S16
+                const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]{
+                    wrapper::vmovl(wrapper::vgetlow(b0_b8)), wrapper::vmovl(wrapper::vgethigh(b0_b8))};
 
-            // Accumulate to 32bit
-            sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(tmp_sum[0]));
-            sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(tmp_sum[0]));
-            sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(tmp_sum[1]));
-            sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(tmp_sum[1]));
+                // Accumulate to 32bit
+                sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
+                sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
+                sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
+                sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
 
-            matrix_b += 4 * in_b_stride;
-        }
+                matrix_b += in_b_stride;
+            }
 
-        // This for loop perfoms the leftover accumulations
-        for(; i < _k; ++i)
-        {
-            const auto b0_b8 = wrapper::vloadq(matrix_b + 0 * in_b_stride);
-
-            // Convert S8 to S16
-            const typename wrapper::traits::neon_bitvector<TIAcc, wrapper::traits::BitWidth::W128>::type b0_b16[2]
+            // Multiply by scalar if necessary
+            if (_mul_by_scalar)
             {
-                wrapper::vmovl(wrapper::vgetlow(b0_b8)),
-                wrapper::vmovl(wrapper::vgethigh(b0_b8))
-            };
+                sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
+                sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
+                sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
+                sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
+            }
 
-            // Accumulate to 32bit
-            sum_col[0] = wrapper::vaddw(sum_col[0], wrapper::vgetlow(b0_b16[0]));
-            sum_col[1] = wrapper::vaddw(sum_col[1], wrapper::vgethigh(b0_b16[0]));
-            sum_col[2] = wrapper::vaddw(sum_col[2], wrapper::vgetlow(b0_b16[1]));
-            sum_col[3] = wrapper::vaddw(sum_col[3], wrapper::vgethigh(b0_b16[1]));
-
-            matrix_b += in_b_stride;
-        }
-
-        // Multiply by scalar if necessary
-        if(_mul_by_scalar)
-        {
-            sum_col[0] = wrapper::vmul(sum_col[0], vec_scalar);
-            sum_col[1] = wrapper::vmul(sum_col[1], vec_scalar);
-            sum_col[2] = wrapper::vmul(sum_col[2], vec_scalar);
-            sum_col[3] = wrapper::vmul(sum_col[3], vec_scalar);
-        }
-
-        auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
-        if(id.x() + 16 < width_matrix_b)
-        {
-            wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
-            wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
-            wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
-            wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
-        }
-        else
-        {
-            auto left_over = width_matrix_b - id.x();
-            for(auto k = 0; k < 4 && left_over; ++k)
+            auto vector_sum_col = reinterpret_cast<int32_t *>(out.ptr());
+            if (id.x() + 16 < width_matrix_b)
             {
-                for(auto j = 0; j < 4 && left_over; ++j, --left_over)
+                wrapper::vstore(vector_sum_col + 0, wrapper::vreinterpret(sum_col[0]));
+                wrapper::vstore(vector_sum_col + 4, wrapper::vreinterpret(sum_col[1]));
+                wrapper::vstore(vector_sum_col + 8, wrapper::vreinterpret(sum_col[2]));
+                wrapper::vstore(vector_sum_col + 12, wrapper::vreinterpret(sum_col[3]));
+            }
+            else
+            {
+                auto left_over = width_matrix_b - id.x();
+                for (auto k = 0; k < 4 && left_over; ++k)
                 {
-                    *(vector_sum_col + k * 4 + j) = sum_col[k][j];
+                    for (auto j = 0; j < 4 && left_over; ++j, --left_over)
+                    {
+                        *(vector_sum_col + k * 4 + j) = sum_col[k][j];
+                    }
                 }
             }
-        }
-    },
-    inb, out);
+        },
+        inb, out);
 }
 
 void CpuGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
@@ -393,4 +415,4 @@
 }
 } // namespace kernels
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
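Note: the A-reduction loop above sums 16 input bytes per iteration by
widening in stages (8-bit -> 16-bit pairwise -> 32-bit) and only reduces
horizontally once, outside the loop. A self-contained sketch of the signed
variant on AArch64 (row_sum_s8 is a made-up helper, not the kernel):

    #include <arm_neon.h>
    #include <cstdint>

    static int32_t row_sum_s8(const int8_t *row, int k)
    {
        int32x4_t vsum = vdupq_n_s32(0);
        int       i    = 0;
        for (; i <= k - 16; i += 16)
        {
            const int8x16_t a = vld1q_s8(row + i);
            // Widen to eight int16 partial sums: low half + high half.
            const int16x8_t t = vaddl_s8(vget_low_s8(a), vget_high_s8(a));
            // Pairwise-widen to int32 and accumulate.
            vsum = vaddq_s32(vsum, vpaddlq_s16(t));
        }
        int32_t sum = vaddvq_s32(vsum); // horizontal add, AArch64 only
        for (; i < k; ++i)              // scalar leftovers, as in the kernel
        {
            sum += row[i];
        }
        return sum;
    }
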
diff --git a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
index e469629..20ef17e 100644
--- a/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h
@@ -66,7 +66,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
@@ -85,12 +85,14 @@
      * @param[out] dst    Output tensor
      * @param[in]  window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
      */
-    using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
+    using CpuGemmLowpMatrixAReductionKernelPtr = void (CpuGemmLowpMatrixAReductionKernel::*)(const ITensor *src,
+                                                                                             ITensor       *dst,
+                                                                                             const Window  &window);
 
-    CpuGemmLowpMatrixAReductionKernelPtr _func{ nullptr };
-    int32_t                              _k{ 0 };
-    int32_t                              _scalar{ 0 };
-    bool                                 _mul_by_scalar{ false };
+    CpuGemmLowpMatrixAReductionKernelPtr _func{nullptr};
+    int32_t                              _k{0};
+    int32_t                              _scalar{0};
+    bool                                 _mul_by_scalar{false};
 };
 
 /** Kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
@@ -124,7 +126,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
@@ -144,12 +146,15 @@
      * @param[out] dst    Output tensor
      * @param[in]  window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
      */
-    using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor *src, ITensor *dst, const Window &window, const ThreadInfo &info);
+    using CpuGemmLowpMatrixBReductionKernelPtr = void (CpuGemmLowpMatrixBReductionKernel::*)(const ITensor    *src,
+                                                                                             ITensor          *dst,
+                                                                                             const Window     &window,
+                                                                                             const ThreadInfo &info);
 
-    CpuGemmLowpMatrixBReductionKernelPtr _func{ nullptr };
-    int32_t                              _k{ 0 };
-    int32_t                              _scalar{ 0 };
-    bool                                 _mul_by_scalar{ false };
+    CpuGemmLowpMatrixBReductionKernelPtr _func{nullptr};
+    int32_t                              _k{0};
+    int32_t                              _scalar{0};
+    bool                                 _mul_by_scalar{false};
 };
 } // namespace kernels
 } // namespace cpu
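Note: both reduction kernels pick a typed run_internal instantiation once in
configure() and then call through the stored pointer-to-member, avoiding a
per-execution switch on data type. A minimal sketch of the pattern (Kernel,
run_s8 and run_u8 are invented names):

    struct Kernel
    {
        using FnPtr = void (Kernel::*)(int);

        void run_s8(int n) { (void)n; /* signed path */ }
        void run_u8(int n) { (void)n; /* unsigned path */ }

        // Bind the right instantiation once, at configure time.
        void configure(bool is_signed) { _func = is_signed ? &Kernel::run_s8 : &Kernel::run_u8; }

        // Dispatch through the stored member pointer on every run.
        void run_op(int n) { (this->*_func)(n); }

        FnPtr _func{nullptr};
    };
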
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp
index a65f1a3..e290783 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -44,32 +45,37 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
-                          int32_t a_offset, int32_t b_offset)
+Status validate_arguments(const ITensorInfo *mm_result,
+                          const ITensorInfo *vector_sum_col,
+                          const ITensorInfo *vector_sum_row,
+                          int32_t            a_offset,
+                          int32_t            b_offset)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
 
     // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
+    if (a_offset != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
         ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
     }
 
     // If b_offset == 0, vector_sum_row can be a nullptr
-    if(b_offset != 0)
+    if (b_offset != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
 
         // Check if input is a 3D reinterpretation
-        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+        const bool reinterpret_as_3d =
+            mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
 
         // Validate input
-        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+                                                             (mm_result->dimension(1) * mm_result->dimension(2)));
         ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
 
         TensorShape output_shape = mm_result->tensor_shape();
-        if(output_shape.num_dimensions() > 1)
+        if (output_shape.num_dimensions() > 1)
         {
             const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
 
@@ -80,13 +86,15 @@
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
                                             "mm_result tensor must have the same number of batches of output tensor");
 
-            if(a_offset != 0)
+            if (a_offset != 0)
             {
                 TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
                 vector_sum_col_shape.collapse_from(1);
 
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+                                                    vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches of "
+                                                "vector_sum_row_shape or the number of batches must be set to 1");
             }
         }
     }
@@ -94,9 +102,15 @@
     return Status{};
 }
 
-void run_offset_contribution(const Window &window,
-                             ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row,
-                             int32_t a_offset, int32_t b_offset, int32_t k_offset, bool slide_vector_sum_col, bool is_gemm3d)
+void run_offset_contribution(const Window  &window,
+                             ITensor       *mm_result,
+                             const ITensor *vector_sum_col,
+                             const ITensor *vector_sum_row,
+                             int32_t        a_offset,
+                             int32_t        b_offset,
+                             int32_t        k_offset,
+                             bool           slide_vector_sum_col,
+                             bool           is_gemm3d)
 {
     Window collapsed_window = window.collapse_if_possible(window, Window::DimZ);
     collapsed_window.set(Window::DimX, Window::Dimension(0, 1, 1));
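Note: the branches below all apply the same quantized-GEMM identity.
Expanding sum_k (A[y][k] - za) * (B[k][x] - zb) gives the raw S32 product
plus three correction terms, which the kernel adds in place. A scalar
reference of that computation, assuming a_offset/b_offset hold the negated
zero points and k_offset == k * a_offset * b_offset as in the code above
(all names illustrative):

    #include <cstdint>

    static void offset_contribution(int32_t *mm_result, int width, int height,
                                    const int32_t *sum_col, const int32_t *sum_row,
                                    int32_t a_offset, int32_t b_offset, int32_t k_offset)
    {
        for (int y = 0; y < height; ++y)
        {
            // b_offset term: one row-sum of A per output row.
            const int32_t row_term = b_offset * sum_row[y];
            for (int x = 0; x < width; ++x)
            {
                // a_offset term: one column-sum of B per output column.
                mm_result[y * width + x] += k_offset + a_offset * sum_col[x] + row_term;
            }
        }
    }
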
@@ -112,7 +126,7 @@
     const size_t sum_col_stride_y = (vector_sum_col != nullptr) ? (vector_sum_col->info()->strides_in_bytes().y()) : 0;
     Iterator     mm_result_it(mm_result, collapsed_window);
 
-    if((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true
+    if ((a_offset != 0) && (b_offset != 0) && (vector_sum_col != nullptr) && (vector_sum_row != nullptr)) // true, true
     {
         // Set window for vector_sum_col
         Window win_vector_sum_col(collapsed_window);
@@ -131,95 +145,85 @@
         const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
 
         // Offset in case vector_sum_col is batched
-        const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+        const int vector_sum_col_batch_offset =
+            slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
 
-        execute_window_loop(collapsed_window, [&](const Coordinates & id)
-        {
-            const int    batch_id           = id.z() / depth_input;
-            const size_t batch_offset_col   = batch_id * (sum_col_stride_y );
-            auto         vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset);
-            auto         mm_result_ptr      = reinterpret_cast<int32_t *>(mm_result_it.ptr());
-
-            // Compute the leftover term due to b_offset.
-            int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input);
-            b_offset_term_s32 *= b_offset;
-
-            const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
-
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            collapsed_window,
+            [&](const Coordinates &id)
             {
-                // Compute the leftover term due to a_offset.
-                int32x4x4_t a_offset_term_s32 =
+                const int    batch_id         = id.z() / depth_input;
+                const size_t batch_offset_col = batch_id * (sum_col_stride_y);
+                auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col +
+                                                                            batch_id * vector_sum_col_batch_offset);
+                auto mm_result_ptr      = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+
+                // Compute the leftover term due to b_offset.
+                int32_t b_offset_term_s32 =
+                    *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+                      id.y() + (id.z() % depth_input) * height_input);
+                b_offset_term_s32 *= b_offset;
+
+                const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
+
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
-                        vld1q_s32(vector_sum_col_ptr + x + 0),
-                        vld1q_s32(vector_sum_col_ptr + x + 4),
-                        vld1q_s32(vector_sum_col_ptr + x + 8),
-                        vld1q_s32(vector_sum_col_ptr + x + 12)
-                    }
-                };
+                    // Compute the leftover term due to a_offset.
+                    int32x4x4_t a_offset_term_s32 = {
+                        {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4),
+                         vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}};
 
-                a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
-                a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
-                a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
-                a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+                    a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+                    a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+                    a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+                    a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
 
-                // Add a_offset_term_s32 and b_offset_term_s32
-                int32x4x4_t offset_term_s32 =
+                    // Add a_offset_term_s32 and b_offset_term_s32
+                    int32x4x4_t offset_term_s32 = {
+                        {vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}};
+
+                    offset_term_s32.val[0] =
+                        vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec));
+                    offset_term_s32.val[1] =
+                        vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec));
+                    offset_term_s32.val[2] =
+                        vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec));
+                    offset_term_s32.val[3] =
+                        vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec));
+
+                    int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4),
+                                           vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}};
+
+                    // Add the offset terms to GEMM's result
+                    in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
+                    in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
+                    in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
+                    in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
+
+                    // Store the result with the offset contribution
+                    vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+                    vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+                    vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+                    vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+                }
+
+                // Left-overs loop
+                for (; x < window_end_x; ++x)
                 {
-                    {
-                        vdupq_n_s32(k_offset),
-                        vdupq_n_s32(k_offset),
-                        vdupq_n_s32(k_offset),
-                        vdupq_n_s32(k_offset)
-                    }
-                };
+                    // Compute the leftover term due to a_offset.
+                    int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
 
-                offset_term_s32.val[0] = vaddq_s32(offset_term_s32.val[0], vaddq_s32(a_offset_term_s32.val[0], b_offset_term_s32_vec));
-                offset_term_s32.val[1] = vaddq_s32(offset_term_s32.val[1], vaddq_s32(a_offset_term_s32.val[1], b_offset_term_s32_vec));
-                offset_term_s32.val[2] = vaddq_s32(offset_term_s32.val[2], vaddq_s32(a_offset_term_s32.val[2], b_offset_term_s32_vec));
-                offset_term_s32.val[3] = vaddq_s32(offset_term_s32.val[3], vaddq_s32(a_offset_term_s32.val[3], b_offset_term_s32_vec));
+                    a_offset_term_s32 *= a_offset;
 
-                int32x4x4_t in_s32 =
-                {
-                    {
-                        vld1q_s32(mm_result_ptr + x + 0),
-                        vld1q_s32(mm_result_ptr + x + 4),
-                        vld1q_s32(mm_result_ptr + x + 8),
-                        vld1q_s32(mm_result_ptr + x + 12)
-                    }
-                };
-
-                // Add the offset terms to GEMM's result
-                in_s32.val[0] = vaddq_s32(in_s32.val[0], offset_term_s32.val[0]);
-                in_s32.val[1] = vaddq_s32(in_s32.val[1], offset_term_s32.val[1]);
-                in_s32.val[2] = vaddq_s32(in_s32.val[2], offset_term_s32.val[2]);
-                in_s32.val[3] = vaddq_s32(in_s32.val[3], offset_term_s32.val[3]);
-
-                // Store the result with the offset contribution
-                vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
-                vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
-                vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
-                vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
-            }
-
-            // Left-overs loop
-            for(; x < window_end_x; ++x)
-            {
-                // Compute the leftover term due to a_offset.
-                int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
-
-                a_offset_term_s32 *= a_offset;
-
-                // Add the offset terms to GEMM's result
-                // Store the result with the offset contribution
-                mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32;
-            }
-        },
-        vector_sum_col_it, vector_sum_row_it, mm_result_it);
+                    // Add the offset terms to GEMM's result
+                    // Store the result with the offset contribution
+                    mm_result_ptr[x] += k_offset + a_offset_term_s32 + b_offset_term_s32;
+                }
+            },
+            vector_sum_col_it, vector_sum_row_it, mm_result_it);
     }
-    else if((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true
+    else if ((a_offset == 0) && (b_offset != 0) && (vector_sum_row != nullptr)) // false, true
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
 
@@ -233,54 +237,51 @@
 
         const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
 
-        execute_window_loop(collapsed_window, [&](const Coordinates & id)
-        {
-            const int batch_id      = id.z() / depth_input;
-            auto      mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
-
-            // Compute the leftover term due to b_offset.
-            int32_t b_offset_term_s32 = *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) + id.y() + (id.z() % depth_input) * height_input);
-            b_offset_term_s32 *= b_offset;
-
-            const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
-
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            collapsed_window,
+            [&](const Coordinates &id)
             {
-                int32x4x4_t in_s32 =
+                const int batch_id      = id.z() / depth_input;
+                auto      mm_result_ptr = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+
+                // Compute the leftover term due to b_offset.
+                int32_t b_offset_term_s32 =
+                    *(reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+                      id.y() + (id.z() % depth_input) * height_input);
+                b_offset_term_s32 *= b_offset;
+
+                const int32x4_t b_offset_term_s32_vec = vdupq_n_s32(b_offset_term_s32);
+
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
-                        vld1q_s32(mm_result_ptr + x + 0),
-                        vld1q_s32(mm_result_ptr + x + 4),
-                        vld1q_s32(mm_result_ptr + x + 8),
-                        vld1q_s32(mm_result_ptr + x + 12)
-                    }
-                };
+                    int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4),
+                                           vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}};
 
-                // Add the offset terms to GEMM's result
-                in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec);
-                in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec);
-                in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec);
-                in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec);
+                    // Add the offset terms to GEMM's result
+                    in_s32.val[0] = vaddq_s32(in_s32.val[0], b_offset_term_s32_vec);
+                    in_s32.val[1] = vaddq_s32(in_s32.val[1], b_offset_term_s32_vec);
+                    in_s32.val[2] = vaddq_s32(in_s32.val[2], b_offset_term_s32_vec);
+                    in_s32.val[3] = vaddq_s32(in_s32.val[3], b_offset_term_s32_vec);
 
-                // Store the result with the offset contribution
-                vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
-                vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
-                vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
-                vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
-            }
+                    // Store the result with the offset contribution
+                    vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+                    vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+                    vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+                    vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+                }
 
-            // Left-overs loop
-            for(; x < window_end_x; ++x)
-            {
-                // Add the offset terms to GEMM's result
-                // Store the result with the offset contribution
-                mm_result_ptr[x] += b_offset_term_s32;
-            }
-        },
-        vector_sum_row_it, mm_result_it);
+                // Left-overs loop
+                for (; x < window_end_x; ++x)
+                {
+                    // Add the offset terms to GEMM's result
+                    // Store the result with the offset contribution
+                    mm_result_ptr[x] += b_offset_term_s32;
+                }
+            },
+            vector_sum_row_it, mm_result_it);
     }
-    else if((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false
+    else if ((a_offset != 0) && (b_offset == 0) && (vector_sum_col != nullptr)) // true, false
     {
         // Set window for vector_sum_col
         Window win_vector_sum_col(collapsed_window);
@@ -290,69 +291,62 @@
         Iterator vector_sum_col_it(vector_sum_col, win_vector_sum_col);
 
         // Offset in case vector_sum_col is batched
-        const int vector_sum_col_batch_offset = slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
+        const int vector_sum_col_batch_offset =
+            slide_vector_sum_col ? vector_sum_col->info()->strides_in_bytes().z() : 0;
 
-        execute_window_loop(collapsed_window, [&](const Coordinates & id)
-        {
-            const int    batch_id           = id.z() / depth_input;
-            const size_t batch_offset_col   = batch_id * (sum_col_stride_y ); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor
-            auto         vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col + batch_id * vector_sum_col_batch_offset);
-            auto         mm_result_ptr      = reinterpret_cast<int32_t *>(mm_result_it.ptr());
-
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            collapsed_window,
+            [&](const Coordinates &id)
             {
-                // Compute the leftover term due to a_offset.
-                int32x4x4_t a_offset_term_s32 =
+                const int    batch_id = id.z() / depth_input;
+                const size_t batch_offset_col =
+                    batch_id *
+                    (sum_col_stride_y); // Value to offset vector_sum_col_ptr to allow for iteration of y values in tensor
+                auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_offset_col +
+                                                                            batch_id * vector_sum_col_batch_offset);
+                auto mm_result_ptr      = reinterpret_cast<int32_t *>(mm_result_it.ptr());
+
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
-                        vld1q_s32(vector_sum_col_ptr + x + 0),
-                        vld1q_s32(vector_sum_col_ptr + x + 4),
-                        vld1q_s32(vector_sum_col_ptr + x + 8),
-                        vld1q_s32(vector_sum_col_ptr + x + 12)
-                    }
-                };
+                    // Compute the leftover term due to a_offset.
+                    int32x4x4_t a_offset_term_s32 = {
+                        {vld1q_s32(vector_sum_col_ptr + x + 0), vld1q_s32(vector_sum_col_ptr + x + 4),
+                         vld1q_s32(vector_sum_col_ptr + x + 8), vld1q_s32(vector_sum_col_ptr + x + 12)}};
 
-                a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
-                a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
-                a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
-                a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
+                    a_offset_term_s32.val[0] = vmulq_n_s32(a_offset_term_s32.val[0], a_offset);
+                    a_offset_term_s32.val[1] = vmulq_n_s32(a_offset_term_s32.val[1], a_offset);
+                    a_offset_term_s32.val[2] = vmulq_n_s32(a_offset_term_s32.val[2], a_offset);
+                    a_offset_term_s32.val[3] = vmulq_n_s32(a_offset_term_s32.val[3], a_offset);
 
-                int32x4x4_t in_s32 =
+                    int32x4x4_t in_s32 = {{vld1q_s32(mm_result_ptr + x + 0), vld1q_s32(mm_result_ptr + x + 4),
+                                           vld1q_s32(mm_result_ptr + x + 8), vld1q_s32(mm_result_ptr + x + 12)}};
+
+                    // Add the offset terms to GEMM's result
+                    in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
+                    in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
+                    in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
+                    in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
+
+                    // Store the result with the offset contribution
+                    vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
+                    vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
+                    vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
+                    vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
+                }
+
+                // Left-overs loop
+                for (; x < window_end_x; ++x)
                 {
-                    {
-                        vld1q_s32(mm_result_ptr + x + 0),
-                        vld1q_s32(mm_result_ptr + x + 4),
-                        vld1q_s32(mm_result_ptr + x + 8),
-                        vld1q_s32(mm_result_ptr + x + 12)
-                    }
-                };
+                    // Compute the leftover term due to a_offset.
+                    const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
 
-                // Add the offset terms to GEMM's result
-                in_s32.val[0] = vaddq_s32(in_s32.val[0], a_offset_term_s32.val[0]);
-                in_s32.val[1] = vaddq_s32(in_s32.val[1], a_offset_term_s32.val[1]);
-                in_s32.val[2] = vaddq_s32(in_s32.val[2], a_offset_term_s32.val[2]);
-                in_s32.val[3] = vaddq_s32(in_s32.val[3], a_offset_term_s32.val[3]);
-
-                // Store the result with the offset contribution
-                vst1q_s32(mm_result_ptr + x + 0, in_s32.val[0]);
-                vst1q_s32(mm_result_ptr + x + 4, in_s32.val[1]);
-                vst1q_s32(mm_result_ptr + x + 8, in_s32.val[2]);
-                vst1q_s32(mm_result_ptr + x + 12, in_s32.val[3]);
-            }
-
-            // Left-overs loop
-            for(; x < window_end_x; ++x)
-            {
-                // Compute the leftover term due to a_offset.
-                const int32_t a_offset_term_s32 = *(vector_sum_col_ptr + x);
-
-                // Add the offset terms to GEMM's result
-                // Store the result with the offset contribution
-                mm_result_ptr[x] += a_offset_term_s32 * a_offset;
-            }
-        },
-        vector_sum_col_it, mm_result_it);
+                    // Add the offset terms to GEMM's result
+                    // Store the result with the offset contribution
+                    mm_result_ptr[x] += a_offset_term_s32 * a_offset;
+                }
+            },
+            vector_sum_col_it, mm_result_it);
     }
     else // false, false
     {
@@ -362,7 +356,12 @@
 }
 } // namespace
 
-void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset)
+void CpuGemmLowpOffsetContributionKernel::configure(ITensorInfo *mm_result,
+                                                    ITensorInfo *vector_sum_col,
+                                                    ITensorInfo *vector_sum_row,
+                                                    int32_t      k,
+                                                    int32_t      a_offset,
+                                                    int32_t      b_offset)
 {
     // Perform validate step
     ARM_COMPUTE_UNUSED(vector_sum_row);
@@ -374,7 +373,7 @@
     _k_offset = a_offset * b_offset * k;
 
     // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
+    if (a_offset != 0)
     {
         // Check if vector_sum_col_shape should be slid or not
         // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just 1 dimension and vector_sum_row_shape more than 1
@@ -387,8 +386,11 @@
     ICpuKernel::configure(win);
 }
 
-Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row,
-                                                     int32_t a_offset, int32_t b_offset)
+Status CpuGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result,
+                                                     const ITensorInfo *vector_sum_col,
+                                                     const ITensorInfo *vector_sum_row,
+                                                     int32_t            a_offset,
+                                                     int32_t            b_offset)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, a_offset, b_offset));
     return Status{};
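
At execution time the kernel reads its operands from an ITensorPack, as the run_op() hunk below shows for TensorType::ACL_DST. A minimal dispatch sketch follows; the pack IDs for the two sum vectors are not visible in this hunk, so ACL_SRC_0/ACL_SRC_1 are assumptions, and includes beyond the pack header are elided:

    #include "arm_compute/core/ITensorPack.h"
    // Kernel, Window and ThreadInfo headers elided for brevity.

    // Sketch only: bind the operands and run the kernel over a window.
    void run_offset_contribution_sketch(arm_compute::cpu::kernels::CpuGemmLowpOffsetContributionKernel &kernel,
                                        arm_compute::ITensor          *mm_result,
                                        const arm_compute::ITensor    *vector_sum_col,
                                        const arm_compute::ITensor    *vector_sum_row,
                                        const arm_compute::Window     &window,
                                        const arm_compute::ThreadInfo &info)
    {
        arm_compute::ITensorPack pack;
        pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_0, vector_sum_col); // assumed ID
        pack.add_const_tensor(arm_compute::TensorType::ACL_SRC_1, vector_sum_row); // assumed ID
        pack.add_tensor(arm_compute::TensorType::ACL_DST, mm_result);              // matches the hunk below
        kernel.run_op(pack, window, info);
    }
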
@@ -405,11 +407,11 @@
     auto mm_result      = tensors.get_tensor(TensorType::ACL_DST);
 
     // Check if input is a 3D reinterpretation
-    const bool reinterpret_as_3d = vector_sum_row != nullptr
-                                   && mm_result->info()->num_dimensions() > 1
-                                   && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
+    const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 &&
+                                   mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
 
-    run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset, _slide_vector_sum_col, reinterpret_as_3d);
+    run_offset_contribution(window, mm_result, vector_sum_col, vector_sum_row, _a_offset, _b_offset, _k_offset,
+                            _slide_vector_sum_col, reinterpret_as_3d);
 }
 
 const char *CpuGemmLowpOffsetContributionKernel::name() const
@@ -418,4 +420,4 @@
 }
 } // namespace kernels
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
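
Independent of the reformatting above, the correction this kernel applies to each int32 GEMM accumulator at position (x, y) is a_offset * vector_sum_col[x] + b_offset * vector_sum_row[y] + a_offset * b_offset * k, the last term being the _k_offset computed in configure(). A minimal scalar sketch, assuming flat row-major buffers and ignoring the batching, windowing and 3D reinterpretation handled above:

    #include <cstdint>
    #include <vector>

    // Scalar model of the offset contribution (illustrative only).
    void offset_contribution_ref(std::vector<int32_t>       &mm_result,      // width * height accumulators
                                 const std::vector<int32_t> &vector_sum_col, // one sum per x
                                 const std::vector<int32_t> &vector_sum_row, // one sum per y
                                 int32_t a_offset, int32_t b_offset, int32_t k, int32_t width, int32_t height)
    {
        const int32_t k_offset = a_offset * b_offset * k; // matches _k_offset above
        for (int32_t y = 0; y < height; ++y)
        {
            const int32_t b_term = b_offset * vector_sum_row[y];
            for (int32_t x = 0; x < width; ++x)
            {
                mm_result[y * width + x] += k_offset + a_offset * vector_sum_col[x] + b_term;
            }
        }
    }

For example, with a_offset = -3, b_offset = 5 and k = 32 the constant term is -480, and the per-element update reduces to the expression in the left-overs loops above.
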
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
index 3514ca8..08b2d47 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h
@@ -63,24 +63,33 @@
      * @param[in]      a_offset       Offset to be added to each element of the matrix A.
      * @param[in]      b_offset       Offset to be added to each element of the matrix B.
      */
-    void configure(ITensorInfo *mm_result, ITensorInfo *vector_sum_col, ITensorInfo *vector_sum_row, int32_t k, int32_t a_offset, int32_t b_offset);
+    void configure(ITensorInfo *mm_result,
+                   ITensorInfo *vector_sum_col,
+                   ITensorInfo *vector_sum_row,
+                   int32_t      k,
+                   int32_t      a_offset,
+                   int32_t      b_offset);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuGemmLowpOffsetContributionKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, int32_t a_offset, int32_t b_offset);
+    static Status validate(const ITensorInfo *mm_result,
+                           const ITensorInfo *vector_sum_col,
+                           const ITensorInfo *vector_sum_row,
+                           int32_t            a_offset,
+                           int32_t            b_offset);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
-    int32_t _a_offset{ 0 };
-    int32_t _b_offset{ 0 };
-    int32_t _k_offset{ 0 };
-    bool    _slide_vector_sum_col{ true };
+    int32_t _a_offset{0};
+    int32_t _b_offset{0};
+    int32_t _k_offset{0};
+    bool    _slide_vector_sum_col{true};
 };
 } // namespace kernels
 } // namespace cpu
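
As a configuration sketch against the interface declared above (shapes, offset values and include paths are assumptions for illustration; real call sites live in the operator layer):

    #include "arm_compute/core/TensorInfo.h"
    #include "src/cpu/kernels/CpuGemmLowpOffsetContributionKernel.h"

    void configure_offset_contribution_sketch()
    {
        using namespace arm_compute;

        // Hypothetical shapes: 16x8 S32 accumulators plus the two reduction vectors.
        TensorInfo mm_result(TensorShape(16U, 8U), 1, DataType::S32);
        TensorInfo vector_sum_col(TensorShape(16U), 1, DataType::S32);
        TensorInfo vector_sum_row(TensorShape(8U), 1, DataType::S32);

        // validate() mirrors configure() minus k, per the declarations above.
        const Status s = cpu::kernels::CpuGemmLowpOffsetContributionKernel::validate(
            &mm_result, &vector_sum_col, &vector_sum_row, /*a_offset=*/-3, /*b_offset=*/5);
        if (s.error_code() == ErrorCode::OK)
        {
            cpu::kernels::CpuGemmLowpOffsetContributionKernel kernel;
            kernel.configure(&mm_result, &vector_sum_col, &vector_sum_row, /*k=*/32, /*a_offset=*/-3, /*b_offset=*/5);
        }
    }
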
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp
index 190487e..d008842 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.cpp
@@ -31,10 +31,11 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "src/core/NEON/NEAsymm.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 
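
The finalize_quantization_floating_point helpers reformatted below clamp the shifted accumulator at zero, narrow it with saturation (S32 -> S16 -> U8 or S8) and optionally apply a bounded-ReLU window. A scalar model of the U8 path; the shift stage is elided between hunks, so the arithmetic right shift here is an assumption:

    #include <algorithm>
    #include <cstdint>

    // Illustrative scalar equivalent of the U8 finalize path.
    uint8_t finalize_u8_ref(int32_t v, int32_t shift, uint8_t min_u8, uint8_t max_u8, bool is_bounded_relu)
    {
        v >>= shift;                 // result_shift_s32 stage (assumed right shift)
        v = std::max<int32_t>(v, 0); // vmaxq_s32(in_s32, zero_s32)
        // vqmovn_s32 then vqmovun_s16: saturating narrow S32 -> S16 -> U8.
        uint8_t out = static_cast<uint8_t>(std::min<int32_t>(v, 255));
        if (is_bounded_relu) // optional clamp, as in the helpers below
        {
            out = std::max(out, min_u8);
            out = std::min(out, max_u8);
        }
        return out;
    }
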
@@ -48,80 +49,38 @@
 {
 inline int32x4x4_t load_results_input(const Iterator &mm_result_it, int32_t x)
 {
-    return
-    {
-        {
-            vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0),
-            vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4),
-            vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8),
-            vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12)
-        }
-    };
+    return {{vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 0),
+             vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 4),
+             vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 8),
+             vld1q_s32(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x + 12)}};
 }
 
 inline int32x4x4_t load(const int32_t *ptr, int32_t x)
 {
-    return
-    {
-        {
-            vld1q_s32(ptr + x + 0),
-            vld1q_s32(ptr + x + 4),
-            vld1q_s32(ptr + x + 8),
-            vld1q_s32(ptr + x + 12)
-        }
-    };
+    return {{vld1q_s32(ptr + x + 0), vld1q_s32(ptr + x + 4), vld1q_s32(ptr + x + 8), vld1q_s32(ptr + x + 12)}};
 }
 
 inline int32x4x4_t add_s32(int32x4x4_t a, int32x4_t b)
 {
-    return
-    {
-        {
-            vaddq_s32(a.val[0], b),
-            vaddq_s32(a.val[1], b),
-            vaddq_s32(a.val[2], b),
-            vaddq_s32(a.val[3], b)
-        }
-    };
+    return {{vaddq_s32(a.val[0], b), vaddq_s32(a.val[1], b), vaddq_s32(a.val[2], b), vaddq_s32(a.val[3], b)}};
 }
 
 inline int32x4x4_t add_s32(int32x4x4_t a, int32x4x4_t b)
 {
-    return
-    {
-        {
-            vaddq_s32(a.val[0], b.val[0]),
-            vaddq_s32(a.val[1], b.val[1]),
-            vaddq_s32(a.val[2], b.val[2]),
-            vaddq_s32(a.val[3], b.val[3])
-        }
-    };
+    return {{vaddq_s32(a.val[0], b.val[0]), vaddq_s32(a.val[1], b.val[1]), vaddq_s32(a.val[2], b.val[2]),
+             vaddq_s32(a.val[3], b.val[3])}};
 }
 
 inline int32x4x4_t mul_s32(int32x4x4_t &a, int32_t mul_scalar)
 {
-    return
-    {
-        {
-            vmulq_n_s32(a.val[0], mul_scalar),
-            vmulq_n_s32(a.val[1], mul_scalar),
-            vmulq_n_s32(a.val[2], mul_scalar),
-            vmulq_n_s32(a.val[3], mul_scalar)
-        }
-    };
+    return {{vmulq_n_s32(a.val[0], mul_scalar), vmulq_n_s32(a.val[1], mul_scalar), vmulq_n_s32(a.val[2], mul_scalar),
+             vmulq_n_s32(a.val[3], mul_scalar)}};
 }
 
 inline int32x4x4_t mul_s32(int32x4x4_t &a, const int32_t *multiplier)
 {
-    return
-    {
-        {
-            vmulq_s32(a.val[0], vld1q_s32(multiplier)),
-            vmulq_s32(a.val[1], vld1q_s32(multiplier + 4)),
-            vmulq_s32(a.val[2], vld1q_s32(multiplier + 8)),
-            vmulq_s32(a.val[3], vld1q_s32(multiplier + 12))
-        }
-    };
+    return {{vmulq_s32(a.val[0], vld1q_s32(multiplier)), vmulq_s32(a.val[1], vld1q_s32(multiplier + 4)),
+             vmulq_s32(a.val[2], vld1q_s32(multiplier + 8)), vmulq_s32(a.val[3], vld1q_s32(multiplier + 12))}};
 }
 
 inline int32x4x4_t get_a_offset(const int32_t *vector_sum_col_ptr, int32_t a_offset, int32_t x)
@@ -144,18 +103,11 @@
 
 inline int32x4x4_t get_k_offset(int32_t k_offset)
 {
-    return
-    {
-        {
-            vdupq_n_s32(k_offset),
-            vdupq_n_s32(k_offset),
-            vdupq_n_s32(k_offset),
-            vdupq_n_s32(k_offset)
-        }
-    };
+    return {{vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset), vdupq_n_s32(k_offset)}};
 }
 
-inline uint8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
+inline uint8x16_t finalize_quantization_floating_point(
+    int32x4x4_t &in_s32, int32x4_t result_shift_s32, uint8x16_t min_u8, uint8x16_t max_u8, bool is_bounded_relu)
 {
     const static int32x4_t zero_s32 = vdupq_n_s32(0);
 
@@ -172,18 +124,13 @@
     in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
 
     // Convert S32 to S16
-    const int16x8x2_t in_s16 =
-    {
-        {
-            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
-            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
-        }
-    };
+    const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+                                 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
 
     // Convert S16 to U8
     uint8x16_t out_u8 = vcombine_u8(vqmovun_s16(in_s16.val[0]), vqmovun_s16(in_s16.val[1]));
 
-    if(is_bounded_relu)
+    if (is_bounded_relu)
     {
         out_u8 = vmaxq_u8(out_u8, min_u8);
         out_u8 = vminq_u8(out_u8, max_u8);
@@ -192,7 +139,8 @@
     return out_u8;
 }
 
-inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
+inline int8x16_t finalize_quantization_floating_point(
+    int32x4x4_t &in_s32, int32x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
 {
     const static int32x4_t zero_s32 = vdupq_n_s32(0);
 
@@ -209,18 +157,13 @@
     in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
 
     // Convert S32 to S16
-    const int16x8x2_t in_s16 =
-    {
-        {
-            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
-            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
-        }
-    };
+    const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+                                 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
 
     // Convert S16 to S8
     int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
 
-    if(is_bounded_relu)
+    if (is_bounded_relu)
     {
         out_s8 = vmaxq_s8(out_s8, min_s8);
         out_s8 = vminq_s8(out_s8, max_s8);
@@ -229,7 +172,8 @@
     return out_s8;
 }
 
-inline int8x16_t finalize_quantization_floating_point(int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
+inline int8x16_t finalize_quantization_floating_point(
+    int32x4x4_t &in_s32, int32x4x4_t result_shift_s32, int8x16_t min_s8, int8x16_t max_s8, bool is_bounded_relu)
 {
     const static int32x4_t zero_s32 = vdupq_n_s32(0);
 
@@ -246,18 +190,13 @@
     in_s32.val[3] = vmaxq_s32(in_s32.val[3], zero_s32);
 
     // Convert S32 to S16
-    const int16x8x2_t in_s16 =
-    {
-        {
-            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
-            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
-        }
-    };
+    const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+                                 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
 
     // Convert S16 to S8
     int8x16_t out_s8 = vcombine_s8(vqmovn_s16(in_s16.val[0]), vqmovn_s16(in_s16.val[1]));
 
-    if(is_bounded_relu)
+    if (is_bounded_relu)
     {
         out_s8 = vmaxq_s8(out_s8, min_s8);
         out_s8 = vminq_s8(out_s8, max_s8);
@@ -305,81 +244,103 @@
 }
 
 template <typename VT>
-inline void run_offset_contribution_output_stage_window(const int32_t *vector_sum_col_ptr, const int32_t *vector_sum_row_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
-                                                        const int32x4_t result_offset_s32, const int32x4_t result_shift_s32,
-                                                        typename VT::vtype min_vec, typename VT::vtype max_vec,
-                                                        int32_t a_offset, int32_t b_offset, int32_t k_offset,
-                                                        int32_t multiplier, int32_t shift, int32_t offset, int32_t min_bound, int32_t max_bound,
-                                                        int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_b_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point)
+inline void run_offset_contribution_output_stage_window(const int32_t     *vector_sum_col_ptr,
+                                                        const int32_t     *vector_sum_row_ptr,
+                                                        const int32_t     *bias_ptr,
+                                                        Iterator           mm_result_it,
+                                                        Iterator           out_it,
+                                                        const int32x4_t    result_offset_s32,
+                                                        const int32x4_t    result_shift_s32,
+                                                        typename VT::vtype min_vec,
+                                                        typename VT::vtype max_vec,
+                                                        int32_t            a_offset,
+                                                        int32_t            b_offset,
+                                                        int32_t            k_offset,
+                                                        int32_t            multiplier,
+                                                        int32_t            shift,
+                                                        int32_t            offset,
+                                                        int32_t            min_bound,
+                                                        int32_t            max_bound,
+                                                        int                window_step_x,
+                                                        int                window_start_x,
+                                                        int                window_end_x,
+                                                        bool               has_a_offset,
+                                                        bool               has_b_offset,
+                                                        bool               has_bias,
+                                                        bool               is_bounded_relu,
+                                                        bool               is_fixed_point)
 {
-    int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
-    if(!is_fixed_point)
+    int32x4x4_t offset_term_s32 = {0, 0, 0, 0};
+    if (!is_fixed_point)
     {
         // Combine quantization offset with other offsets.
         offset_term_s32 = add_s32(offset_term_s32, result_offset_s32);
     }
-    if(has_a_offset && has_b_offset)
+    if (has_a_offset && has_b_offset)
     {
         offset_term_s32 = add_s32(offset_term_s32, get_k_offset(k_offset));
     }
-    if(has_b_offset)
+    if (has_b_offset)
     {
         offset_term_s32 = add_s32(offset_term_s32, get_b_offset(vector_sum_row_ptr, b_offset));
     }
 
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
         int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
 
-        if(has_a_offset)
+        if (has_a_offset)
         {
             in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
         }
-        if(has_bias)
+        if (has_bias)
         {
             in_s32 = add_s32(in_s32, load(bias_ptr, x));
         }
-        if(!is_fixed_point || has_b_offset)
+        if (!is_fixed_point || has_b_offset)
         {
             in_s32 = add_s32(in_s32, offset_term_s32);
         }
-        if(!is_fixed_point)
+        if (!is_fixed_point)
         {
             in_s32 = mul_s32(in_s32, multiplier);
         }
 
-        if(is_fixed_point)
+        if (is_fixed_point)
         {
-            wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
-                            finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu));
+            wrapper::vstore(
+                reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
+                finalize_quantization(in_s32, multiplier, shift, result_offset_s32, min_vec, max_vec, is_bounded_relu));
         }
         else
         {
-            wrapper::vstore(reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
-                            finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu));
+            wrapper::vstore(
+                reinterpret_cast<typename VT::stype *>(out_it.ptr() + x),
+                finalize_quantization_floating_point(in_s32, result_shift_s32, min_vec, max_vec, is_bounded_relu));
         }
     }
     // Compute left-over elements
-    for(; x < window_end_x; ++x)
+    for (; x < window_end_x; ++x)
     {
-        int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
+        int32_t in_value =
+            *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
 
-        if(has_a_offset)
+        if (has_a_offset)
         {
             in_value += (*(vector_sum_col_ptr + x) * a_offset);
         }
-        if(has_bias)
+        if (has_bias)
         {
             in_value += *(bias_ptr + x);
         }
 
-        if(is_fixed_point)
+        if (is_fixed_point)
         {
             // Finalize and store the result
-            *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = finalize_quantization(in_value, multiplier, shift, offset,
-                                                                                              static_cast<typename VT::stype>(min_bound),
-                                                                                              static_cast<typename VT::stype>(max_bound), is_bounded_relu);
+            *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) =
+                finalize_quantization(in_value, multiplier, shift, offset, static_cast<typename VT::stype>(min_bound),
+                                      static_cast<typename VT::stype>(max_bound), is_bounded_relu);
         }
         else
         {
@@ -387,75 +348,100 @@
             in_value = (in_value * multiplier) >> shift;
 
             // Bound and store the result
-            if(is_bounded_relu)
+            if (is_bounded_relu)
             {
-                in_value = static_cast<typename VT::stype>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
+                in_value = static_cast<typename VT::stype>(
+                    std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
             }
-            *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) = static_cast<typename VT::stype>(std::max<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()),
-                                                                                                                          std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value)));
+            *reinterpret_cast<typename VT::stype *>(out_it.ptr() + x) =
+                static_cast<typename VT::stype>(std::max<int32_t>(
+                    static_cast<int32_t>(std::numeric_limits<typename VT::stype>::lowest()),
+                    std::min<int32_t>(static_cast<int32_t>(std::numeric_limits<typename VT::stype>::max()), in_value)));
         }
     }
 }
 
-inline void run_offset_contribution_output_stage_window_symm(const int32_t *vector_sum_col_ptr, const int32_t *bias_ptr, Iterator mm_result_it, Iterator out_it,
-                                                             const int32_t *result_multipliers, const int32_t *result_shifts,
-                                                             const int32x4_t result_offset, int8x16_t min_s8, int8x16_t max_s8,
-                                                             int32_t a_offset, int32_t offset, int32_t min_bound, int32_t max_bound,
-                                                             int window_step_x, int window_start_x, int window_end_x, bool has_a_offset, bool has_bias, bool is_bounded_relu, bool is_fixed_point)
+inline void run_offset_contribution_output_stage_window_symm(const int32_t  *vector_sum_col_ptr,
+                                                             const int32_t  *bias_ptr,
+                                                             Iterator        mm_result_it,
+                                                             Iterator        out_it,
+                                                             const int32_t  *result_multipliers,
+                                                             const int32_t  *result_shifts,
+                                                             const int32x4_t result_offset,
+                                                             int8x16_t       min_s8,
+                                                             int8x16_t       max_s8,
+                                                             int32_t         a_offset,
+                                                             int32_t         offset,
+                                                             int32_t         min_bound,
+                                                             int32_t         max_bound,
+                                                             int             window_step_x,
+                                                             int             window_start_x,
+                                                             int             window_end_x,
+                                                             bool            has_a_offset,
+                                                             bool            has_bias,
+                                                             bool            is_bounded_relu,
+                                                             bool            is_fixed_point)
 {
-    int32x4x4_t offset_term_s32 = { 0, 0, 0, 0 };
-    if(!is_fixed_point)
+    int32x4x4_t offset_term_s32 = {0, 0, 0, 0};
+    if (!is_fixed_point)
     {
         // Combine quantization offset with other offsets.
         offset_term_s32 = add_s32(offset_term_s32, result_offset);
     }
 
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
         int32x4x4_t in_s32 = load_results_input(mm_result_it, x);
 
-        if(has_a_offset)
+        if (has_a_offset)
         {
             in_s32 = add_s32(in_s32, get_a_offset(vector_sum_col_ptr, a_offset, x));
         }
-        if(has_bias)
+        if (has_bias)
         {
             in_s32 = add_s32(in_s32, load(bias_ptr, x));
         }
-        if(!is_fixed_point)
+        if (!is_fixed_point)
         {
             in_s32 = add_s32(in_s32, offset_term_s32);
             in_s32 = mul_s32(in_s32, result_multipliers + x);
         }
 
-        if(is_fixed_point)
+        if (is_fixed_point)
         {
-            vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x), result_offset, min_s8, max_s8, is_bounded_relu));
+            vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x),
+                     finalize_quantization_symm(in_s32, load(result_multipliers, x), load(result_shifts, x),
+                                                result_offset, min_s8, max_s8, is_bounded_relu));
         }
         else
         {
-            vst1q_s8(reinterpret_cast<int8_t *>(out_it.ptr() + x), finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu));
+            vst1q_s8(
+                reinterpret_cast<int8_t *>(out_it.ptr() + x),
+                finalize_quantization_floating_point(in_s32, load(result_shifts, x), min_s8, max_s8, is_bounded_relu));
         }
     }
     // Compute left-over elements
-    for(; x < window_end_x; ++x)
+    for (; x < window_end_x; ++x)
     {
-        int32_t in_value = *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
+        int32_t in_value =
+            *(reinterpret_cast<const int32_t *>(mm_result_it.ptr()) + x) + wrapper::vgetlane(offset_term_s32.val[0], 0);
 
-        if(has_a_offset)
+        if (has_a_offset)
         {
             in_value += (*(vector_sum_col_ptr + x) * a_offset);
         }
-        if(has_bias)
+        if (has_bias)
         {
             in_value += *(bias_ptr + x);
         }
 
-        if(is_fixed_point)
+        if (is_fixed_point)
         {
             // Finalize and store the result
-            *(out_it.ptr() + x) = finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset, static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu);
+            *(out_it.ptr() + x) =
+                finalize_quantization(in_value, result_multipliers[x], result_shifts[x], offset,
+                                      static_cast<int8_t>(min_bound), static_cast<int8_t>(max_bound), is_bounded_relu);
         }
         else
         {
@@ -463,7 +449,7 @@
             in_value = (in_value * result_multipliers[x]) >> (-result_shifts[x]);
 
             // Bound and store the result
-            if(is_bounded_relu)
+            if (is_bounded_relu)
             {
                 in_value = static_cast<int8_t>(std::max<int32_t>(min_bound, std::min<int32_t>(max_bound, in_value)));
             }
@@ -473,10 +459,20 @@
 }
 
 template <typename T>
-void run_offset_contribution_output_stage(const Window &window,
-                                          const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
-                                          int32_t a_offset, int32_t b_offset, int32_t k_offset, bool is_vector_sum_col_batched,
-                                          GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point)
+void run_offset_contribution_output_stage(const Window           &window,
+                                          const ITensor          *mm_result,
+                                          const ITensor          *vector_sum_col,
+                                          const ITensor          *vector_sum_row,
+                                          const ITensor          *bias,
+                                          ITensor                *output,
+                                          int32_t                 a_offset,
+                                          int32_t                 b_offset,
+                                          int32_t                 k_offset,
+                                          bool                    is_vector_sum_col_batched,
+                                          GEMMLowpOutputStageInfo output_stage,
+                                          bool                    is_gemm3d,
+                                          bool                    is_bounded_relu,
+                                          bool                    is_fixed_point)
 {
     //  Semantics of XYZW Explained for each tensor
     //
@@ -516,7 +512,7 @@
     Iterator mm_result_it(mm_result, win);
     Iterator out_it(output, win);
 
-    if((a_offset != 0) && (b_offset != 0))
+    if ((a_offset != 0) && (b_offset != 0))
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
         ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
@@ -527,45 +523,52 @@
         const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
 
         // Offset in case vector_sum_col is batched in y dimension
-        const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
+        const int vector_sum_col_stride_batch =
+            is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
 
-        if(bias != nullptr)
+        if (bias != nullptr)
         {
             Iterator bias_it = get_bias_it(collapsed_window, bias);
-            execute_window_loop(collapsed_window, [&](const Coordinates & id)
-            {
-                const int  batch_id           = id.z() / depth_input;
-                const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
-                const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
-                                                + id.y() + (id.z() % depth_input) * height_input;
-                run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()),
-                                                                   mm_result_it,
-                                                                   out_it,
-                                                                   result_offset_s32, result_shift_s32,
-                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                   multiplier, shift, offset, min_bound, max_bound,
-                                                                   window_step_x, window_start_x, window_end_x, true, true, true, is_bounded_relu, is_fixed_point);
-            },
-            vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
+            execute_window_loop(
+                collapsed_window,
+                [&](const Coordinates &id)
+                {
+                    const int  batch_id           = id.z() / depth_input;
+                    const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+                        vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+                    const auto vector_sum_row_ptr =
+                        reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+                        id.y() + (id.z() % depth_input) * height_input;
+                    run_offset_contribution_output_stage_window<Typer>(
+                        vector_sum_col_ptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()),
+                        mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset,
+                        k_offset, multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x,
+                        window_end_x, true, true, true, is_bounded_relu, is_fixed_point);
+                },
+                vector_sum_col_it, vector_sum_row_it, bias_it, mm_result_it, out_it);
         }
         else
         {
-            execute_window_loop(collapsed_window, [&](const Coordinates & id)
-            {
-                const int  batch_id           = id.z() / depth_input;
-                const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
-                const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
-                                                + id.y() + (id.z() % depth_input) * height_input;
-                run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
-                                                                   result_offset_s32, result_shift_s32,
-                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                   multiplier, shift, offset, min_bound, max_bound,
-                                                                   window_step_x, window_start_x, window_end_x, true, true, false, is_bounded_relu, is_fixed_point);
-            },
-            vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
+            execute_window_loop(
+                collapsed_window,
+                [&](const Coordinates &id)
+                {
+                    const int  batch_id           = id.z() / depth_input;
+                    const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+                        vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+                    const auto vector_sum_row_ptr =
+                        reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+                        id.y() + (id.z() % depth_input) * height_input;
+                    run_offset_contribution_output_stage_window<Typer>(
+                        vector_sum_col_ptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32,
+                        result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset,
+                        min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, false,
+                        is_bounded_relu, is_fixed_point);
+                },
+                vector_sum_col_it, vector_sum_row_it, mm_result_it, out_it);
         }
     }
-    else if((a_offset == 0) && (b_offset != 0))
+    else if ((a_offset == 0) && (b_offset != 0))
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_row);
 
@@ -573,114 +576,139 @@
 
         const size_t sum_row_stride_y = vector_sum_row->info()->strides_in_bytes().y();
 
-        if(bias != nullptr)
+        if (bias != nullptr)
         {
             Iterator bias_it = get_bias_it(collapsed_window, bias);
-            execute_window_loop(collapsed_window, [&](const Coordinates & id)
-            {
-                const int  batch_id           = id.z() / depth_input;
-                const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
-                                                + id.y() + (id.z() % depth_input) * height_input;
-                run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
-                                                                   out_it,
-                                                                   result_offset_s32, result_shift_s32,
-                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                   multiplier, shift, offset, min_bound, max_bound,
-                                                                   window_step_x, window_start_x, window_end_x, false, true, true, is_bounded_relu, is_fixed_point);
-            },
-            vector_sum_row_it, bias_it, mm_result_it, out_it);
+            execute_window_loop(
+                collapsed_window,
+                [&](const Coordinates &id)
+                {
+                    const int  batch_id = id.z() / depth_input;
+                    const auto vector_sum_row_ptr =
+                        reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+                        id.y() + (id.z() % depth_input) * height_input;
+                    run_offset_contribution_output_stage_window<Typer>(
+                        nullptr, vector_sum_row_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+                        out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset,
+                        multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x,
+                        false, true, true, is_bounded_relu, is_fixed_point);
+                },
+                vector_sum_row_it, bias_it, mm_result_it, out_it);
         }
         else
         {
-            execute_window_loop(collapsed_window, [&](const Coordinates & id)
-            {
-                const int  batch_id           = id.z() / depth_input;
-                const auto vector_sum_row_ptr = reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y)
-                                                + id.y() + (id.z() % depth_input) * height_input;
-                run_offset_contribution_output_stage_window<Typer>(nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it,
-                                                                   result_offset_s32, result_shift_s32,
-                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                   multiplier, shift, offset, min_bound, max_bound,
-                                                                   window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu, is_fixed_point);
-            },
-            vector_sum_row_it, mm_result_it, out_it);
+            execute_window_loop(
+                collapsed_window,
+                [&](const Coordinates &id)
+                {
+                    const int  batch_id = id.z() / depth_input;
+                    const auto vector_sum_row_ptr =
+                        reinterpret_cast<const int32_t *>(vector_sum_row_it.ptr() + batch_id * sum_row_stride_y) +
+                        id.y() + (id.z() % depth_input) * height_input;
+                    run_offset_contribution_output_stage_window<Typer>(
+                        nullptr, vector_sum_row_ptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32,
+                        min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound,
+                        window_step_x, window_start_x, window_end_x, false, true, false, is_bounded_relu,
+                        is_fixed_point);
+                },
+                vector_sum_row_it, mm_result_it, out_it);
         }
     }
-    else if((a_offset != 0) && (b_offset == 0))
+    else if ((a_offset != 0) && (b_offset == 0))
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
 
         Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
 
         // Offset in case vector_sum_col is batched in y dimension
-        const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
+        const int vector_sum_col_stride_batch =
+            is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
 
-        if(bias != nullptr)
+        if (bias != nullptr)
         {
             Iterator bias_it = get_bias_it(collapsed_window, bias);
-            execute_window_loop(collapsed_window, [&](const Coordinates & id)
-            {
-                const int  batch_id           = id.z() / depth_input;
-                const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
-                run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
-                                                                   out_it,
-                                                                   result_offset_s32, result_shift_s32,
-                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                   multiplier, shift, offset, min_bound, max_bound,
-                                                                   window_step_x, window_start_x, window_end_x, true, false, true, is_bounded_relu, is_fixed_point);
-            },
-            vector_sum_col_it, bias_it, mm_result_it, out_it);
+            execute_window_loop(
+                collapsed_window,
+                [&](const Coordinates &id)
+                {
+                    const int  batch_id           = id.z() / depth_input;
+                    const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+                        vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+                    run_offset_contribution_output_stage_window<Typer>(
+                        vector_sum_col_ptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it,
+                        out_it, result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset,
+                        multiplier, shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x,
+                        true, false, true, is_bounded_relu, is_fixed_point);
+                },
+                vector_sum_col_it, bias_it, mm_result_it, out_it);
         }
         else
         {
-            execute_window_loop(collapsed_window, [&](const Coordinates & id)
-            {
-                const int  batch_id           = id.z() / depth_input;
-                const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
-                run_offset_contribution_output_stage_window<Typer>(vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it,
-                                                                   result_offset_s32, result_shift_s32,
-                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                   multiplier, shift, offset, min_bound, max_bound,
-                                                                   window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu, is_fixed_point);
-            },
-            vector_sum_col_it, mm_result_it, out_it);
+            execute_window_loop(
+                collapsed_window,
+                [&](const Coordinates &id)
+                {
+                    const int  batch_id           = id.z() / depth_input;
+                    const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+                        vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+                    run_offset_contribution_output_stage_window<Typer>(
+                        vector_sum_col_ptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32,
+                        min_vec, max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound,
+                        window_step_x, window_start_x, window_end_x, true, false, false, is_bounded_relu,
+                        is_fixed_point);
+                },
+                vector_sum_col_it, mm_result_it, out_it);
         }
     }
     else
     {
-        if(bias != nullptr)
+        if (bias != nullptr)
         {
             Iterator bias_it = get_bias_it(collapsed_window, bias);
-            execute_window_loop(collapsed_window, [&](const Coordinates &)
-            {
-                run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
-                                                                   result_offset_s32, result_shift_s32,
-                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                   multiplier, shift, offset, min_bound, max_bound,
-                                                                   window_step_x, window_start_x, window_end_x, false, false, true, is_bounded_relu, is_fixed_point);
-            },
-            bias_it, mm_result_it, out_it);
+            execute_window_loop(
+                collapsed_window,
+                [&](const Coordinates &)
+                {
+                    run_offset_contribution_output_stage_window<Typer>(
+                        nullptr, nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+                        result_offset_s32, result_shift_s32, min_vec, max_vec, a_offset, b_offset, k_offset, multiplier,
+                        shift, offset, min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, false,
+                        true, is_bounded_relu, is_fixed_point);
+                },
+                bias_it, mm_result_it, out_it);
         }
         else
         {
-            execute_window_loop(collapsed_window, [&](const Coordinates &)
-            {
-                run_offset_contribution_output_stage_window<Typer>(nullptr, nullptr, nullptr, mm_result_it, out_it,
-                                                                   result_offset_s32, result_shift_s32,
-                                                                   min_vec, max_vec, a_offset, b_offset, k_offset,
-                                                                   multiplier, shift, offset, min_bound, max_bound,
-                                                                   window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu, is_fixed_point);
-            },
-            mm_result_it, out_it);
+            execute_window_loop(
+                collapsed_window,
+                [&](const Coordinates &)
+                {
+                    run_offset_contribution_output_stage_window<Typer>(
+                        nullptr, nullptr, nullptr, mm_result_it, out_it, result_offset_s32, result_shift_s32, min_vec,
+                        max_vec, a_offset, b_offset, k_offset, multiplier, shift, offset, min_bound, max_bound,
+                        window_step_x, window_start_x, window_end_x, false, false, false, is_bounded_relu,
+                        is_fixed_point);
+                },
+                mm_result_it, out_it);
         }
         return;
     }
 }
 
-void run_offset_contribution_output_stage_symm(const Window &window,
-                                               const ITensor *mm_result, const ITensor *vector_sum_col, const ITensor *vector_sum_row, const ITensor *bias, ITensor *output,
-                                               int32_t a_offset, int32_t b_offset, int32_t k_offset, bool is_vector_sum_col_batched,
-                                               GEMMLowpOutputStageInfo output_stage, bool is_gemm3d, bool is_bounded_relu, bool is_fixed_point)
+void run_offset_contribution_output_stage_symm(const Window           &window,
+                                               const ITensor          *mm_result,
+                                               const ITensor          *vector_sum_col,
+                                               const ITensor          *vector_sum_row,
+                                               const ITensor          *bias,
+                                               ITensor                *output,
+                                               int32_t                 a_offset,
+                                               int32_t                 b_offset,
+                                               int32_t                 k_offset,
+                                               bool                    is_vector_sum_col_batched,
+                                               GEMMLowpOutputStageInfo output_stage,
+                                               bool                    is_gemm3d,
+                                               bool                    is_bounded_relu,
+                                               bool                    is_fixed_point)
 {
     ARM_COMPUTE_UNUSED(vector_sum_row, b_offset, k_offset);
 
@@ -690,8 +718,8 @@
     const int32_t min_bound = output_stage.gemmlowp_min_bound;
     const int32_t max_bound = output_stage.gemmlowp_max_bound;
 
-    const int32_t *result_multipliers = output_stage.gemmlowp_multipliers.data();
-    const int32_t *result_shifts      = output_stage.gemmlowp_shifts.data();
+    const int32_t  *result_multipliers = output_stage.gemmlowp_multipliers.data();
+    const int32_t  *result_shifts      = output_stage.gemmlowp_shifts.data();
     const int32x4_t result_offset_s32  = vdupq_n_s32(offset);
     const int8x16_t min_s8             = vdupq_n_s8(static_cast<int8_t>(min_bound));
     const int8x16_t max_s8             = vdupq_n_s8(static_cast<int8_t>(max_bound));
@@ -708,88 +736,105 @@
     Iterator mm_result_it(mm_result, win);
     Iterator out_it(output, win);
 
-    if(a_offset != 0)
+    if (a_offset != 0)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(vector_sum_col);
 
         Iterator vector_sum_col_it = get_vector_sum_col_it(collapsed_window, vector_sum_col);
 
         // Offset in case vector_sum_col is batched in y dimension
-        const int vector_sum_col_stride_batch = is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
+        const int vector_sum_col_stride_batch =
+            is_vector_sum_col_batched ? vector_sum_col->info()->strides_in_bytes().y() : 0;
 
-        if(bias != nullptr)
+        if (bias != nullptr)
         {
             Iterator bias_it = get_bias_it(collapsed_window, bias);
-            execute_window_loop(collapsed_window, [&](const Coordinates & id)
-            {
-                const int  batch_id           = id.z() / depth_input;
-                const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
-                run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
-                                                                 result_multipliers, result_shifts,
-                                                                 result_offset_s32, min_s8, max_s8,
-                                                                 a_offset, offset, min_bound, max_bound,
-                                                                 window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu, is_fixed_point);
-            },
-            vector_sum_col_it, bias_it, mm_result_it, out_it);
+            execute_window_loop(
+                collapsed_window,
+                [&](const Coordinates &id)
+                {
+                    const int  batch_id           = id.z() / depth_input;
+                    const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+                        vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+                    run_offset_contribution_output_stage_window_symm(
+                        vector_sum_col_ptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+                        result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset,
+                        min_bound, max_bound, window_step_x, window_start_x, window_end_x, true, true, is_bounded_relu,
+                        is_fixed_point);
+                },
+                vector_sum_col_it, bias_it, mm_result_it, out_it);
         }
         else
         {
-            execute_window_loop(collapsed_window, [&](const Coordinates & id)
-            {
-                const int  batch_id           = id.z() / depth_input;
-                const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
-                run_offset_contribution_output_stage_window_symm(vector_sum_col_ptr, nullptr, mm_result_it, out_it,
-                                                                 result_multipliers, result_shifts,
-                                                                 result_offset_s32, min_s8, max_s8,
-                                                                 a_offset, offset, min_bound, max_bound,
-                                                                 window_step_x, window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point);
-            },
-            vector_sum_col_it, mm_result_it, out_it);
+            execute_window_loop(
+                collapsed_window,
+                [&](const Coordinates &id)
+                {
+                    const int  batch_id           = id.z() / depth_input;
+                    const auto vector_sum_col_ptr = reinterpret_cast<const int32_t *>(
+                        vector_sum_col_it.ptr() + batch_id * vector_sum_col_stride_batch);
+                    run_offset_contribution_output_stage_window_symm(
+                        vector_sum_col_ptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts,
+                        result_offset_s32, min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x,
+                        window_start_x, window_end_x, true, false, is_bounded_relu, is_fixed_point);
+                },
+                vector_sum_col_it, mm_result_it, out_it);
         }
     }
     else
     {
-        if(bias != nullptr)
+        if (bias != nullptr)
         {
             Iterator bias_it = get_bias_it(collapsed_window, bias);
-            execute_window_loop(collapsed_window, [&](const Coordinates &)
-            {
-                run_offset_contribution_output_stage_window_symm(nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
-                                                                 result_multipliers, result_shifts,
-                                                                 result_offset_s32, min_s8, max_s8,
-                                                                 a_offset, offset, min_bound, max_bound,
-                                                                 window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu, is_fixed_point);
-            },
-            bias_it, mm_result_it, out_it);
+            execute_window_loop(
+                collapsed_window,
+                [&](const Coordinates &)
+                {
+                    run_offset_contribution_output_stage_window_symm(
+                        nullptr, reinterpret_cast<const int32_t *>(bias_it.ptr()), mm_result_it, out_it,
+                        result_multipliers, result_shifts, result_offset_s32, min_s8, max_s8, a_offset, offset,
+                        min_bound, max_bound, window_step_x, window_start_x, window_end_x, false, true, is_bounded_relu,
+                        is_fixed_point);
+                },
+                bias_it, mm_result_it, out_it);
         }
         else
         {
-            execute_window_loop(collapsed_window, [&](const Coordinates &)
-            {
-                run_offset_contribution_output_stage_window_symm(nullptr, nullptr, mm_result_it, out_it,
-                                                                 result_multipliers, result_shifts,
-                                                                 result_offset_s32, min_s8, max_s8,
-                                                                 a_offset, offset, min_bound, max_bound,
-                                                                 window_step_x, window_start_x, window_end_x, false, false, is_bounded_relu, is_fixed_point);
-            },
-            mm_result_it, out_it);
+            execute_window_loop(
+                collapsed_window,
+                [&](const Coordinates &)
+                {
+                    run_offset_contribution_output_stage_window_symm(
+                        nullptr, nullptr, mm_result_it, out_it, result_multipliers, result_shifts, result_offset_s32,
+                        min_s8, max_s8, a_offset, offset, min_bound, max_bound, window_step_x, window_start_x,
+                        window_end_x, false, false, is_bounded_relu, is_fixed_point);
+                },
+                mm_result_it, out_it);
         }
         return;
     }
 }
 
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
-                          int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+Status validate_arguments(const ITensorInfo      *mm_result,
+                          const ITensorInfo      *vector_sum_col,
+                          const ITensorInfo      *vector_sum_row,
+                          const ITensorInfo      *bias,
+                          const ITensorInfo      *output,
+                          int32_t                 a_offset,
+                          int32_t                 b_offset,
+                          GEMMLowpOutputStageInfo output_stage)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
-    if(output->data_type() != DataType::QASYMM8)
+    if (output->data_type() != DataType::QASYMM8)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 && b_offset != 0);
+        ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) > 1 && output_stage.gemmlowp_multipliers.size() > 1 &&
+                                    b_offset != 0);
     }
     ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
-    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN && output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN &&
+                                output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT);
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
@@ -797,7 +842,7 @@
     }
 
     // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
+    if (a_offset != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
         ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
@@ -805,19 +850,21 @@
     }
 
     // If b_offset == 0, vector_sum_row can be a nullptr
-    if(b_offset != 0)
+    if (b_offset != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
 
         // Check if input is a 3D reinterpretation
-        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+        const bool reinterpret_as_3d =
+            mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
 
         // Validate input
-        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+                                                             (mm_result->dimension(1) * mm_result->dimension(2)));
         ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
 
         TensorShape output_shape = output->tensor_shape();
-        if(output_shape.num_dimensions() > 1)
+        if (output_shape.num_dimensions() > 1)
         {
             const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
 
@@ -828,13 +875,15 @@
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
                                             "mm_result tensor must have the same number of batches of output tensor");
 
-            if(a_offset != 0)
+            if (a_offset != 0)
             {
                 TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
                 vector_sum_col_shape.collapse_from(1);
 
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+                                                    vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches of "
+                                                "vector_sum_row_shape or the number of batches must be set to 1");
             }
         }
 
@@ -842,7 +891,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_row->num_dimensions() > 3);
     }
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, output);
@@ -852,15 +901,21 @@
 }
 } // namespace
 
-void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
-                                                               const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst,
-                                                               int32_t k, int32_t a_offset, int32_t b_offset,
+void CpuGemmLowpOffsetContributionOutputStageKernel::configure(const ITensorInfo      *mm_result,
+                                                               const ITensorInfo      *vector_sum_col,
+                                                               const ITensorInfo      *vector_sum_row,
+                                                               const ITensorInfo      *bias,
+                                                               ITensorInfo            *dst,
+                                                               int32_t                 k,
+                                                               int32_t                 a_offset,
+                                                               int32_t                 b_offset,
                                                                GEMMLowpOutputStageInfo output_stage)
 {
     ARM_COMPUTE_UNUSED(vector_sum_row, bias);
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage));
 
     _a_offset     = a_offset;
     _b_offset     = b_offset;
@@ -868,7 +923,7 @@
     _output_stage = output_stage;
 
     // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
+    if (a_offset != 0)
     {
         // Check if vector_sum_col_shape should be slid or not
         // Don't slide vector_sum_col_shape along the y dimension if vector_sum_col_shape has just one dimension and vector_sum_row_shape has more than one
@@ -888,16 +943,24 @@
     ICpuKernel::configure(win);
 }
 
-Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col,
-                                                                const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *output,
-                                                                int32_t a_offset, int32_t b_offset, GEMMLowpOutputStageInfo output_stage)
+Status CpuGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo      *mm_result,
+                                                                const ITensorInfo      *vector_sum_col,
+                                                                const ITensorInfo      *vector_sum_row,
+                                                                const ITensorInfo      *bias,
+                                                                const ITensorInfo      *output,
+                                                                int32_t                 a_offset,
+                                                                int32_t                 b_offset,
+                                                                GEMMLowpOutputStageInfo output_stage)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, output, a_offset, b_offset, output_stage));
     return Status{};
 }
 
-void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack      &tensors,
+                                                            const Window     &window,
+                                                            const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -912,14 +975,14 @@
     PixelValue type_min{};
     PixelValue type_max{};
     std::tie(type_min, type_max) = get_min_max(dst->info()->data_type());
-    int32_t type_min_int = type_min.get<int32_t>();
-    int32_t type_max_int = type_max.get<int32_t>();
+    int32_t type_min_int         = type_min.get<int32_t>();
+    int32_t type_max_int         = type_max.get<int32_t>();
 
-    const bool reinterpret_as_3d = vector_sum_row != nullptr
-                                   && mm_result->info()->num_dimensions() > 1
-                                   && mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
+    const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->info()->num_dimensions() > 1 &&
+                                   mm_result->info()->tensor_shape().y() != vector_sum_row->info()->tensor_shape().x();
 
-    const bool is_bounded_relu = !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int);
+    const bool is_bounded_relu =
+        !(_output_stage.gemmlowp_min_bound <= type_min_int && _output_stage.gemmlowp_max_bound >= type_max_int);
 
     // Check if we need to perform fixed point requantization
     const bool is_fixed_point = _output_stage.type != GEMMLowpOutputStageType::QUANTIZE_DOWN;
@@ -930,22 +993,25 @@
     // Check if symmetric per-channel execution
     const bool is_symm = _output_stage.is_quantized_per_channel;
 
-    if(is_symm)
+    if (is_symm)
     {
-        run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage,
-                                                  reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+        run_offset_contribution_output_stage_symm(window, mm_result, vector_sum_col, vector_sum_row, bias, dst,
+                                                  _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched,
+                                                  _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point);
     }
     else
     {
-        if(is_signed)
+        if (is_signed)
         {
-            run_offset_contribution_output_stage<int8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage,
-                                                         reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+            run_offset_contribution_output_stage<int8_t>(
+                window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset,
+                _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point);
         }
         else
         {
-            run_offset_contribution_output_stage<uint8_t>(window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset, _is_vector_sum_col_batched, _output_stage,
-                                                          reinterpret_as_3d, is_bounded_relu, is_fixed_point);
+            run_offset_contribution_output_stage<uint8_t>(
+                window, mm_result, vector_sum_col, vector_sum_row, bias, dst, _a_offset, _b_offset, _k_offset,
+                _is_vector_sum_col_batched, _output_stage, reinterpret_as_3d, is_bounded_relu, is_fixed_point);
         }
     }
 }
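
For readers scanning the reformatted hunks above, the arithmetic these loops vectorise is the standard gemmlowp offset-contribution identity: expanding (q_a + a_offset) * (q_b + b_offset) and summing over the k accumulations gives the raw S32 accumulator plus a_offset * vector_sum_col, b_offset * vector_sum_row, and a constant k_offset = a_offset * b_offset * k. The scalar sketch below is illustrative only (hypothetical names and values, no library code); it just checks that identity numerically. The bias, when present, is simply added on top before the output stage runs.

#include <cassert>
#include <cstdint>

int main()
{
    // Hypothetical 1x4 * 4x1 quantized GEMM. q_a is a row of A, q_b a column of B;
    // a_offset/b_offset stand in for the (sign-adjusted) quantization offsets.
    const int32_t k        = 4;
    const int32_t a_offset = -3;
    const int32_t b_offset = 5;
    const int32_t q_a[4]   = {12, 7, 200, 33};
    const int32_t q_b[4]   = {90, 4, 55, 61};

    int32_t mm_result = 0, vector_sum_col = 0, vector_sum_row = 0, reference = 0;
    for (int i = 0; i < k; ++i)
    {
        mm_result      += q_a[i] * q_b[i];                           // raw accumulator
        vector_sum_row += q_a[i];                                    // row sum of A
        vector_sum_col += q_b[i];                                    // column sum of B
        reference      += (q_a[i] + a_offset) * (q_b[i] + b_offset); // offset-corrected product
    }

    const int32_t k_offset = a_offset * b_offset * k; // the constant term carried as _k_offset
    assert(mm_result + a_offset * vector_sum_col + b_offset * vector_sum_row + k_offset == reference);
    return 0;
}
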
diff --git a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
index 3cb99fa..af477d4 100644
--- a/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpOffsetContributionOutputStageKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_GEMMLOWP_OFFSETCONTRIBUTION_OUTPUTSTAGE_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -85,7 +86,13 @@
      * @param[in]  b_offset       Offset to be added to each element of the matrix B.
      * @param[in]  output_stage   GEMMLowp output stage info, providing the type of quantization and the necessary parameters.
      */
-    void configure(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, int32_t k, int32_t a_offset,
+    void configure(const ITensorInfo      *mm_result,
+                   const ITensorInfo      *vector_sum_col,
+                   const ITensorInfo      *vector_sum_row,
+                   const ITensorInfo      *bias,
+                   ITensorInfo            *dst,
+                   int32_t                 k,
+                   int32_t                 a_offset,
                    int32_t                 b_offset,
                    GEMMLowpOutputStageInfo output_stage);
     /** Static function to check if given info will lead to a valid configuration
@@ -94,21 +101,26 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset,
+    static Status validate(const ITensorInfo      *mm_result,
+                           const ITensorInfo      *vector_sum_col,
+                           const ITensorInfo      *vector_sum_row,
+                           const ITensorInfo      *bias,
+                           const ITensorInfo      *dst,
+                           int32_t                 a_offset,
                            int32_t                 b_offset,
                            GEMMLowpOutputStageInfo output_stage);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
     /** Function to use for the particular tensors passed to configure() */
-    int32_t                 _a_offset{ 0 };
-    int32_t                 _b_offset{ 0 };
-    int32_t                 _k_offset{ 0 };
-    bool                    _is_vector_sum_col_batched{ true };
-    GEMMLowpOutputStageInfo _output_stage{ GEMMLowpOutputStageInfo() };
+    int32_t                 _a_offset{0};
+    int32_t                 _b_offset{0};
+    int32_t                 _k_offset{0};
+    bool                    _is_vector_sum_col_batched{true};
+    GEMMLowpOutputStageInfo _output_stage{GEMMLowpOutputStageInfo()};
 };
 } // namespace kernels
 } // namespace cpu
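
The _is_vector_sum_col_batched member declared above drives the vector_sum_col_stride_batch selection in the .cpp hunks: when the column sums carry one row per batch, each batch advances by the tensor's y stride; otherwise the stride is forced to 0 so every batch re-reads the same row. A self-contained sketch of that broadcast-versus-batched addressing (illustrative names and data only, not library code):

#include <cstddef>
#include <cstdint>
#include <cstdio>

int main()
{
    const int32_t sums[2][3] = {{1, 2, 3}, {4, 5, 6}}; // two batches of column sums
    for (bool batched : {true, false})
    {
        // Bytes between batches; a zero stride broadcasts batch 0 to everyone.
        const std::size_t stride_batch = batched ? sizeof(sums[0]) : std::size_t{0};
        for (int batch_id = 0; batch_id < 2; ++batch_id)
        {
            const auto *ptr = reinterpret_cast<const int32_t *>(
                reinterpret_cast<const uint8_t *>(sums) + batch_id * stride_batch);
            std::printf("batched=%d batch=%d first sum=%d\n", batched, batch_id, ptr[0]);
        }
    }
    return 0;
}
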
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp
index 3023d93..eefc294 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.cpp
@@ -28,13 +28,14 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
 #include "src/core/AccessWindowStatic.h"
-#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <arm_neon.h>
 
@@ -46,26 +47,35 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+Status validate_arguments(const ITensorInfo             *src,
+                          const ITensorInfo             *bias,
+                          const ITensorInfo             *dst,
+                          const GEMMLowpOutputStageInfo *output_stage)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
-    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
-                                || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        output_stage->gemmlowp_max_bound >
+        std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        output_stage->gemmlowp_min_bound <
+            std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) ||
+        output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
 
     // Check biases if exist
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
     }
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        if(dst->data_type() != output_stage->output_data_type && (output_stage->output_data_type == DataType::QASYMM8 || output_stage->output_data_type == DataType::QASYMM8_SIGNED))
+        if (dst->data_type() != output_stage->output_data_type &&
+            (output_stage->output_data_type == DataType::QASYMM8 ||
+             output_stage->output_data_type == DataType::QASYMM8_SIGNED))
         {
             ARM_COMPUTE_RETURN_ERROR_MSG("Mismatching data types");
         }
@@ -92,24 +102,26 @@
 }
 
 template <typename T>
-inline typename std::enable_if<std::is_same<T, uint8_t>::value,
-       typename wrapper::traits::neon_vector<T, 16>::type>::type
-       convert_to_8bit(const int16x8x2_t in_s16)
+inline
+    typename std::enable_if<std::is_same<T, uint8_t>::value, typename wrapper::traits::neon_vector<T, 16>::type>::type
+    convert_to_8bit(const int16x8x2_t in_s16)
 {
     return wrapper::vcombine(wrapper::vqmovun(in_s16.val[0]), wrapper::vqmovun(in_s16.val[1]));
 }
 
 template <typename T>
-inline typename std::enable_if<std::is_same<T, int8_t>::value,
-       typename wrapper::traits::neon_vector<T, 16>::type>::type
-       convert_to_8bit(const int16x8x2_t in_s16)
+inline typename std::enable_if<std::is_same<T, int8_t>::value, typename wrapper::traits::neon_vector<T, 16>::type>::type
+convert_to_8bit(const int16x8x2_t in_s16)
 {
     return wrapper::vcombine(wrapper::vqmovn(in_s16.val[0]), wrapper::vqmovn(in_s16.val[1]));
 }
 
 template <typename T>
-inline typename wrapper::traits::neon_vector<T, 16>::type finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector<T, 16>::type min,
-                                                                                typename wrapper::traits::neon_vector<T, 16>::type max)
+inline typename wrapper::traits::neon_vector<T, 16>::type
+finalize_quantization(int32x4x4_t                                       &in_s32,
+                      int32x4_t                                          result_shift_s32,
+                      typename wrapper::traits::neon_vector<T, 16>::type min,
+                      typename wrapper::traits::neon_vector<T, 16>::type max)
 {
     // Shift final result (negative value shift right)
     in_s32.val[0] = vshlq_s32(in_s32.val[0], result_shift_s32);
@@ -118,13 +130,8 @@
     in_s32.val[3] = vshlq_s32(in_s32.val[3], result_shift_s32);
 
     // Convert S32 to S16
-    const int16x8x2_t in_s16 =
-    {
-        {
-            vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
-            vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))
-        }
-    };
+    const int16x8x2_t in_s16 = {{vcombine_s16(vqmovn_s32(in_s32.val[0]), vqmovn_s32(in_s32.val[1])),
+                                 vcombine_s16(vqmovn_s32(in_s32.val[2]), vqmovn_s32(in_s32.val[3]))}};
 
     // Convert S16 to S8 or U8
     typename wrapper::traits::neon_vector<T, 16>::type out = convert_to_8bit<T>(in_s16);
@@ -137,7 +144,10 @@
 } // namespace
 
 template <typename T>
-void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window)
+void CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal(const ITensor *src,
+                                                           const ITensor *bias,
+                                                           ITensor       *dst,
+                                                           const Window  &window)
 {
     using VectorType = typename wrapper::traits::neon_vector<T, 16>::type;
 
@@ -159,107 +169,105 @@
     Iterator in(src, win);
     Iterator out(dst, win);
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         Window win_biases;
         win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
         win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
 
         Iterator bias_i(bias, win_biases);
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            // Compute 16 elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                int32x4x4_t in_s32 =
+                // Compute 16 elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
-                    }
-                };
+                    int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
 
-                const int32x4x4_t bias_s32 =
+                    const int32x4x4_t bias_s32 = {
+                        {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+                         vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
+                         vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
+                         vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}};
+
+                    // Add the bias to GEMM's result
+                    in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+                    in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+                    in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+                    in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+                    // Add the offset terms to GEMM's result and multiply by result_mult_int
+                    scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
+
+                    wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x),
+                                    finalize_quantization<T>(in_s32, result_shift_s32, min, max));
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
                 {
-                    {
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)
-                    }
-                };
+                    const int bias_value = *(reinterpret_cast<const int *>(bias_i.ptr()) + x);
+                    int       in_value   = *(reinterpret_cast<const int *>(in.ptr()) + x);
 
-                // Add the bias to GEMM's result
-                in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
-                in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
-                in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
-                in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+                    // Quantize
+                    in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) *
+                                _output_stage->gemmlowp_multiplier) >>
+                               _output_stage->gemmlowp_shift;
 
-                // Add the offset terms to GEMM's result and multiply by result_mult_int
-                scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
-
-                wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const int bias_value = *(reinterpret_cast<const int *>(bias_i.ptr()) + x);
-                int       in_value   = *(reinterpret_cast<const int *>(in.ptr()) + x);
-
-                // Quantize
-                in_value = ((in_value + bias_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift;
-
-                // Store the result
-                *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
-            }
-        },
-        in, bias_i, out);
+                    // Store the result
+                    *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
+                }
+            },
+            in, bias_i, out);
     }
     else
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            // Compute 16 elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                int32x4x4_t in_s32 =
+                // Compute 16 elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
-                    }
-                };
+                    int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
 
-                // Add the offset terms to GEMM's result and multiply by result_mult_int
-                scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
+                    // Add the offset terms to GEMM's result and multiply by result_mult_int
+                    scale_input(in_s32, result_offset_s32, _output_stage->gemmlowp_multiplier);
 
-                wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x), finalize_quantization<T>(in_s32, result_shift_s32, min, max));
-            }
+                    wrapper::vstore(reinterpret_cast<T *>(out.ptr() + x),
+                                    finalize_quantization<T>(in_s32, result_shift_s32, min, max));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    int in_value = *(reinterpret_cast<const int *>(in.ptr()) + x);
 
-                // Quantize
-                in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >> _output_stage->gemmlowp_shift;
+                    // Quantize
+                    in_value = ((in_value + _output_stage->gemmlowp_offset) * _output_stage->gemmlowp_multiplier) >>
+                               _output_stage->gemmlowp_shift;
 
-                // Store the result
-                *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
-            }
-        },
-        in, out);
+                    // Store the result
+                    *(out.ptr() + x) = static_cast<T>(utility::clamp<int>(in_value, clamp_min, clamp_max));
+                }
+            },
+            in, out);
     }
 }
 
-void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+void CpuGemmLowpQuantizeDownInt32ScaleKernel::configure(ITensorInfo                   *src,
+                                                        ITensorInfo                   *bias,
+                                                        ITensorInfo                   *dst,
+                                                        const GEMMLowpOutputStageInfo *output_stage)
 {
     ARM_COMPUTE_UNUSED(bias);
     // Perform validate step
@@ -268,10 +276,7 @@
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type));
 
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
-                                                  bias,
-                                                  dst,
-                                                  output_stage));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage));
 
     _output_stage = output_stage;
 
@@ -281,14 +286,17 @@
     ICpuKernel::configure(win);
 
     // Check if we need to clamp the result using min and max
-    _is_bounded_relu = ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound)
-                        && !(_output_stage->gemmlowp_min_bound == std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
-                             && _output_stage->gemmlowp_max_bound == std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))));
-    if(_output_stage->output_data_type == DataType::QASYMM8)
+    _is_bounded_relu =
+        ((_output_stage->gemmlowp_min_bound != _output_stage->gemmlowp_max_bound) &&
+         !(_output_stage->gemmlowp_min_bound ==
+               std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) &&
+           _output_stage->gemmlowp_max_bound ==
+               std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))));
+    if (_output_stage->output_data_type == DataType::QASYMM8)
     {
         _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<uint8_t>;
     }
-    else if(_output_stage->output_data_type == DataType::QASYMM8_SIGNED)
+    else if (_output_stage->output_data_type == DataType::QASYMM8_SIGNED)
     {
         _func = &CpuGemmLowpQuantizeDownInt32ScaleKernel::run_internal<int8_t>;
     }
@@ -298,7 +306,10 @@
     }
 }
 
-Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+Status CpuGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo             *src,
+                                                         const ITensorInfo             *bias,
+                                                         const ITensorInfo             *dst,
+                                                         const GEMMLowpOutputStageInfo *output_stage)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage));
     return Status{};
@@ -323,4 +334,4 @@
 }
 } // namespace kernels
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
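
All of the loops reformatted above boil down to the scalar recipe that the left-over element paths spell out: add the bias (if any) and the gemmlowp offset, multiply by the integer multiplier, arithmetic-shift right, then clamp into the 8-bit output range. A minimal scalar rendering, assuming a hypothetical standalone helper and QASYMM8-style clamp bounds:

#include <algorithm>
#include <cstdint>

// Scalar equivalent of the kernel's "Quantize" step plus the final clamp that the
// vector path applies through its min/max vectors.
uint8_t quantize_down_scale(int32_t in_value, int32_t bias_value,
                            int32_t gemmlowp_offset, int32_t gemmlowp_multiplier,
                            int32_t gemmlowp_shift, int32_t clamp_min, int32_t clamp_max)
{
    int32_t v = ((in_value + bias_value + gemmlowp_offset) * gemmlowp_multiplier) >> gemmlowp_shift;
    return static_cast<uint8_t>(std::clamp(v, clamp_min, clamp_max));
}

The NEON path reaches the same result sixteen elements at a time: vshlq_s32 with a negated shift, a saturating narrow from S32 to S16, then vqmovn/vqmovun down to the signed or unsigned 8-bit type.
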
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
index c7813ed..33e296b 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -71,10 +72,13 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage);
+    static Status validate(const ITensorInfo             *src,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *dst,
+                           const GEMMLowpOutputStageInfo *output_stage);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
@@ -95,11 +99,14 @@
      * @param[out] dst    Output tensor info
      * @param[in]  window Region on which to execute the kernel.
      */
-    using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+    using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ScaleKernel::*)(const ITensor *src,
+                                                                                      const ITensor *bias,
+                                                                                      ITensor       *dst,
+                                                                                      const Window  &window);
 
-    QuantizeDownFunctionPtr        _func{ nullptr };
-    const GEMMLowpOutputStageInfo *_output_stage{ nullptr };
-    bool                           _is_bounded_relu{ false };
+    QuantizeDownFunctionPtr        _func{nullptr};
+    const GEMMLowpOutputStageInfo *_output_stage{nullptr};
+    bool                           _is_bounded_relu{false};
 };
 } // namespace kernels
 } // namespace cpu
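
QuantizeDownFunctionPtr above is the usual configure-time dispatch idiom in these kernels: resolve the template instantiation once, store a member function pointer, and call through it on every run with no further branching. A generic, self-contained sketch of the pattern (names illustrative, unrelated to the library):

#include <cstdint>
#include <cstdio>

class QuantizeDispatcher
{
public:
    // Mirrors configure(): pick the instantiation once, based on the output type.
    void configure(bool output_is_signed)
    {
        _func = output_is_signed ? &QuantizeDispatcher::run_internal<int8_t>
                                 : &QuantizeDispatcher::run_internal<uint8_t>;
    }

    // Mirrors run_op(): an indirect call, no type checks on the hot path.
    void run(int32_t v) { (this->*_func)(v); }

private:
    template <typename T>
    void run_internal(int32_t v)
    {
        // Print the value after narrowing to T, plus T's width in bytes.
        std::printf("stored %d in a %zu-byte lane\n", static_cast<int>(static_cast<T>(v)), sizeof(T));
    }

    using FuncPtr = void (QuantizeDispatcher::*)(int32_t);
    FuncPtr _func{nullptr};
};

int main()
{
    QuantizeDispatcher d;
    d.configure(/*output_is_signed=*/true);
    d.run(-7); // dispatches to run_internal<int8_t> without re-checking the type
    return 0;
}

configure() in the .cpp hunk further below selects run_internal<uint8_t> or run_internal<int8_t> in exactly this way.
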
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
index 53ca991..a5c09c9 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.cpp
@@ -29,12 +29,13 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/NESymm.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NESymm.h"
 
 #include <arm_neon.h>
 
@@ -53,14 +54,14 @@
     ARM_COMPUTE_RETURN_ERROR_ON(min > max);
 
     // Check biases if exist
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
     }
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM16);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src);
@@ -71,7 +72,10 @@
 } // namespace
 
 template <bool is_bounded_relu>
-void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window)
+void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal(const ITensor *src,
+                                                                              const ITensor *bias,
+                                                                              ITensor       *dst,
+                                                                              const Window  &window)
 {
     const int16x8_t min_s16 = vdupq_n_s16(static_cast<int16_t>(_min));
     const int16x8_t max_s16 = vdupq_n_s16(static_cast<int16_t>(_max));
@@ -88,92 +92,92 @@
 
     Iterator in(src, win_collapsed);
     Iterator out(dst, win_collapsed);
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         Window win_biases;
         win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
         win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
 
         Iterator bias_i(bias, win_biases);
-        execute_window_loop(win_collapsed, [&](const Coordinates &)
-        {
-            // Compute 16 elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win_collapsed,
+            [&](const Coordinates &)
             {
-                int32x4x2_t in_s32 =
+                // Compute 16 elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)
-                    }
-                };
+                    int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)}};
 
-                const int32x4x2_t bias_s32 =
+                    const int32x4x2_t bias_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+                                                   vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4)}};
+
+                    // Add the bias to GEMM's result
+                    in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+                    in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+
+                    vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x,
+                              finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier,
+                                                                           _result_shift, min_s16, max_s16));
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
                 {
-                    {
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4)
-                    }
-                };
+                    const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
+                    int32_t       in_value   = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
 
-                // Add the bias to GEMM's result
-                in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
-                in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
-
-                vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
-                int32_t       in_value   = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
-                // Add bias
-                in_value += bias_value;
-                // Finalize and store the result
-                *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
-                                                                                                             static_cast<int16_t>(_max));
-            }
-        },
-        in, out, bias_i);
+                    // Add bias
+                    in_value += bias_value;
+                    // Finalize and store the result
+                    *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(
+                        in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
+                        static_cast<int16_t>(_max));
+                }
+            },
+            in, out, bias_i);
     }
     else
     {
-        execute_window_loop(win_collapsed, [&](const Coordinates &)
-        {
-            // Compute 16 elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win_collapsed,
+            [&](const Coordinates &)
             {
-                int32x4x2_t in_s32 =
+                // Compute 16 elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)
-                    }
-                };
+                    int32x4x2_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4)}};
 
-                vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x, finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier, _result_shift, min_s16, max_s16));
-            }
+                    vst1q_s16(reinterpret_cast<int16_t *>(out.ptr()) + x,
+                              finalize_quantization_int16<is_bounded_relu>(in_s32, _result_fixedpoint_multiplier,
+                                                                           _result_shift, min_s16, max_s16));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-                ARM_COMPUTE_UNUSED(in_value);
-                // Finalize and store the result
-                *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
-                                                                                                             static_cast<int16_t>(_max));
-            }
-        },
-        in, out);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+                    ARM_COMPUTE_UNUSED(in_value);
+                    // Finalize and store the result
+                    *(reinterpret_cast<int16_t *>(out.ptr()) + x) = finalize_quantization_int16<is_bounded_relu>(
+                        in_value, _result_fixedpoint_multiplier, _result_shift, static_cast<int16_t>(_min),
+                        static_cast<int16_t>(_max));
+                }
+            },
+            in, out);
     }
 }
 
-void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift,
-                                                                           int min, int max)
+void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure(ITensorInfo *src,
+                                                                           ITensorInfo *bias,
+                                                                           ITensorInfo *dst,
+                                                                           int          result_fixedpoint_multiplier,
+                                                                           int          result_shift,
+                                                                           int          min,
+                                                                           int          max)
 {
     // Perform validate step
     ARM_COMPUTE_UNUSED(bias, dst);
@@ -193,18 +197,21 @@
 
     // Check if we need to clamp the result using min and max
     const bool is_bounded_relu = !(min <= -32768 && max >= 32767);
-    _func                      = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<true> :
-                                 &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<false>;
+    _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<true>
+                            : &CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_internal<false>;
 }
 
-Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
+Status CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(
+    const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
     return Status{};
 }
 
-void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::run_op(ITensorPack      &tensors,
+                                                                        const Window     &window,
+                                                                        const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
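
For reference, the requantization that finalize_quantization_int16 performs on
each lane is a fixed-point multiply, a rounding shift, and a saturating clamp.
A minimal scalar sketch (gemmlowp-style rounding assumed; this is an
illustration, not the library's exact helper):

    #include <algorithm>
    #include <cstdint>

    // Hypothetical standalone sketch of the per-element quantize-down step.
    // Assumes multiplier is a positive fixed-point value and shift >= 0.
    int16_t quantize_down_int16(int32_t v, int32_t multiplier, int shift,
                                int16_t lo, int16_t hi)
    {
        // Rounding doubling high multiply: (2*v*multiplier + 2^31) >> 32,
        // written here as (v*multiplier + 2^30) >> 31.
        const int64_t prod = static_cast<int64_t>(v) * multiplier;
        int32_t       r    = static_cast<int32_t>((prod + (INT64_C(1) << 30)) >> 31);
        // Rounding right shift by the result shift
        if (shift > 0)
        {
            r = (r + (1 << (shift - 1))) >> shift;
        }
        // Saturate to int16, then apply the optional bounded-ReLU clamp
        r = std::max<int32_t>(std::min<int32_t>(r, 32767), -32768);
        return static_cast<int16_t>(std::max<int32_t>(std::min<int32_t>(r, hi), lo));
    }
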
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
index 681d099..925788b 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT16_SCALEBYFIXEDPOINT_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -48,7 +49,8 @@
  *  -# Clamp the resulting int32 values to the [-32768, 32767] range and cast to QSYMM16.
  *
  */
-class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>
+class CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel
+    : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>
 {
 public:
     CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel() = default;
@@ -65,17 +67,24 @@
      * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QSYMM16.
      *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions. Defaults to 0.
      */
-    void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int min = 0, int max = 0);
+    void configure(ITensorInfo *src,
+                   ITensorInfo *bias,
+                   ITensorInfo *dst,
+                   int          result_fixedpoint_multiplier,
+                   int          result_shift,
+                   int          min = 0,
+                   int          max = 0);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
+    static Status
+    validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
@@ -97,13 +106,13 @@
      * @param[in]  window Region on which to execute the kernel.
      */
     using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::*)(
-                                        const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+        const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
 
-    QuantizeDownFunctionPtr _func{ nullptr };
-    int                     _result_fixedpoint_multiplier{ 0 };
-    int                     _result_shift{ 0 };
-    int                     _min{ 0 };
-    int                     _max{ 0 };
+    QuantizeDownFunctionPtr _func{nullptr};
+    int                     _result_fixedpoint_multiplier{0};
+    int                     _result_shift{0};
+    int                     _min{0};
+    int                     _max{0};
 };
 } // namespace kernels
 } // namespace cpu
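
The _func member reformatted above implements a common dispatch idiom: the
bounded-ReLU decision is taken once in configure() and baked into a template
parameter, so the clamp costs nothing on the unbounded path. A self-contained
sketch of the pattern (names hypothetical):

    #include <cstdio>

    class Kernel
    {
    public:
        void configure(int min, int max)
        {
            // Clamp only when [min, max] is narrower than the full int16 range
            const bool bounded = !(min <= -32768 && max >= 32767);
            _func = bounded ? &Kernel::run_internal<true> : &Kernel::run_internal<false>;
        }
        void run() { (this->*_func)(); }

    private:
        template <bool is_bounded_relu>
        void run_internal()
        {
            // The branch is resolved at compile time, once per instantiation
            if constexpr (is_bounded_relu)
                std::puts("clamped path");
            else
                std::puts("fast path");
        }
        using FuncPtr = void (Kernel::*)();
        FuncPtr _func{nullptr};
    };
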
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
index 27214dc..0e58097 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.cpp
@@ -29,12 +29,13 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/NEAsymm.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
 
 #include <arm_neon.h>
 
@@ -53,14 +54,14 @@
     ARM_COMPUTE_RETURN_ERROR_ON(min > max);
 
     // Check biases if exist
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
     }
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8_SIGNED);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src);
@@ -71,7 +72,10 @@
 } // namespace
 
 template <bool is_bounded_relu>
-void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window)
+void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal(const ITensor *src,
+                                                                             const ITensor *bias,
+                                                                             ITensor       *dst,
+                                                                             const Window  &window)
 {
     const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);
     const int8x16_t min_s8                        = vdupq_n_s8(static_cast<int8_t>(_min));
@@ -88,102 +92,102 @@
 
     Iterator in(src, win_collapsed);
     Iterator out(dst, win_collapsed);
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         Window win_biases;
         win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
         win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
 
         Iterator bias_i(bias, win_biases);
-        execute_window_loop(win_collapsed, [&](const Coordinates &)
-        {
-            // Compute 16 elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win_collapsed,
+            [&](const Coordinates &)
             {
-                int32x4x4_t in_s32 =
+                // Compute 16 elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
-                    }
-                };
+                    int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
 
-                const int32x4x4_t bias_s32 =
+                    const int32x4x4_t bias_s32 = {
+                        {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+                         vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
+                         vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
+                         vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}};
+
+                    // Add the bias to GEMM's result
+                    in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+                    in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+                    in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+                    in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+                    vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
+                             finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+                                                   result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
                 {
-                    {
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)
-                    }
-                };
+                    const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
+                    int32_t       in_value   = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
 
-                // Add the bias to GEMM's result
-                in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
-                in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
-                in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
-                in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
-
-                vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
-                         finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
-                int32_t       in_value   = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
-                // Add bias
-                in_value += bias_value;
-                // Finalize and store the result
-                *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
-                                                                                   static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
-            }
-        },
-        in, out, bias_i);
+                    // Add bias
+                    in_value += bias_value;
+                    // Finalize and store the result
+                    *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(
+                        in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
+                        static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
+                }
+            },
+            in, out, bias_i);
     }
     else
     {
-        execute_window_loop(win_collapsed, [&](const Coordinates &)
-        {
-            // Compute 16 elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win_collapsed,
+            [&](const Coordinates &)
             {
-                int32x4x4_t in_s32 =
+                // Compute 16 elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
-                    }
-                };
+                    int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
 
-                vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
-                         finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
-            }
+                    vst1q_s8(reinterpret_cast<int8_t *>(out.ptr() + x),
+                             finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+                                                   result_offset_after_shift_s32, min_s8, max_s8, is_bounded_relu));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
 
-                // Finalize and store the result
-                *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
-                                                                                   static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
-            }
-        },
-        in, out);
+                    // Finalize and store the result
+                    *reinterpret_cast<int8_t *>(out.ptr() + x) = finalize_quantization(
+                        in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift,
+                        static_cast<int8_t>(_min), static_cast<int8_t>(_max), is_bounded_relu);
+                }
+            },
+            in, out);
     }
 }
 
-void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift,
-                                                                          int result_offset_after_shift, int min, int max)
+void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure(ITensorInfo *src,
+                                                                          ITensorInfo *bias,
+                                                                          ITensorInfo *dst,
+                                                                          int          result_fixedpoint_multiplier,
+                                                                          int          result_shift,
+                                                                          int          result_offset_after_shift,
+                                                                          int          min,
+                                                                          int          max)
 {
     ARM_COMPUTE_UNUSED(bias);
     // Perform validate step
@@ -205,18 +209,21 @@
 
     // Check if we need to clamp the result using min and max
     const bool is_bounded_relu = !(min <= -128 && max >= 127);
-    _func                      = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<true> :
-                                 &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<false>;
+    _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<true>
+                            : &CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_internal<false>;
 }
 
-Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
+Status CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(
+    const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max));
     return Status{};
 }
 
-void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::run_op(ITensorPack      &tensors,
+                                                                       const Window     &window,
+                                                                       const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
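
The loop structure in these run_internal hunks is the usual NEON pattern: a
vectorized body that consumes window_step_x elements per iteration, then a
scalar loop for the left-overs. A minimal sketch of that shape, showing only
the bias addition (requantization elided; function name hypothetical):

    #include <arm_neon.h>
    #include <cstdint>

    void add_bias(const int32_t *in, const int32_t *bias, int32_t *out, int n)
    {
        int x = 0;
        // Main loop: 16 int32 values per iteration, in four 128-bit chunks
        for (; x <= n - 16; x += 16)
        {
            for (int i = 0; i < 16; i += 4)
            {
                const int32x4_t v = vaddq_s32(vld1q_s32(in + x + i), vld1q_s32(bias + x + i));
                vst1q_s32(out + x + i, v);
            }
        }
        // Compute left-over elements
        for (; x < n; ++x)
        {
            out[x] = in[x] + bias[x];
        }
    }
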
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
index 3e615b9..6a67ba4 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOINT8_SCALEBYFIXEDPOINT_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -49,7 +50,8 @@
  *  -# Clamp the resulting int32 values to the [-128..127] range and cast to QASYMM8_SIGNED.
  *
  */
-class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>
+class CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel
+    : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>
 {
 public:
     CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel() = default;
@@ -67,17 +69,25 @@
      * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8_SIGNED,
      *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions
      */
-    void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+    void configure(ITensorInfo *src,
+                   ITensorInfo *bias,
+                   ITensorInfo *dst,
+                   int          result_fixedpoint_multiplier,
+                   int          result_shift,
+                   int          result_offset_after_shift,
+                   int          min = 0,
+                   int          max = 0);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
+    static Status
+    validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
@@ -99,14 +109,14 @@
      * @param[in]  window Region on which to execute the kernel.
      */
     using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::*)(
-                                        const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+        const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
 
-    QuantizeDownFunctionPtr _func{ nullptr };
-    int                     _result_fixedpoint_multiplier{ 0 };
-    int                     _result_shift{ 0 };
-    int                     _result_offset_after_shift{ 0 };
-    int                     _min{ 0 };
-    int                     _max{ 0 };
+    QuantizeDownFunctionPtr _func{nullptr};
+    int                     _result_fixedpoint_multiplier{0};
+    int                     _result_shift{0};
+    int                     _result_offset_after_shift{0};
+    int                     _min{0};
+    int                     _max{0};
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index e49fd29..e3dd224 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -29,12 +29,13 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/NEON/NEAsymm.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEAsymm.h"
 
 #include <arm_neon.h>
 
@@ -53,14 +54,14 @@
     ARM_COMPUTE_RETURN_ERROR_ON(min > max);
 
     // Check biases if exist
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
     }
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, src);
@@ -71,7 +72,10 @@
 } // namespace
 
 template <bool is_bounded_relu>
-void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window)
+void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal(const ITensor *src,
+                                                                              const ITensor *bias,
+                                                                              ITensor       *dst,
+                                                                              const Window  &window)
 {
     const int32x4_t  result_offset_after_shift_s32 = vdupq_n_s32(_result_offset_after_shift);
     const uint8x16_t min_u8                        = vdupq_n_u8(static_cast<uint8_t>(_min));
@@ -89,98 +93,102 @@
 
     Iterator in(src, win_collapsed);
     Iterator out(dst, win_collapsed);
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         Window win_biases;
         win_biases.set(Window::DimX, Window::Dimension(0, 1, 1));
         win_biases.set(Window::DimY, Window::Dimension(0, 1, 1));
 
         Iterator bias_i(bias, win_biases);
-        execute_window_loop(win_collapsed, [&](const Coordinates &)
-        {
-            // Compute 16 elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win_collapsed,
+            [&](const Coordinates &)
             {
-                int32x4x4_t in_s32 =
+                // Compute 16 elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
-                    }
-                };
+                    int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
 
-                const int32x4x4_t bias_s32 =
+                    const int32x4x4_t bias_s32 = {
+                        {vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
+                         vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
+                         vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
+                         vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)}};
+
+                    // Add the bias to GEMM's result
+                    in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
+                    in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
+                    in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
+                    in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
+
+                    vst1q_u8(out.ptr() + x,
+                             finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+                                                   result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
                 {
-                    {
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 0),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 4),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 8),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x + 12)
-                    }
-                };
+                    const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
+                    int32_t       in_value   = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
 
-                // Add the bias to GEMM's result
-                in_s32.val[0] = vaddq_s32(in_s32.val[0], bias_s32.val[0]);
-                in_s32.val[1] = vaddq_s32(in_s32.val[1], bias_s32.val[1]);
-                in_s32.val[2] = vaddq_s32(in_s32.val[2], bias_s32.val[2]);
-                in_s32.val[3] = vaddq_s32(in_s32.val[3], bias_s32.val[3]);
-
-                vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const int32_t bias_value = *(reinterpret_cast<const int32_t *>(bias_i.ptr()) + x);
-                int32_t       in_value   = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
-
-                // Add bias
-                in_value += bias_value;
-                // Finalize and store the result
-                *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu);
-            }
-        },
-        in, out, bias_i);
+                    // Add bias
+                    in_value += bias_value;
+                    // Finalize and store the result
+                    *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift,
+                                                             _result_offset_after_shift, static_cast<uint8_t>(_min),
+                                                             static_cast<uint8_t>(_max), is_bounded_relu);
+                }
+            },
+            in, out, bias_i);
     }
     else
     {
-        execute_window_loop(win_collapsed, [&](const Coordinates &)
-        {
-            // Compute 16 elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win_collapsed,
+            [&](const Coordinates &)
             {
-                int32x4x4_t in_s32 =
+                // Compute 16 elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
-                        vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)
-                    }
-                };
+                    int32x4x4_t in_s32 = {{vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 0),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 4),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 8),
+                                           vld1q_s32(reinterpret_cast<const int32_t *>(in.ptr()) + x + 12)}};
 
-                vst1q_u8(out.ptr() + x, finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift, result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
-            }
+                    vst1q_u8(out.ptr() + x,
+                             finalize_quantization(in_s32, _result_fixedpoint_multiplier, _result_shift,
+                                                   result_offset_after_shift_s32, min_u8, max_u8, is_bounded_relu));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const int32_t in_value = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
 
-                // Finalize and store the result
-                *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, static_cast<uint8_t>(_min), static_cast<uint8_t>(_max), is_bounded_relu);
-            }
-        },
-        in, out);
+                    // Finalize and store the result
+                    *(out.ptr() + x) = finalize_quantization(in_value, _result_fixedpoint_multiplier, _result_shift,
+                                                             _result_offset_after_shift, static_cast<uint8_t>(_min),
+                                                             static_cast<uint8_t>(_max), is_bounded_relu);
+                }
+            },
+            in, out);
     }
 }
 
-void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift,
-                                                                           int result_offset_after_shift, int min, int max)
+void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(ITensorInfo *src,
+                                                                           ITensorInfo *bias,
+                                                                           ITensorInfo *dst,
+                                                                           int          result_fixedpoint_multiplier,
+                                                                           int          result_shift,
+                                                                           int          result_offset_after_shift,
+                                                                           int          min,
+                                                                           int          max)
 {
     ARM_COMPUTE_UNUSED(bias);
     // Perform validate step
@@ -202,18 +210,21 @@
 
     // Check if we need to clamp the result using min and max
     const bool is_bounded_relu = !(min <= 0 && max >= 255);
-    _func                      = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<true> :
-                                 &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<false>;
+    _func = is_bounded_relu ? &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<true>
+                            : &CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_internal<false>;
 }
 
-Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
+Status CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(
+    const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min, int max)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, min, max));
     return Status{};
 }
 
-void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+void CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::run_op(ITensorPack      &tensors,
+                                                                        const Window     &window,
+                                                                        const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
@@ -233,4 +244,4 @@
 }
 } // namespace kernels
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
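
The QASYMM8 variant differs from the QSYMM16 kernel mainly in the extra
result_offset_after_shift term and the unsigned [0, 255] output range. A
scalar sketch, under the same rounding assumptions as the int16 example above:

    #include <algorithm>
    #include <cstdint>

    uint8_t quantize_down_u8(int32_t v, int32_t multiplier, int shift,
                             int32_t offset, uint8_t lo, uint8_t hi)
    {
        const int64_t prod = static_cast<int64_t>(v) * multiplier;
        int32_t       r    = static_cast<int32_t>((prod + (INT64_C(1) << 30)) >> 31);
        if (shift > 0)
        {
            r = (r + (1 << (shift - 1))) >> shift;
        }
        r += offset; // offset is applied after the shift, before the clamp
        r = std::max<int32_t>(std::min<int32_t>(r, 255), 0);
        return static_cast<uint8_t>(std::max<int32_t>(std::min<int32_t>(r, hi), lo));
    }
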
diff --git a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
index b773fdf..45bd742 100644
--- a/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
+++ b/src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_GEMMLOWP_QUANTIZEDOWN_INT32TOUINT8_SCALEBYFIXEDPOINT_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -49,7 +50,8 @@
  *  -# Clamp the resulting int32 values to the [0..255] range and cast to QASYMM8.
  *
  */
-class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>
+class CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
+    : public ICpuKernel<CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>
 {
 public:
     CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel() = default;
@@ -67,17 +69,25 @@
      * @param[in]  max                          (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
      *                                          Along with @p min, this value can be used to implement "rectified linear unit" activation functions
      */
-    void configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, int min = 0, int max = 0);
+    void configure(ITensorInfo *src,
+                   ITensorInfo *bias,
+                   ITensorInfo *dst,
+                   int          result_fixedpoint_multiplier,
+                   int          result_shift,
+                   int          result_offset_after_shift,
+                   int          min = 0,
+                   int          max = 0);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
+    static Status
+    validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, int min = 0, int max = 0);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
@@ -93,14 +103,14 @@
      * @param[in] window Region on which to execute the kernel.
      */
     using QuantizeDownFunctionPtr = void (CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::*)(
-                                        const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
+        const ITensor *src, const ITensor *bias, ITensor *dst, const Window &window);
 
-    QuantizeDownFunctionPtr _func{ nullptr };
-    int                     _result_fixedpoint_multiplier{ 0 };
-    int                     _result_shift{ 0 };
-    int                     _result_offset_after_shift{ 0 };
-    int                     _min{ 0 };
-    int                     _max{ 0 };
+    QuantizeDownFunctionPtr _func{nullptr};
+    int                     _result_fixedpoint_multiplier{0};
+    int                     _result_shift{0};
+    int                     _result_offset_after_shift{0};
+    int                     _min{0};
+    int                     _max{0};
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp
index 6399ebb..fb1b70b 100644
--- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp
+++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp
@@ -26,11 +26,12 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
-#include "src/core/NEON/NEFixedPoint.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/NEFixedPoint.h"
 #include "src/cpu/kernels/gemm_matrix_add/list.h"
 namespace arm_compute
 {
@@ -40,24 +41,12 @@
 {
 namespace
 {
-static const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> available_kernels =
-{
-    {
-        "neon_fp32_gemm_matrix_add",
-        [](const DataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::F32);
-        },
-        REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add)
-    },
-    {
-        "neon_fp16_gemm_matrix_add",
-        [](const DataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::F16) && data.isa.fp16;
-        },
-        REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add)
-    },
+static const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> available_kernels = {
+    {"neon_fp32_gemm_matrix_add", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+     REGISTER_FP32_NEON(neon_fp32_gemm_matrix_add)},
+    {"neon_fp16_gemm_matrix_add",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+     REGISTER_FP16_NEON(neon_fp16_gemm_matrix_add)},
 
 };
 } // namespace
@@ -71,7 +60,8 @@
     ARM_COMPUTE_ERROR_THROW_ON(CpuGemmMatrixAdditionKernel::validate(src, dst, beta));
 
     _beta         = beta;
-    const auto uk = CpuGemmMatrixAdditionKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+    const auto uk = CpuGemmMatrixAdditionKernel::get_implementation(
+        DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
     _func = uk->ukernel;
     // Configure kernel window
@@ -87,7 +77,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
 
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -105,7 +95,7 @@
     const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC);
     ITensor       *dst = tensors.get_tensor(TensorType::ACL_DST);
 
-    if(_beta != 0.0f)
+    if (_beta != 0.0f)
     {
         (*_func)(src, dst, window, _beta);
     }
@@ -116,7 +106,8 @@
     return "CpuGemmMatrixAdditionKernel";
 }
 
-const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> &CpuGemmMatrixAdditionKernel::get_available_kernels()
+const std::vector<CpuGemmMatrixAdditionKernel::GemmMatrixAddKernel> &
+CpuGemmMatrixAdditionKernel::get_available_kernels()
 {
     return available_kernels;
 }
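
The available_kernels table collapsed above follows the library-wide registry
pattern: each entry pairs a name with a selection predicate and a function
pointer, and get_implementation() returns the first entry whose predicate
matches. A simplified, self-contained sketch (types hypothetical):

    #include <string>
    #include <vector>

    enum class DataType { F16, F32 };
    struct SelectorData
    {
        DataType dt;
        bool     fp16; // ISA supports fp16 arithmetic
    };

    struct KernelEntry
    {
        std::string name;
        bool (*is_selected)(const SelectorData &);
        void (*ukernel)();
    };

    void neon_fp32_kernel() {}
    void neon_fp16_kernel() {}

    static const std::vector<KernelEntry> available_kernels = {
        {"neon_fp32", [](const SelectorData &d) { return d.dt == DataType::F32; }, neon_fp32_kernel},
        {"neon_fp16", [](const SelectorData &d) { return d.dt == DataType::F16 && d.fp16; }, neon_fp16_kernel},
    };

    const KernelEntry *get_implementation(const SelectorData &d)
    {
        for (const auto &k : available_kernels)
        {
            if (k.is_selected(d))
                return &k;
        }
        return nullptr;
    }
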
diff --git a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
index cbc5b53..5e12f1d 100644
--- a/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
+++ b/src/cpu/kernels/CpuGemmMatrixAdditionKernel.h
@@ -75,7 +75,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     static const std::vector<GemmMatrixAddKernel> &get_available_kernels();
@@ -89,8 +89,8 @@
      * @param[in]  beta   Weight of matrix C
      */
     /** Matrix addition function to use for the particular tensor types passed to configure() */
-    GemmMatrixAddKernelPtr _func{ nullptr };
-    float                  _beta{ 0.f };
+    GemmMatrixAddKernelPtr _func{nullptr};
+    float                  _beta{0.f};
 };
 } // namespace kernels
 } // namespace cpu
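
For context, the matrix-addition micro-kernels behind _func compute an
in-place dst = dst + beta * src, and run_op() skips the call entirely when
beta is zero. A plain scalar equivalent:

    #include <cstddef>

    void gemm_matrix_add(const float *src, float *dst, std::size_t n, float beta)
    {
        for (std::size_t i = 0; i < n; ++i)
        {
            dst[i] += beta * src[i];
        }
    }
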
diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp
index 03b372e..beccd94 100644
--- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp
+++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp
@@ -26,10 +26,11 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/gemm_matrix_mul/list.h"
@@ -42,27 +43,20 @@
 {
 namespace
 {
-static const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> available_kernels =
-{
-    {
-        "neon_fp32_gemm_matrix_mul",
-        [](const DataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::F32);
-        },
-        REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul)
-    },
-    {
-        "neon_fp16_gemm_matrix_mul",
-        [](const DataTypeISASelectorData & data)
-        {
-            return (data.dt == DataType::F16) && data.isa.fp16;
-        },
-        REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul)
-    },
+static const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> available_kernels = {
+    {"neon_fp32_gemm_matrix_mul", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+     REGISTER_FP32_NEON(neon_fp32_gemm_matrix_mul)},
+    {"neon_fp16_gemm_matrix_mul",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+     REGISTER_FP16_NEON(neon_fp16_gemm_matrix_mul)},
 };
 
-inline Status validate_arguments(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
+inline Status validate_arguments(const ITensorInfo     *lhs,
+                                 const ITensorInfo     *rhs,
+                                 const ITensorInfo     *dst,
+                                 float                  alpha,
+                                 bool                   is_interleaved,
+                                 const GEMMReshapeInfo &reshape_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
 
@@ -70,11 +64,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
 
-    if(!is_interleaved)
+    if (!is_interleaved)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(0) != rhs->dimension(1));
 
-        if(dst->total_size() != 0)
+        if (dst->total_size() != 0)
         {
             ARM_COMPUTE_RETURN_ERROR_ON(rhs->dimension(0) != dst->dimension(0));
             ARM_COMPUTE_RETURN_ERROR_ON(lhs->dimension(1) != dst->dimension(1));
@@ -90,28 +84,31 @@
         const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height();
 
         /* Interleave */
-        TensorShape tensor_shape0{ lhs->tensor_shape() };
+        TensorShape tensor_shape0{lhs->tensor_shape()};
         tensor_shape0.set(0, k);
         tensor_shape0.set(1, m);
 
         const TensorInfo tensor_info0          = lhs->clone()->set_tensor_shape(tensor_shape0);
-        const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
+        const TensorInfo tensor_info_reshaped0 = lhs->clone()->set_tensor_shape(
+            misc::shape_calculator::compute_interleaved_shape(tensor_info0, mult_interleave4x4_height));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lhs, &tensor_info_reshaped0);
 
-        if(n != 0) /* Transpose */
+        if (n != 0) /* Transpose */
         {
-            TensorShape tensor_shape1{ rhs->tensor_shape() };
+            TensorShape tensor_shape1{rhs->tensor_shape()};
             tensor_shape1.set(0, n);
             tensor_shape1.set(1, k);
 
-            const TensorInfo tensor_info1          = rhs->clone()->set_tensor_shape(tensor_shape1);
-            const TensorInfo tensor_info_reshaped1 = rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width));
+            const TensorInfo tensor_info1 = rhs->clone()->set_tensor_shape(tensor_shape1);
+            const TensorInfo tensor_info_reshaped1 =
+                rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transpose1xW_with_element_size_shape(
+                    tensor_info1, mult_transpose1xW_width));
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(rhs, &tensor_info_reshaped1);
         }
 
-        if(dst->total_size() != 0)
+        if (dst->total_size() != 0)
         {
-            if(n != 0)
+            if (n != 0)
             {
                 ARM_COMPUTE_RETURN_ERROR_ON(dst->dimension(0) != static_cast<size_t>(n));
             }
@@ -125,12 +122,17 @@
 
 } // namespace
 
-void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info)
+void CpuGemmMatrixMultiplyKernel::configure(const ITensorInfo     *lhs,
+                                            const ITensorInfo     *rhs,
+                                            ITensorInfo           *dst,
+                                            float                  alpha,
+                                            bool                   is_interleaved,
+                                            const GEMMReshapeInfo &reshape_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
 
     // dst tensor auto inizialitation if not yet initialized
-    TensorShape tensor_shape{ lhs->tensor_shape() };
+    TensorShape tensor_shape{lhs->tensor_shape()};
     tensor_shape.set(0, is_interleaved ? reshape_info.n() : rhs->dimension(0));
     tensor_shape.set(1, is_interleaved ? reshape_info.m() : lhs->dimension(1));
 
@@ -146,7 +148,7 @@
 
     // Check if the dst tensor is a vector. If so,the kernel runs the vector-matrix multiplication
     const bool is_dst_vector = (dst->dimension(1) == 1);
-    if(is_dst_vector)
+    if (is_dst_vector)
     {
         const unsigned int num_elems_processed_per_iteration_x = (lhs->data_type() == DataType::F32) ? 16 : 32;
 
@@ -157,17 +159,23 @@
         constexpr unsigned int num_elems_processed_per_iteration_x = 8;
         constexpr unsigned int num_elems_processed_per_iteration_y = 4;
 
-        win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+        win =
+            calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
     }
 
-    const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation(DataTypeISASelectorData{ lhs->data_type(), CPUInfo::get().get_isa() });
+    const auto uk = CpuGemmMatrixMultiplyKernel::get_implementation(
+        DataTypeISASelectorData{lhs->data_type(), CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
     _func = uk->ukernel;
 
     ICPPKernel::configure(win);
 }
 
-Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved,
+Status CpuGemmMatrixMultiplyKernel::validate(const ITensorInfo     *lhs,
+                                             const ITensorInfo     *rhs,
+                                             const ITensorInfo     *dst,
+                                             float                  alpha,
+                                             bool                   is_interleaved,
                                              const GEMMReshapeInfo &reshape_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(lhs, rhs, dst, alpha, is_interleaved, reshape_info));
@@ -195,7 +203,8 @@
     return "CpuGemmMatrixMultiplyKernel";
 }
 
-const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> &CpuGemmMatrixMultiplyKernel::get_available_kernels()
+const std::vector<CpuGemmMatrixMultiplyKernel::GemmMatrixMulKernel> &
+CpuGemmMatrixMultiplyKernel::get_available_kernels()
 {
     return available_kernels;
 }
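
For reference, the computation configured above is the scaled product dst = alpha * (lhs x rhs), optionally on operands that were pre-reshaped by the interleave/transpose kernels. A naive, self-contained sketch of that math in plain C++ (row-major float buffers assumed; names are illustrative and this is not ACL's NEON path):

    #include <cstddef>

    // Naive reference: dst(MxN) = alpha * lhs(MxK) * rhs(KxN), row-major.
    void gemm_reference(const float *lhs, const float *rhs, float *dst,
                        size_t m, size_t n, size_t k, float alpha)
    {
        for (size_t i = 0; i < m; ++i)
        {
            for (size_t j = 0; j < n; ++j)
            {
                float acc = 0.f;
                for (size_t p = 0; p < k; ++p)
                {
                    acc += lhs[i * k + p] * rhs[p * n + j];
                }
                dst[i * n + j] = alpha * acc;
            }
        }
    }
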
diff --git a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
index a7dfec8..765fcb8 100644
--- a/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
+++ b/src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h
@@ -42,7 +42,8 @@
 class CpuGemmMatrixMultiplyKernel : public ICpuKernel<CpuGemmMatrixMultiplyKernel>
 {
 private:
-    using GemmMatrixMulKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const Window &, const ThreadInfo &, float, const bool)>::type;
+    using GemmMatrixMulKernelPtr = std::add_pointer<void(
+        const ITensor *, const ITensor *, ITensor *, const Window &, const ThreadInfo &, float, const bool)>::type;
 
 public:
     struct GemmMatrixMulKernel
@@ -67,17 +68,27 @@
      * @param[in]  is_interleaved (Optional) True if lhs and rhs have been reshaped respectively using @ref CpuGemmInterleave4x4Kernel and @ref CpuGemmTranspose1xWKernel
      * @param[in]  reshape_info   (Optional) GEMM reshape info. If is_interleaved = true, this object must contain the information to understand how @p lhs and @p rhs have been reshaped
      */
-    void configure(const ITensorInfo *lhs, const ITensorInfo *rhs, ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
+    void configure(const ITensorInfo     *lhs,
+                   const ITensorInfo     *rhs,
+                   ITensorInfo           *dst,
+                   float                  alpha,
+                   bool                   is_interleaved,
+                   const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmMatrixMultiplyKernel
      *
      * Similar to @ref CpuGemmMatrixMultiplyKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, float alpha, bool is_interleaved, const GEMMReshapeInfo &reshape_info);
+    static Status validate(const ITensorInfo     *lhs,
+                           const ITensorInfo     *rhs,
+                           const ITensorInfo     *dst,
+                           float                  alpha,
+                           bool                   is_interleaved,
+                           const GEMMReshapeInfo &reshape_info);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     static const std::vector<GemmMatrixMulKernel> &get_available_kernels();
@@ -94,8 +105,8 @@
      */
 
     /** Matrix multiply function to use for the particular tensor types passed to configure() */
-    GemmMatrixMulKernelPtr _func{ nullptr };
-    float                  _alpha{ 1.f };
+    GemmMatrixMulKernelPtr _func{nullptr};
+    float                  _alpha{1.f};
 };
 } // namespace kernels
 } // namespace cpu
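
A side note on the std::add_pointer aliases introduced throughout this patch: std::add_pointer<F>::type for a function type F is simply the ordinary function-pointer type, spelled without the (*) syntax. A compile-checked illustration with a hypothetical signature:

    #include <type_traits>

    using PtrViaAddPointer = std::add_pointer<void(const float *, float *, int)>::type;
    using PtrPlain         = void (*)(const float *, float *, int);

    static_assert(std::is_same<PtrViaAddPointer, PtrPlain>::value,
                  "std::add_pointer<F>::type is the plain function-pointer type");
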
diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp
index 62d5d5f..c47746b 100644
--- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp
+++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.cpp
@@ -24,9 +24,10 @@
 #include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
 
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -63,9 +64,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
     // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_transpose1xW_with_element_size_shape(*src));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+                                                           compute_transpose1xW_with_element_size_shape(*src));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
     }
@@ -107,25 +109,28 @@
     const size_t out_stride   = dst->info()->strides_in_bytes()[1];
     const size_t vector_size  = 16 / element_size;
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const uint8_t *in_ptr  = in.ptr();
-        uint8_t *const out_ptr = out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride;
-
-        for(size_t k = 0; k < vector_size; ++k)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            // If the src width is not a multiple of W, we fill the reference with 0s
-            if((id.x() + k) >= in_width)
+            const uint8_t *in_ptr = in.ptr();
+            uint8_t *const out_ptr =
+                out.ptr() + (id.y() * vector_size) * element_size + (id.x() / vector_size) * out_stride;
+
+            for (size_t k = 0; k < vector_size; ++k)
             {
-                std::memset(out_ptr + k * element_size, 0, element_size);
+                // If the src width is not a multiple of W, we fill the reference with 0s
+                if ((id.x() + k) >= in_width)
+                {
+                    std::memset(out_ptr + k * element_size, 0, element_size);
+                }
+                else
+                {
+                    std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size);
+                }
             }
-            else
-            {
-                std::memcpy(out_ptr + k * element_size, in_ptr + k * element_size, element_size);
-            }
-        }
-    },
-    in, out);
+        },
+        in, out);
 }
 
 const char *CpuGemmTranspose1xWKernel::name() const
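
The pointer arithmetic in run_op above implements the "transpose 1xW with element size" layout: each run of vector_size (= 16 / element_size) consecutive elements of input row y, starting at column b * vector_size, is stored in output row b at column y * vector_size, and columns past the input width are zero-filled. A minimal reference under those assumptions (plain C++, float data so vector_size would be 4; names are illustrative):

    #include <cstddef>
    #include <vector>

    std::vector<float> transpose_1xw_reference(const std::vector<float> &in,
                                               size_t w, size_t h, size_t vec)
    {
        const size_t       blocks = (w + vec - 1) / vec; // output height
        std::vector<float> out(blocks * h * vec, 0.f);   // zero-fill the tail
        for (size_t y = 0; y < h; ++y)
        {
            for (size_t b = 0; b < blocks; ++b)
            {
                for (size_t k = 0; k < vec; ++k)
                {
                    const size_t x = b * vec + k;
                    if (x < w)
                    {
                        out[b * (h * vec) + y * vec + k] = in[y * w + x];
                    }
                }
            }
        }
        return out;
    }
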
diff --git a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
index 0ca9264..4b834b2 100644
--- a/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
+++ b/src/cpu/kernels/CpuGemmTranspose1xWKernel.h
@@ -88,7 +88,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 };
 } // namespace kernels
diff --git a/src/cpu/kernels/CpuIm2ColKernel.cpp b/src/cpu/kernels/CpuIm2ColKernel.cpp
index 9ac2915..55ac7c5 100644
--- a/src/cpu/kernels/CpuIm2ColKernel.cpp
+++ b/src/cpu/kernels/CpuIm2ColKernel.cpp
@@ -29,13 +29,13 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
 #include <arm_neon.h>
 #include <cstddef>
 #include <cstdint>
@@ -51,26 +51,34 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                          bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right)
+Status validate_arguments(const ITensorInfo   *input,
+                          const ITensorInfo   *output,
+                          const Size2D        &kernel_dims,
+                          const PadStrideInfo &conv_info,
+                          bool                 has_bias,
+                          const Size2D        &dilation,
+                          unsigned int         num_groups,
+                          unsigned int         input_pad_right)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::BFLOAT16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(input->data_type()) && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Number of groups greater than one is not supported on Neon");
 
     // Since there's no implicit padding added, check that the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions
-    const unsigned int width_idx    = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
-    const unsigned int height_idx   = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
-    const unsigned     total_width  = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
+    const unsigned int width_idx   = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
+    const unsigned     total_width = input->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right();
     const unsigned     total_height = input->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
     ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
 
-    if(output->total_size() > 0)
+    if (output->total_size() > 0)
     {
-        TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right));
+        TensorInfo expected_output = output->clone()->set_tensor_shape(compute_im2col_conv_shape(
+            input, kernel_dims, conv_info, has_bias, dilation, false, num_groups, input_pad_right));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
@@ -106,14 +114,14 @@
     // This for loop linearizes a volume with 3 slices. This allows:
     // 1) to reduce the iterations of the outer for loop "d"
     // 2) to have an optimized im2col for the first convolution layer where we usually have 3 IFMs
-    for(; d <= (kernel_depth - 3); d += 3)
+    for (; d <= (kernel_depth - 3); d += 3)
     {
-        for(int y = top_left_y; y < y_e; y += dilation_y)
+        for (int y = top_left_y; y < y_e; y += dilation_y)
         {
-            if((y < 0 || y >= input_h) && has_pads)
+            if ((y < 0 || y >= input_h) && has_pads)
             {
                 // All the values will be the offset (will be zeros when not quantized)
-                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+                for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                 {
                     *(out_ptr + 0 * kernel_size2) = pad_value;
                     *(out_ptr + 1 * kernel_size2) = pad_value;
@@ -122,9 +130,9 @@
             }
             else
             {
-                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+                for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                 {
-                    if((x < 0 || x >= input_w) && has_pads)
+                    if ((x < 0 || x >= input_w) && has_pads)
                     {
                         *(out_ptr + 0 * kernel_size2) = pad_value;
                         *(out_ptr + 1 * kernel_size2) = pad_value;
@@ -132,9 +140,12 @@
                     }
                     else
                     {
-                        *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
-                        *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
-                        *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+                        *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>(
+                            in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+                        *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>(
+                            in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x)));
+                        *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>(
+                            in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x)));
                     }
                 }
             }
@@ -143,11 +154,11 @@
     }
 
     // Left over
-    for(; d < kernel_depth; d++)
+    for (; d < kernel_depth; d++)
     {
-        for(int y = top_left_y; y < y_e; y += dilation_y)
+        for (int y = top_left_y; y < y_e; y += dilation_y)
         {
-            if((y < 0 || y >= input_h) && has_pads)
+            if ((y < 0 || y >= input_h) && has_pads)
             {
                 // All the values will be the offset (will be zeros when not quantized)
                 memset(static_cast<void *>(out_ptr), pad_value, kernel_width * sizeof(T));
@@ -155,15 +166,16 @@
             }
             else
             {
-                for(int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
+                for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr)
                 {
-                    if((x < 0 || x >= input_w) && has_pads)
+                    if ((x < 0 || x >= input_w) && has_pads)
                     {
                         *out_ptr = pad_value;
                     }
                     else
                     {
-                        *out_ptr = *(reinterpret_cast<const T *>(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
+                        *out_ptr = *(reinterpret_cast<const T *>(
+                            in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x)));
                     }
                 }
             }
@@ -171,7 +183,7 @@
     }
 
     // Append 1 if the convolution layer has biases
-    if(has_bias)
+    if (has_bias)
     {
         *out_ptr = static_cast<T>(1);
     }
@@ -198,36 +210,39 @@
     const int end_y        = start_y + kernel_height * dilation_y;
     const int pad_quant    = kernel_width * input_c;
     const int element_size = static_cast<int>(sizeof(T));
-    if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == input_c * element_size))
+    if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) &&
+        (input_stride_y == input_c * element_size))
     {
-        for(int y = start_y; y < end_y; y += dilation_y)
+        for (int y = start_y; y < end_y; y += dilation_y)
         {
             // Optimized for no dilation and no boundary pixels
-            memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
+            memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)),
+                   input_c * kernel_width * element_size);
             out_ptr += input_c * kernel_width;
         }
     }
     else
     {
-        for(int y = start_y; y < end_y; y += dilation_y)
+        for (int y = start_y; y < end_y; y += dilation_y)
         {
-            if(y < 0 || y >= input_h)
+            if (y < 0 || y >= input_h)
             {
                 memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size);
                 out_ptr += pad_quant;
             }
-            else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size)
+            else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size)
             {
-                for(int x = start_x; x < end_x; x += dilation_x)
+                for (int x = start_x; x < end_x; x += dilation_x)
                 {
-                    if(x < 0 || x >= input_w)
+                    if (x < 0 || x >= input_w)
                     {
                         memset(static_cast<void *>(out_ptr), pad_value, input_c * element_size);
                         out_ptr += input_c;
                     }
                     else
                     {
-                        memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), input_c * element_size);
+                        memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)),
+                               input_c * element_size);
                         out_ptr += input_c;
                     }
                 }
@@ -235,13 +250,14 @@
             else
             {
                 // Optimized for no dilation and no boundary pixels
-                memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), input_c * kernel_width * element_size);
+                memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)),
+                       input_c * kernel_width * element_size);
                 out_ptr += input_c * kernel_width;
             }
         }
     }
     // Append 1 if the convolution layer has biases
-    if(has_bias)
+    if (has_bias)
     {
         *out_ptr = static_cast<T>(1);
     }
@@ -271,12 +287,13 @@
     const int element_size       = static_cast<int>(sizeof(T));
     const int channel_chunk_size = input_c * element_size;
 
-    if((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && (input_stride_y == channel_chunk_size))
+    if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) &&
+        (input_stride_y == channel_chunk_size))
     {
-        for(int y = start_y; y < end_y; y += dilation_y)
+        for (int y = start_y; y < end_y; y += dilation_y)
         {
             const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y);
-            for(int e = 0; e < kernel_width; e++)
+            for (int e = 0; e < kernel_width; e++)
             {
                 memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size);
                 out_ptr += input_c + pad_right;
@@ -285,25 +302,26 @@
     }
     else
     {
-        for(int y = start_y; y < end_y; y += dilation_y)
+        for (int y = start_y; y < end_y; y += dilation_y)
         {
-            if(y < 0 || y >= input_h)
+            if (y < 0 || y >= input_h)
             {
                 memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size);
                 out_ptr += pad_quant;
             }
-            else if(dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size)
+            else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size)
             {
-                for(int x = start_x; x < end_x; x += dilation_x)
+                for (int x = start_x; x < end_x; x += dilation_x)
                 {
-                    if(x < 0 || x >= input_w)
+                    if (x < 0 || x >= input_w)
                     {
                         memset(static_cast<void *>(out_ptr), pad_value, (input_c + pad_right) * element_size);
                         out_ptr += input_c + pad_right;
                     }
                     else
                     {
-                        memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), channel_chunk_size);
+                        memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)),
+                               channel_chunk_size);
                         out_ptr += input_c + pad_right;
                     }
                 }
@@ -311,16 +329,17 @@
             else
             {
                 const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y);
-                for(int e = 0; e < kernel_width; e++)
+                for (int e = 0; e < kernel_width; e++)
                 {
-                    memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size);
+                    memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size),
+                           channel_chunk_size);
                     out_ptr += input_c + pad_right;
                 }
             }
         }
     }
     // Append 1 if the convolution layer has biases
-    if(has_bias)
+    if (has_bias)
     {
         *out_ptr = static_cast<T>(1);
     }
@@ -348,7 +367,8 @@
     const int pad_top        = _conv_info.pad_top();
     const int stride_x       = _conv_info.stride().first;
     const int stride_y       = _conv_info.stride().second;
-    const int pad_value      = is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0;
+    const int pad_value =
+        is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0;
 
     Window window_in_out(window);
     // The first three dimensions of the input and output are increased by the inner loops
@@ -361,84 +381,57 @@
     Iterator out(dst, window_in_out);
 
     execute_window_loop(
-        window, [&](const Coordinates & id)
-    {
-        const int start_w = id[width_idx] * stride_x - pad_left;
-        const int start_h = id[height_idx] * stride_y - pad_top;
-
-        // Get pointers
-        const uint8_t *const input_ptr  = in.ptr();
-        auto                 output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * dst->info()->strides_in_bytes().y());
-
-        // Linearize volume
-        if(is_nchw)
+        window,
+        [&](const Coordinates &id)
         {
-            linearize_volume_nchw<T, has_pads>(input_ptr,
-                                               output_ptr,
-                                               _has_bias,
-                                               start_w,
-                                               start_h,
-                                               _kernel_width,
-                                               _kernel_height,
-                                               input_c,
-                                               input_w,
-                                               input_h,
-                                               input_stride_x,
-                                               input_stride_y,
-                                               input_stride_z,
-                                               pad_value,
-                                               _dilation.x(),
-                                               _dilation.y());
-        }
-        else
-        {
-            if(_input_pad_right > 0)
+            const int start_w = id[width_idx] * stride_x - pad_left;
+            const int start_h = id[height_idx] * stride_y - pad_top;
+
+            // Get pointers
+            const uint8_t *const input_ptr = in.ptr();
+            auto                 output_ptr =
+                reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) *
+                                                      dst->info()->strides_in_bytes().y());
+
+            // Linearize volume
+            if (is_nchw)
             {
-                linearize_volume_nhwc<T, has_pads>(input_ptr,
-                                                   output_ptr,
-                                                   _has_bias,
-                                                   start_w,
-                                                   start_h,
-                                                   _kernel_width,
-                                                   _kernel_height,
-                                                   input_w,
-                                                   input_h,
-                                                   input_c,
-                                                   input_stride_y,
-                                                   input_stride_z,
-                                                   pad_value,
-                                                   _dilation.x(),
-                                                   _dilation.y(),
-                                                   _input_pad_right);
+                linearize_volume_nchw<T, has_pads>(
+                    input_ptr, output_ptr, _has_bias, start_w, start_h, _kernel_width, _kernel_height, input_c, input_w,
+                    input_h, input_stride_x, input_stride_y, input_stride_z, pad_value, _dilation.x(), _dilation.y());
             }
             else
             {
-                linearize_volume_nhwc<T, has_pads>(input_ptr,
-                                                   output_ptr,
-                                                   _has_bias,
-                                                   start_w,
-                                                   start_h,
-                                                   _kernel_width,
-                                                   _kernel_height,
-                                                   input_w,
-                                                   input_h,
-                                                   input_c,
-                                                   input_stride_y,
-                                                   input_stride_z,
-                                                   pad_value,
-                                                   _dilation.x(),
-                                                   _dilation.y());
+                if (_input_pad_right > 0)
+                {
+                    linearize_volume_nhwc<T, has_pads>(input_ptr, output_ptr, _has_bias, start_w, start_h,
+                                                       _kernel_width, _kernel_height, input_w, input_h, input_c,
+                                                       input_stride_y, input_stride_z, pad_value, _dilation.x(),
+                                                       _dilation.y(), _input_pad_right);
+                }
+                else
+                {
+                    linearize_volume_nhwc<T, has_pads>(
+                        input_ptr, output_ptr, _has_bias, start_w, start_h, _kernel_width, _kernel_height, input_w,
+                        input_h, input_c, input_stride_y, input_stride_z, pad_value, _dilation.x(), _dilation.y());
+                }
             }
-        }
-    },
-    in, out);
+        },
+        in, out);
 }
 
-void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                                bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right)
+void CpuIm2ColKernel::configure(const ITensorInfo   *src,
+                                ITensorInfo         *dst,
+                                const Size2D        &kernel_dims,
+                                const PadStrideInfo &conv_info,
+                                bool                 has_bias,
+                                const Size2D        &dilation,
+                                unsigned int         num_groups,
+                                unsigned int         input_pad_right)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
     ARM_COMPUTE_UNUSED(num_groups);
 
     _data_layout                   = src->data_layout();
@@ -451,31 +444,34 @@
     _kernel_height   = kernel_dims.height;
     _input_pad_right = input_pad_right;
     _dilation        = dilation;
-    _convolved_dims  = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx),
-                                         _kernel_width, _kernel_height,
-                                         _conv_info, _dilation);
+    _convolved_dims  = scaled_dimensions(src->dimension(width_idx), dst->dimension(height_idx), _kernel_width,
+                                         _kernel_height, _conv_info, _dilation);
     _has_bias        = has_bias;
 
-    if(_data_layout == DataLayout::NCHW)
+    if (_data_layout == DataLayout::NCHW)
     {
-        switch(src->data_type())
+        switch (src->data_type())
         {
             case DataType::F32:
-                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, true> : &CpuIm2ColKernel::run_im2col<float, true, true>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, true>
+                                                   : &CpuIm2ColKernel::run_im2col<float, true, true>;
                 break;
 #if defined(ARM_COMPUTE_ENABLE_BF16)
             case DataType::BFLOAT16:
-                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, true> : &CpuIm2ColKernel::run_im2col<bfloat16, true, true>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, true>
+                                                   : &CpuIm2ColKernel::run_im2col<bfloat16, true, true>;
                 break;
 #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
-                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, true> : &CpuIm2ColKernel::run_im2col<float16_t, true, true>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, true>
+                                                   : &CpuIm2ColKernel::run_im2col<float16_t, true, true>;
                 break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::QASYMM8_SIGNED:
             case DataType::QASYMM8:
-                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<qasymm8_t, false, true> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, true>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<qasymm8_t, false, true>
+                                                   : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, true>;
                 break;
             default:
                 ARM_COMPUTE_ERROR("Data type not supported");
@@ -484,26 +480,31 @@
     }
     else
     {
-        switch(src->data_type())
+        switch (src->data_type())
         {
             case DataType::F32:
-                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, false> : &CpuIm2ColKernel::run_im2col<float, true, false>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, false>
+                                                   : &CpuIm2ColKernel::run_im2col<float, true, false>;
                 break;
 #if defined(ARM_COMPUTE_ENABLE_BF16)
             case DataType::BFLOAT16:
-                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, false> : &CpuIm2ColKernel::run_im2col<bfloat16, true, false>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, false>
+                                                   : &CpuIm2ColKernel::run_im2col<bfloat16, true, false>;
                 break;
 #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
-                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, false> : &CpuIm2ColKernel::run_im2col<float16_t, true, false>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, false>
+                                                   : &CpuIm2ColKernel::run_im2col<float16_t, true, false>;
                 break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::QASYMM8:
-                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<uint8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<uint8_t, false, false>
+                                                   : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
                 break;
             case DataType::QASYMM8_SIGNED:
-                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<int8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<int8_t, false, false>
+                                                   : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
                 break;
             default:
                 ARM_COMPUTE_ERROR("Data type not supported");
@@ -512,11 +513,13 @@
     }
 
     // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false, num_groups, _input_pad_right)));
+    auto_init_if_empty(
+        *dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation,
+                                                                       false, num_groups, _input_pad_right)));
 
-    std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx),
-                                                                             kernel_dims.width, kernel_dims.height,
-                                                                             conv_info, dilation);
+    std::pair<unsigned int, unsigned int> convolved_dims =
+        scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx), kernel_dims.width, kernel_dims.height,
+                          conv_info, dilation);
 
     Window win = calculate_max_window(*src, Steps());
     win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1));
@@ -526,10 +529,17 @@
     ICpuKernel::configure(win);
 }
 
-Status CpuIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                                 bool has_bias, const Size2D &dilation, unsigned int num_groups, unsigned int input_pad_right)
+Status CpuIm2ColKernel::validate(const ITensorInfo   *src,
+                                 const ITensorInfo   *dst,
+                                 const Size2D        &kernel_dims,
+                                 const PadStrideInfo &conv_info,
+                                 bool                 has_bias,
+                                 const Size2D        &dilation,
+                                 unsigned int         num_groups,
+                                 unsigned int         input_pad_right)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups, input_pad_right));
     return Status{};
 }
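
To make the linearize_volume_* loops above easier to follow: im2col copies each convolution receptive field into one row of the output matrix, substitutes pad_value wherever the window falls outside the input, and appends a 1 when the layer has a bias so the bias folds into the following GEMM. A simplified NCHW reference in plain C++ (float data, dilation 1, symmetric padding; all names are illustrative):

    // Reference im2col, NCHW layout: in holds c*h*w floats, out must hold
    // out_h * out_w rows of (c * kh * kw + has_bias) floats each.
    void im2col_nchw_reference(const float *in, float *out,
                               int c, int w, int h, // input shape
                               int kw, int kh,      // kernel size
                               int stride, int pad, bool has_bias,
                               float pad_value)
    {
        const int out_w = (w + 2 * pad - kw) / stride + 1;
        const int out_h = (h + 2 * pad - kh) / stride + 1;
        float    *dst   = out;
        for (int oy = 0; oy < out_h; ++oy)
        {
            for (int ox = 0; ox < out_w; ++ox)
            {
                for (int d = 0; d < c; ++d)
                {
                    for (int ky = 0; ky < kh; ++ky)
                    {
                        for (int kx = 0; kx < kw; ++kx)
                        {
                            const int  y      = oy * stride - pad + ky;
                            const int  x      = ox * stride - pad + kx;
                            const bool inside = (x >= 0 && x < w && y >= 0 && y < h);
                            *dst++            = inside ? in[(d * h + y) * w + x] : pad_value;
                        }
                    }
                }
                if (has_bias)
                {
                    *dst++ = 1.f; // lets the GEMM apply the bias as an extra column
                }
            }
        }
    }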
 
diff --git a/src/cpu/kernels/CpuIm2ColKernel.h b/src/cpu/kernels/CpuIm2ColKernel.h
index d133f8d..2cb2617 100644
--- a/src/cpu/kernels/CpuIm2ColKernel.h
+++ b/src/cpu/kernels/CpuIm2ColKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_IM2COL_KERNEL_H
 
 #include "arm_compute/core/Size2D.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -78,16 +79,28 @@
      * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
      * @param[in]  input_pad_right (Optional) When fast-math is selected, per element padding for the im2col matrix may be necessary
      */
-    void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                   bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0);
+    void configure(const ITensorInfo   *src,
+                   ITensorInfo         *dst,
+                   const Size2D        &kernel_dims,
+                   const PadStrideInfo &conv_info,
+                   bool                 has_bias,
+                   const Size2D        &dilation        = Size2D(1U, 1U),
+                   unsigned int         num_groups      = 1,
+                   unsigned int         input_pad_right = 0);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuIm2ColKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                           bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1, unsigned int input_pad_right = 0);
+    static Status validate(const ITensorInfo   *src,
+                           const ITensorInfo   *dst,
+                           const Size2D        &kernel_dims,
+                           const PadStrideInfo &conv_info,
+                           bool                 has_bias,
+                           const Size2D        &dilation        = Size2D(1U, 1U),
+                           unsigned int         num_groups      = 1,
+                           unsigned int         input_pad_right = 0);
 
     // Inherited methods overridden:
     void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -117,15 +130,15 @@
      */
     using Im2ColFunctionPtr = void (CpuIm2ColKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
 
-    Im2ColFunctionPtr                     _func{ nullptr };
+    Im2ColFunctionPtr                     _func{nullptr};
     std::pair<unsigned int, unsigned int> _convolved_dims{};
     PadStrideInfo                         _conv_info{};
-    unsigned int                          _kernel_width{ 0 };
-    unsigned int                          _kernel_height{ 0 };
-    unsigned int                          _input_pad_right{ 0 };
-    bool                                  _has_bias{ false };
-    Size2D                                _dilation{ 1U, 1U };
-    DataLayout                            _data_layout{ DataLayout::UNKNOWN };
+    unsigned int                          _kernel_width{0};
+    unsigned int                          _kernel_height{0};
+    unsigned int                          _input_pad_right{0};
+    bool                                  _has_bias{false};
+    Size2D                                _dilation{1U, 1U};
+    DataLayout                            _data_layout{DataLayout::UNKNOWN};
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuKernelSelectionTypes.h b/src/cpu/kernels/CpuKernelSelectionTypes.h
index 39adc9a..b7daa4d 100644
--- a/src/cpu/kernels/CpuKernelSelectionTypes.h
+++ b/src/cpu/kernels/CpuKernelSelectionTypes.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_KERNEL_SELECTION_TYPES_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/common/cpuinfo/CpuIsaInfo.h"
 
 namespace arm_compute
@@ -78,10 +79,10 @@
 
 struct ActivationDataTypeISASelectorData
 {
-    DataType                                dt;
-    const CPUModel                         &cpumodel;
-    const cpuinfo::CpuIsaInfo              &isa;
-    const ActivationFunction f;
+    DataType                   dt;
+    const CPUModel            &cpumodel;
+    const cpuinfo::CpuIsaInfo &isa;
+    const ActivationFunction   f;
 };
 
 struct CpuAddKernelDataTypeISASelectorData
@@ -99,15 +100,19 @@
 };
 
 // Selector pointer types
-using DataTypeISASelectorPtr                      = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
-using DataTypeDataLayoutSelectorPtr               = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type;
-using PoolDataTypeISASelectorPtr                  = std::add_pointer<bool(const PoolDataTypeISASelectorData &data)>::type;
-using ElementwiseDataTypeISASelectorPtr           = std::add_pointer<bool(const ElementwiseDataTypeISASelectorData &data)>::type;
-using DepthwiseConv2dNativeDataTypeISASelectorPtr = std::add_pointer<bool(const DepthwiseConv2dNativeDataTypeISASelectorData &data)>::type;
-using CastDataTypeISASelectorDataPtr              = std::add_pointer<bool(const CastDataTypeISASelectorData &data)>::type;
-using ActivationDataTypeISASelectorDataPtr        = std::add_pointer<bool(const ActivationDataTypeISASelectorData &data)>::type;
-using CpuAddKernelDataTypeISASelectorDataPtr      = std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type;
-using ScaleKernelDataTypeISASelectorDataPtr       = std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type;
+using DataTypeISASelectorPtr            = std::add_pointer<bool(const DataTypeISASelectorData &data)>::type;
+using DataTypeDataLayoutSelectorPtr     = std::add_pointer<bool(const DataTypeDataLayoutISASelectorData &data)>::type;
+using PoolDataTypeISASelectorPtr        = std::add_pointer<bool(const PoolDataTypeISASelectorData &data)>::type;
+using ElementwiseDataTypeISASelectorPtr = std::add_pointer<bool(const ElementwiseDataTypeISASelectorData &data)>::type;
+using DepthwiseConv2dNativeDataTypeISASelectorPtr =
+    std::add_pointer<bool(const DepthwiseConv2dNativeDataTypeISASelectorData &data)>::type;
+using CastDataTypeISASelectorDataPtr = std::add_pointer<bool(const CastDataTypeISASelectorData &data)>::type;
+using ActivationDataTypeISASelectorDataPtr =
+    std::add_pointer<bool(const ActivationDataTypeISASelectorData &data)>::type;
+using CpuAddKernelDataTypeISASelectorDataPtr =
+    std::add_pointer<bool(const CpuAddKernelDataTypeISASelectorData &data)>::type;
+using ScaleKernelDataTypeISASelectorDataPtr =
+    std::add_pointer<bool(const ScaleKernelDataTypeISASelectorData &data)>::type;
 
 } // namespace kernels
 } // namespace cpu
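
All of these selector-pointer aliases feed one mechanism: each kernel keeps a static table of {name, selector predicate, ukernel} entries, and get_implementation returns the first entry whose predicate accepts the runtime data (data type, ISA features, and so on). A stripped-down sketch of that lookup (hypothetical types, not the actual ICpuKernel machinery):

    #include <vector>

    struct SelectorData
    {
        bool is_fp16_supported; // stand-in for the DataType/CpuIsaInfo fields
    };

    struct KernelEntry
    {
        const char *name;
        bool (*is_selected)(const SelectorData &);
        void (*ukernel)();
    };

    inline const KernelEntry *get_implementation(const std::vector<KernelEntry> &table,
                                                 const SelectorData             &data)
    {
        for (const auto &entry : table)
        {
            if (entry.is_selected(data))
            {
                return &entry; // first match wins, as in the tables above
            }
        }
        return nullptr; // callers assert non-null before use
    }
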
diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp
index 7d077c7..bcaa76b 100644
--- a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp
+++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.cpp
@@ -24,11 +24,12 @@
 #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
 
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/maxunpool/list.h"
@@ -43,50 +44,43 @@
 
 namespace
 {
-static const std::vector<CpuMaxUnpoolingLayerKernel::MaxUnpoolingKernel> available_kernels =
-{
-    {
-        "neon_fp32_maxunpooling",
-        [](const DataTypeISASelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(neon_fp32_maxunpooling)
-    },
-    {
-        "neon_fp16_maxunpooling",
-        [](const DataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
-        REGISTER_FP16_NEON(neon_fp16_maxunpooling)
-    },
-    {
-        "neon_qu8_maxunpooling",
-        [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
-        REGISTER_QASYMM8_NEON(neon_qs8_maxunpooling)
-    },
-    {
-        "neon_qs8_maxunpooling",
-        [](const DataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
-        REGISTER_QASYMM8_SIGNED_NEON(neon_qu8_maxunpooling)
-    },
+static const std::vector<CpuMaxUnpoolingLayerKernel::MaxUnpoolingKernel> available_kernels = {
+    {"neon_fp32_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+     REGISTER_FP32_NEON(neon_fp32_maxunpooling)},
+    {"neon_fp16_maxunpooling",
+     [](const DataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+     REGISTER_FP16_NEON(neon_fp16_maxunpooling)},
+    {"neon_qu8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+     REGISTER_QASYMM8_NEON(neon_qu8_maxunpooling)},
+    {"neon_qs8_maxunpooling", [](const DataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+     REGISTER_QASYMM8_SIGNED_NEON(neon_qs8_maxunpooling)},
 };
 
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+Status validate_arguments(const ITensorInfo      *src,
+                          const ITensorInfo      *indices,
+                          const ITensorInfo      *dst,
+                          const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, indices);
 
-    int                 pool_stride_x   = 0;
-    int                 pool_stride_y   = 0;
-    PoolingType         pool_type       = pool_info.pool_type;
-    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
+    int                 pool_stride_x      = 0;
+    int                 pool_stride_y      = 0;
+    PoolingType         pool_type          = pool_info.pool_type;
+    const PadStrideInfo pad_stride_info    = pool_info.pad_stride_info;
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-    const int    pool_size_x = pool_info.pool_size.width;
-    const int    pool_size_y = pool_info.pool_size.height;
+    const int    pool_size_x               = pool_info.pool_size.width;
+    const int    pool_size_y               = pool_info.pool_size.height;
     const Size2D pool_size(pool_size_x, pool_size_y);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+                                    "Pooling indices only supported for MAX pooling method");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
@@ -96,13 +90,17 @@
 }
 } // namespace
 
-void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+void CpuMaxUnpoolingLayerKernel::configure(const ITensorInfo      *src,
+                                           const ITensorInfo      *indices,
+                                           ITensorInfo            *dst,
+                                           const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, indices);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, indices, dst, pool_info));
     ARM_COMPUTE_UNUSED(indices);
 
-    const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+    const auto uk = CpuMaxUnpoolingLayerKernel::get_implementation(
+        DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
     _run_method = uk->ukernel;
 
@@ -113,7 +111,10 @@
     ICpuKernel::configure(window);
 }
 
-Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+Status CpuMaxUnpoolingLayerKernel::validate(const ITensorInfo      *src,
+                                            const ITensorInfo      *indices,
+                                            const ITensorInfo      *dst,
+                                            const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, indices, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, indices, dst, pool_info));
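
For context on what the selected ukernel computes: max unpooling scatters each source element back to the flat destination position recorded in the indices tensor by the preceding 2x2 max pooling, and every other destination element stays zero. A minimal reference in plain C++ (float data with U32 indices, matching the requirements validated above; names are illustrative):

    #include <cstdint>
    #include <vector>

    std::vector<float> max_unpool_reference(const std::vector<float>    &src,
                                            const std::vector<uint32_t> &indices,
                                            size_t                       dst_size)
    {
        std::vector<float> dst(dst_size, 0.f);
        for (size_t i = 0; i < src.size(); ++i)
        {
            dst[indices[i]] = src[i]; // indices[i] < dst_size is assumed valid
        }
        return dst;
    }
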
diff --git a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h
index d0c1347..5a641a2 100644
--- a/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h
+++ b/src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h
@@ -37,7 +37,8 @@
 class CpuMaxUnpoolingLayerKernel : public ICpuKernel<CpuMaxUnpoolingLayerKernel>
 {
 private:
-    using MaxUnpoolingUKernelPtr = std::add_pointer<void(const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)>::type;
+    using MaxUnpoolingUKernelPtr = std::add_pointer<void(
+        const ITensor *input, const ITensor *indices, ITensor *output, const Window &window)>::type;
 
 public:
     /** Default constructor */
@@ -56,7 +57,8 @@
      * @param[out] dst       Destination tensor. Data types supported: Same as @p src
      * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
-    void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+    void
+    configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CpuMaxUnpoolingLayerKernel
      *
      * @param[in]  src       Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -66,7 +68,10 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo      *src,
+                           const ITensorInfo      *indices,
+                           const ITensorInfo      *dst,
+                           const PoolingLayerInfo &pool_info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -83,7 +88,7 @@
     const char *name() const override;
 
 private:
-    MaxUnpoolingUKernelPtr _run_method{ nullptr };
+    MaxUnpoolingUKernelPtr _run_method{nullptr};
 };
 
 } // namespace kernels
diff --git a/src/cpu/kernels/CpuMulKernel.cpp b/src/cpu/kernels/CpuMulKernel.cpp
index b73d2bd..ba086e3 100644
--- a/src/cpu/kernels/CpuMulKernel.cpp
+++ b/src/cpu/kernels/CpuMulKernel.cpp
@@ -25,23 +25,24 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
+
 #include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NESymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
 
 #include <arm_neon.h>
 
 namespace
 {
 #if defined(ENABLE_FP32_KERNELS)
-    static constexpr size_t default_mws_N1_fp32_neon = 22447;
-    static constexpr size_t default_mws_V1_fp32_neon = 38982;
+static constexpr size_t default_mws_N1_fp32_neon = 22447;
+static constexpr size_t default_mws_V1_fp32_neon = 38982;
 #endif /* ENABLE_FP32_KERNELS */
-    static constexpr size_t default_mws_other_platforms_1d_tensor = 10240;
-}
+static constexpr size_t default_mws_other_platforms_1d_tensor = 10240;
+} // namespace
 namespace arm_compute
 {
 namespace cpu
@@ -54,29 +55,38 @@
 const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant);
 const float32x4_t positive_round_f32q    = vdupq_n_f32(0.5f);
 
-inline Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+inline Status validate_arguments(const ITensorInfo *src1,
+                                 const ITensorInfo *src2,
+                                 const ITensorInfo *dst,
+                                 float              scale,
+                                 ConvertPolicy      overflow_policy,
+                                 RoundingPolicy     rounding_policy)
 {
     ARM_COMPUTE_UNUSED(overflow_policy);
     ARM_COMPUTE_UNUSED(rounding_policy);
 
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
-                                                         DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32, DataType::QSYMM16, DataType::F16,
-                                                         DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::S16, DataType::QSYMM16,
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32,
+                                                         DataType::QSYMM16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S16, DataType::S32,
+                                                         DataType::QSYMM16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
                                                          DataType::S32, DataType::F16, DataType::F32);
-    if(is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
+    if (is_data_type_quantized(src1->data_type()) || is_data_type_quantized(src2->data_type()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP, "ConvertPolicy cannot be WRAP if datatype is quantized");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(overflow_policy == ConvertPolicy::WRAP,
+                                        "ConvertPolicy cannot be WRAP if datatype is quantized");
     }
 
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+                                        "Wrong shape for dst");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
         // clang-format off
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(
@@ -88,13 +98,17 @@
             !(src1->data_type() == DataType::QSYMM16 && src2->data_type() == DataType::QSYMM16 && dst->data_type() == DataType::S32)
             , "Invalid data type combination");
         // clang-format on
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 && scale != 1.f, "Unsupported scale for QSYMM16 inputs and S32 dst");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S16 && dst->data_type() == DataType::S32 &&
+                                            scale != 1.f,
+                                        "Unsupported scale for QSYMM16 inputs and S32 dst");
     }
 
-    if(std::abs(scale - scale255_constant) < 0.00001f)
+    if (std::abs(scale - scale255_constant) < 0.00001f)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 && dst->data_type() == DataType::S32,
+        ARM_COMPUTE_RETURN_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP &&
+                                    rounding_policy != RoundingPolicy::TO_NEAREST_EVEN);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->data_type() == DataType::S32 && src2->data_type() == DataType::S32 &&
+                                            dst->data_type() == DataType::S32,
                                         "Scale == 1/255 is not supported if input and dst are of data type S32");
     }
     else
@@ -107,7 +121,8 @@
         // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
         // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -14 <= e <= 1
         // Moreover, it will be negative as we deal with 1/2^n
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)), "Scale value not supported (Should be 1/(2^n) or 1/255");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)),
+                                        "Scale value not supported (Should be 1/(2^n) or 1/255)");
     }
 
     return Status{};
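
For reference, the validation above accepts only a scale of 1/255 or an exact 1/(2^n) with 0 <= n <= 15. A minimal standalone sketch of that predicate (helper name hypothetical, not part of this patch):

    #include <cmath>

    // True if scale is 1/255 or an exact 1/(2^n) for 0 <= n <= 15.
    // std::frexp() decomposes scale as mantissa * 2^exponent with 0.5 <= mantissa < 1,
    // so 1/(2^n) yields mantissa == 0.5f and exponent == 1 - n, i.e. -14 <= e <= 1.
    inline bool is_supported_scale(float scale)
    {
        if (std::abs(scale - 1.f / 255.f) < 0.00001f)
        {
            return true;
        }
        int         exponent            = 0;
        const float normalized_mantissa = std::frexp(scale, &exponent);
        return (normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1);
    }
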
@@ -168,9 +183,9 @@
     const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
 
     const UniformQuantizationInfo output_qua_info = out->info()->quantization_info().uniform();
-    const UniformQuantizationInfo tmp_qua_info    = { output_qua_info.scale / scale, output_qua_info.offset };
+    const UniformQuantizationInfo tmp_qua_info    = {output_qua_info.scale / scale, output_qua_info.offset};
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
         Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -178,7 +193,7 @@
         const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? src2 : src1;
         const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? src2 : src1;
         const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
-        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
+        const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
 
         // Clear X Dimension on execution window as we handle manually
         non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -190,52 +205,52 @@
         using ExactTagType = typename wrapper::traits::neon_vector<T, window_step_x>::tag_type;
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<T *>(dst.ptr());
-
-            const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
-            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+                const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<T *>(dst.ptr());
 
-                // Dequantize inputs
-                const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
-                const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);
+                const auto broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
+                const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
 
-                const float32x4x4_t out_f32x4x4 =
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
-                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
-                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
-                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
-                };
+                    const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
 
-                // Quantize dst
-                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
-                wrapper::vstore(output_ptr + x, result);
-            }
+                    // Dequantize inputs
+                    const float32x4x4_t in1_f32x4x4 = vdequantize(non_broadcast_v, non_broadcast_qinfo);
+                    const float32x4x4_t in2_f32x4x4 = vdequantize(broadcast_value_vec, broadcast_qinfo);
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                // Dequantize inputs
-                const T     src1    = *(non_broadcast_input_ptr + x);
-                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
-                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
-                const float tmp_f   = tmp_in1 * tmp_in2;
+                    const float32x4x4_t out_f32x4x4 = {
+                        vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+                        vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+                        vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+                        vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+                    };
 
-                // Quantize dst
-                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
-                *(output_ptr + x)  = tmp_qua;
-            }
-        },
-        broadcast_input, non_broadcast_input, dst);
+                    // Quantize dst
+                    const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
+                    wrapper::vstore(output_ptr + x, result);
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    // Dequantize inputs
+                    const T     src1    = *(non_broadcast_input_ptr + x);
+                    const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, non_broadcast_qinfo);
+                    const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(broadcast_value, broadcast_qinfo);
+                    const float tmp_f   = tmp_in1 * tmp_in2;
+
+                    // Quantize dst
+                    const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
+                    *(output_ptr + x)  = tmp_qua;
+                }
+            },
+            broadcast_input, non_broadcast_input, dst);
     }
     else
     {
@@ -251,56 +266,59 @@
         Iterator dst(out, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
-
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const auto input1_q = wrapper::vloadq(input1_ptr + x);
-                const auto input2_q = wrapper::vloadq(input2_ptr + x);
+                const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<T *>(dst.ptr());
 
-                // Dequantize inputs
-                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
-                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
-
-                const float32x4x4_t out_f32x4x4 =
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
-                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
-                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
-                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
-                };
+                    const auto input1_q = wrapper::vloadq(input1_ptr + x);
+                    const auto input2_q = wrapper::vloadq(input2_ptr + x);
 
-                // Quantize dst
-                const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
-                wrapper::vstore(output_ptr + x, result);
-            }
+                    // Dequantize inputs
+                    const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
+                    const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                // Dequantize inputs
-                const T     src1    = *(input1_ptr + x);
-                const T     src2    = *(input2_ptr + x);
-                const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
-                const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
-                const float tmp_f   = tmp_in1 * tmp_in2;
+                    const float32x4x4_t out_f32x4x4 = {
+                        vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+                        vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+                        vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+                        vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+                    };
 
-                // Quantize dst
-                const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
-                *(output_ptr + x)  = tmp_qua;
-            }
-        },
-        input1, input2, dst);
+                    // Quantize dst
+                    const auto result = vquantize<T>(out_f32x4x4, tmp_qua_info);
+                    wrapper::vstore(output_ptr + x, result);
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    // Dequantize inputs
+                    const T     src1    = *(input1_ptr + x);
+                    const T     src2    = *(input2_ptr + x);
+                    const float tmp_in1 = Qasymm8QuantizationHelper<T>::dequantize(src1, input1_qua_info);
+                    const float tmp_in2 = Qasymm8QuantizationHelper<T>::dequantize(src2, input2_qua_info);
+                    const float tmp_f   = tmp_in1 * tmp_in2;
+
+                    // Quantize dst
+                    const auto tmp_qua = Qasymm8QuantizationHelper<T>::quantize(tmp_f, tmp_qua_info);
+                    *(output_ptr + x)  = tmp_qua;
+                }
+            },
+            input1, input2, dst);
     }
 }
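
The quantized kernel above implements the usual dequantize -> multiply -> requantize scheme, with the extra scale factor folded into the output quantization (tmp_qua_info). A scalar float reference of the same per-element math (illustrative sketch; helper name and signature are assumptions):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // out = quantize(dequantize(a) * dequantize(b)), where so/zo is the output
    // quantization with its scale already divided by the kernel's scale factor.
    inline uint8_t mul_qasymm8_scalar(uint8_t a, float sa, int32_t za, // input 0 scale / offset
                                      uint8_t b, float sb, int32_t zb, // input 1 scale / offset
                                      float so, int32_t zo)            // adjusted output scale / offset
    {
        const float   fa  = (static_cast<int32_t>(a) - za) * sa;
        const float   fb  = (static_cast<int32_t>(b) - zb) * sb;
        const int32_t out = static_cast<int32_t>(std::lround(fa * fb / so)) + zo;
        return static_cast<uint8_t>(std::clamp(out, 0, 255));
    }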
 
-bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, float scale)
+bool mul_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
+                                     const ITensorInfo *src1,
+                                     const ITensorInfo *dst,
+                                     float              scale)
 {
     const auto iq0 = src0->quantization_info().uniform();
     const auto iq1 = src1->quantization_info().uniform();
@@ -308,7 +326,7 @@
 
     const auto multiplier = ((iq0.scale * iq1.scale) / oq.scale) * scale;
 
-    if(multiplier < -8191.f || multiplier > 8191.f)
+    if (multiplier < -8191.f || multiplier > 8191.f)
     {
         // The multiplier cannot be stored as a 14.18 signed fixed-point number
         return false;
@@ -318,7 +336,7 @@
 
     const auto max_result = multiplier * (256) * (256) + offset_out;
 
-    if(max_result > 8191.f)
+    if (max_result > 8191.f)
     {
         // It might not be possible to store the result as a 14.18 signed fixed-point number.
         return false;
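
Both bounds above come from the 14.18 signed fixed-point format this path relies on: 14 integer bits (sign included) and 18 fractional bits packed into an int32_t, so magnitudes must stay below 2^13 = 8192. A minimal sketch of the encoding (assumed helpers, illustration only):

    #include <cstdint>

    constexpr float two_pwr18f = static_cast<float>(1 << 18);

    // Encode/decode a float as 14.18 signed fixed point; representable values
    // lie in (-8192, 8192), hence the 8191.f limits checked above.
    inline int32_t to_14p18(float v)
    {
        return static_cast<int32_t>(v * two_pwr18f);
    }
    inline float from_14p18(int32_t v)
    {
        return static_cast<float>(v) / two_pwr18f;
    }
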
@@ -366,7 +384,7 @@
     const auto out_offset_14p18 = static_cast<int32_t>(out_offset * two_pwr18i);
     const auto multiplier_14p18 = static_cast<int32_t>(multiplier * two_pwr18f);
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         // Prefix: a = non-broadcast, b = broadcast.
 
@@ -392,78 +410,76 @@
         Iterator out_it(dst, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto a_ptr   = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
-            const auto b_ptr   = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
-            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
-
-            const auto b_val            = *b_ptr;
-            const auto b_offseted_32p0  = static_cast<int32_t>(b_val - b_offset_16p0);
-            const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());
-
-            const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
-            const auto voffsetout_14p18  = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
-
-            int x = window_start_x;
-
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                // Load the inputs.
-                const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
-
-                // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
-                const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
-                const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
-
-                const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
-                const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
-                const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
-                const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);
-
-                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
-                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
-                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
-                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);
-
-                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
-                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
-                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
-                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
-
-                // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
-                const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
-                const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
-                const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
-                const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
-
-                const auto vout_15p1_0 = wrapper::vcombine(
-                                             vout_15p1_00,
-                                             vout_15p1_01);
-
-                const auto vout_15p1_1 = wrapper::vcombine(
-                                             vout_15p1_10,
-                                             vout_15p1_11);
+                const auto a_ptr   = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
+                const auto b_ptr   = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
                 const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
 
-                const auto vout_8p0 = wrapper::vcombine(
-                                          wrapper::vqrshrn<2>(vout_15p1_0),
-                                          wrapper::vqrshrn<2>(vout_15p1_1));
-                wrapper::vstore(out_ptr + x, vout_8p0);
-            }
+                const auto b_val            = *b_ptr;
+                const auto b_offseted_32p0  = static_cast<int32_t>(b_val - b_offset_16p0);
+                const auto b_voffseted_32p0 = wrapper::vdup_n(b_offseted_32p0, wrapper::traits::vector_128_tag());
 
-            //Process the left-over elements.
-            for(; x < window_end_x; ++x)
-            {
+                const auto vmultiplier_14p18 = wrapper::vdup_n(multiplier_14p18, wrapper::traits::vector_128_tag());
+                const auto voffsetout_14p18  = wrapper::vdup_n(out_offset_14p18, wrapper::traits::vector_128_tag());
+
+                int x = window_start_x;
+
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    // Load the inputs.
+                    const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
+
+                    // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
+                    const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
+                    const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
+
+                    const auto voffseted_32p0_00 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_0), a_voffset_16p0);
+                    const auto voffseted_32p0_01 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_0), a_voffset_16p0);
+                    const auto voffseted_32p0_10 = wrapper::vsubl(wrapper::vgetlow(a_vin_16p0_1), a_voffset_16p0);
+                    const auto voffseted_32p0_11 = wrapper::vsubl(wrapper::vgethigh(a_vin_16p0_1), a_voffset_16p0);
+
+                    const auto vinnermul_32p0_00 = wrapper::vmul(voffseted_32p0_00, b_voffseted_32p0);
+                    const auto vinnermul_32p0_01 = wrapper::vmul(voffseted_32p0_01, b_voffseted_32p0);
+                    const auto vinnermul_32p0_10 = wrapper::vmul(voffseted_32p0_10, b_voffseted_32p0);
+                    const auto vinnermul_32p0_11 = wrapper::vmul(voffseted_32p0_11, b_voffseted_32p0);
+
+                    const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
+                    const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
+                    const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
+                    const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
+
+                    // These right shifts revert the multiplication by two_pwr18. A hard limit of 8 per shift requires multiple shift instructions to achieve this.
+                    const auto vout_15p1_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
+                    const auto vout_15p1_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
+                    const auto vout_15p1_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
+                    const auto vout_15p1_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
+
+                    const auto vout_15p1_0 = wrapper::vcombine(vout_15p1_00, vout_15p1_01);
+
+                    const auto vout_15p1_1 = wrapper::vcombine(vout_15p1_10, vout_15p1_11);
+
+                    const auto vout_8p0 =
+                        wrapper::vcombine(wrapper::vqrshrn<2>(vout_15p1_0), wrapper::vqrshrn<2>(vout_15p1_1));
+                    wrapper::vstore(out_ptr + x, vout_8p0);
+                }
+
+                // Process the left-over elements.
+                for (; x < window_end_x; ++x)
+                {
 #ifdef __aarch64__
-                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(
-                                                                                                             b_val) - b_offset_16p0)) + out_offset_14p18)));
+                    out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(
+                        (multiplier_14p18 * (int32_t(a_ptr[x]) - a_offset_16p0) * (int32_t(b_val) - b_offset_16p0)) +
+                        out_offset_14p18)));
 #else  //__aarch64__
-                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
+                    out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(
+                        multiplier * ((float(a_ptr[x]) - a_offset) * (float(b_val) - b_offset)) + float(out_offset)));
 #endif //__aarch64__
-            }
-        },
-        a_input_it, b_input_it, out_it);
+                }
+            },
+            a_input_it, b_input_it, out_it);
     }
     else
     {
@@ -481,82 +497,83 @@
         Iterator out_it(dst, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
-            const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
-            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
-
-            int x = window_start_x;
-
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                // Load the inputs.
-                const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
-                const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
+                const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
+                const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
+                const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
 
-                // Widen the input elements to signed 16-bit regardless of the input signedness.
-                const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
-                const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
-                const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
-                const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
+                int x = window_start_x;
 
-                const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
-                const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
-                const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
-                const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    // Load the inputs.
+                    const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
+                    const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
 
-                const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
-                const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
-                const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
-                const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);
+                    // Widen the input elements to signed 16-bit regardless of the input signedness.
+                    const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
+                    const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
+                    const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
+                    const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
 
-                const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
-                const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
-                const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
-                const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);
+                    const auto voffseted0_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_0), voffset0_16p0);
+                    const auto voffseted0_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_0), voffset0_16p0);
+                    const auto voffseted0_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin0_16p0_1), voffset0_16p0);
+                    const auto voffseted0_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin0_16p0_1), voffset0_16p0);
 
-                const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
-                const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
-                const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
-                const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
+                    const auto voffseted1_32p0_00 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_0), voffset1_16p0);
+                    const auto voffseted1_32p0_01 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_0), voffset1_16p0);
+                    const auto voffseted1_32p0_10 = wrapper::vsubl(wrapper::vgetlow(vin1_16p0_1), voffset1_16p0);
+                    const auto voffseted1_32p0_11 = wrapper::vsubl(wrapper::vgethigh(vin1_16p0_1), voffset1_16p0);
 
-                // These shift rights are to revert the multiplication by twopwr18. Hard limit of a maximum shift by 8 requires multiple shift instructions to achieve this.
-                const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
-                const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
-                const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
-                const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
+                    const auto vinnermul_32p0_00 = wrapper::vmul(voffseted0_32p0_00, voffseted1_32p0_00);
+                    const auto vinnermul_32p0_01 = wrapper::vmul(voffseted0_32p0_01, voffseted1_32p0_01);
+                    const auto vinnermul_32p0_10 = wrapper::vmul(voffseted0_32p0_10, voffseted1_32p0_10);
+                    const auto vinnermul_32p0_11 = wrapper::vmul(voffseted0_32p0_11, voffseted1_32p0_11);
 
-                const auto vout_14p2_0 = wrapper::vcombine(
-                                             vout_14p2_00,
-                                             vout_14p2_01);
+                    const auto vout_14p18_00 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_00, vmultiplier_14p18);
+                    const auto vout_14p18_01 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_01, vmultiplier_14p18);
+                    const auto vout_14p18_10 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_10, vmultiplier_14p18);
+                    const auto vout_14p18_11 = wrapper::vmla(voffsetout_14p18, vinnermul_32p0_11, vmultiplier_14p18);
 
-                const auto vout_14p2_1 = wrapper::vcombine(
-                                             vout_14p2_10,
-                                             vout_14p2_11);
+                    // These right shifts revert the multiplication by two_pwr18. A hard limit of 8 per shift requires multiple shift instructions to achieve this.
+                    const auto vout_14p2_00 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_00));
+                    const auto vout_14p2_01 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_01));
+                    const auto vout_14p2_10 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_10));
+                    const auto vout_14p2_11 = wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>(vout_14p18_11));
 
-                const auto vout_8p0 = wrapper::vcombine(
-                                          wrapper::vqrshrn<2>(vout_14p2_0),
-                                          wrapper::vqrshrn<2>(vout_14p2_1));
-                wrapper::vstore(out_ptr + x, vout_8p0);
-            }
+                    const auto vout_14p2_0 = wrapper::vcombine(vout_14p2_00, vout_14p2_01);
 
-            //Process the left-over elements.
-            for(; x < window_end_x; ++x)
-            {
+                    const auto vout_14p2_1 = wrapper::vcombine(vout_14p2_10, vout_14p2_11);
+
+                    const auto vout_8p0 =
+                        wrapper::vcombine(wrapper::vqrshrn<2>(vout_14p2_0), wrapper::vqrshrn<2>(vout_14p2_1));
+                    wrapper::vstore(out_ptr + x, vout_8p0);
+                }
+
+                // Process the left-over elements.
+                for (; x < window_end_x; ++x)
+                {
 #ifdef __aarch64__
-                out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) * (int32_t(
-                                                                                                             in1_ptr[x]) - in1_offset_16p0)) + out_offset_14p18)));
+                    out_ptr[x] = wrapper::vqrshrn<2>(wrapper::vqrshrn_ex<8, ScalarType>(
+                        wrapper::vshrq_n<8>((multiplier_14p18 * (int32_t(in0_ptr[x]) - in0_offset_16p0) *
+                                             (int32_t(in1_ptr[x]) - in1_offset_16p0)) +
+                                            out_offset_14p18)));
 #else  //__aarch64__
-                out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) + float(out_offset)));
+                    out_ptr[x] = utility::clamp<int32_t, ScalarType>(support::cpp11::lround(
+                        multiplier * ((float(in0_ptr[x]) - in0_offset) * (float(in1_ptr[x]) - in1_offset)) +
+                        float(out_offset)));
 #endif //__aarch64__
-            }
-        },
-        in0_it, in1_it, out_it);
+                }
+            },
+            in0_it, in1_it, out_it);
     }
 }
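
In the fixed-point path above, each product (in0 - offset0) * (in1 - offset1) is scaled by the 14.18 multiplier, the 14.18 output offset is added, and the 18 fractional bits are removed in stages (8 + 8 + 2, matching vshrq_n<8>, vqrshrn_ex<8, ...> and vqrshrn<2>). A scalar approximation of the per-element math (illustration only; it rounds once at the end rather than at every narrowing stage, and the helper name is hypothetical):

    #include <algorithm>
    #include <cstdint>

    inline int8_t mul_fixedpoint_scalar(int8_t in0, int8_t in1, int32_t offset0, int32_t offset1,
                                        int32_t multiplier_14p18, int32_t out_offset_14p18)
    {
        const int32_t acc_14p18 = multiplier_14p18 * (static_cast<int32_t>(in0) - offset0) *
                                      (static_cast<int32_t>(in1) - offset1) +
                                  out_offset_14p18;
        // Round to nearest while dropping the 18 fractional bits.
        const int32_t rounded = (acc_14p18 + (1 << 17)) >> 18;
        return static_cast<int8_t>(std::clamp(rounded, -128, 127));
    }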
 
-void mul_saturate_QSYMM16_QSYMM16_QSYMM16(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
+void mul_saturate_QSYMM16_QSYMM16_QSYMM16(
+    const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, float scale)
 {
     const UniformQuantizationInfo input1_qua_info = src1->info()->quantization_info().uniform();
     const UniformQuantizationInfo input2_qua_info = src2->info()->quantization_info().uniform();
@@ -580,66 +597,61 @@
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
 
-    const UniformQuantizationInfo tmp_qua_info = { output_qua_info.scale / scale, output_qua_info.offset };
+    const UniformQuantizationInfo tmp_qua_info = {output_qua_info.scale / scale, output_qua_info.offset};
 
     execute_window_loop(
-        win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());
-
-        // Compute window_step_x elements per iteration
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        win,
+        [&](const Coordinates &)
         {
-            const qsymm16x8x2_t input1_q =
+            const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<qsymm16_t *>(dst.ptr());
+
+            // Compute window_step_x elements per iteration
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                {
+                const qsymm16x8x2_t input1_q = {{
                     vld1q_s16(input1_ptr + x),
                     vld1q_s16(input1_ptr + x + 8),
-                }
-            };
-            const qsymm16x8x2_t input2_q =
-            {
-                {
+                }};
+                const qsymm16x8x2_t input2_q = {{
                     vld1q_s16(input2_ptr + x),
                     vld1q_s16(input2_ptr + x + 8),
-                }
-            };
+                }};
 
-            // Dequantize inputs
-            const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
-            const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
+                // Dequantize inputs
+                const float32x4x4_t in1_f32x4x4 = vdequantize(input1_q, input1_qua_info);
+                const float32x4x4_t in2_f32x4x4 = vdequantize(input2_q, input2_qua_info);
 
-            const float32x4x4_t out_f32x4x4 =
+                const float32x4x4_t out_f32x4x4 = {
+                    vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
+                    vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
+                    vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
+                    vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
+                };
+
+                const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
+                vst1q_s16(output_ptr + x, result.val[0]);
+                vst1q_s16(output_ptr + x + 8, result.val[1]);
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
             {
-                vmulq_f32(in1_f32x4x4.val[0], in2_f32x4x4.val[0]),
-                vmulq_f32(in1_f32x4x4.val[1], in2_f32x4x4.val[1]),
-                vmulq_f32(in1_f32x4x4.val[2], in2_f32x4x4.val[2]),
-                vmulq_f32(in1_f32x4x4.val[3], in2_f32x4x4.val[3]),
-            };
+                // Dequantize inputs
+                float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
+                float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
+                float tmp_f   = tmp_in1 * tmp_in2;
 
-            const qsymm16x8x2_t result = vquantize_qsymm16(out_f32x4x4, tmp_qua_info);
-            vst1q_s16(output_ptr + x, result.val[0]);
-            vst1q_s16(output_ptr + x + 8, result.val[1]);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            // Dequantize inputs
-            float tmp_in1 = static_cast<float>(*(input1_ptr + x)) * input1_qua_info.scale;
-            float tmp_in2 = static_cast<float>(*(input2_ptr + x)) * input2_qua_info.scale;
-            float tmp_f   = tmp_in1 * tmp_in2;
-
-            // Quantize dst, lrintf() has same rounding mode as vcombine_s16
-            int32_t   tmp     = lrintf(tmp_f / tmp_qua_info.scale);
-            qsymm16_t tmp_qua = static_cast<qsymm16_t>(tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
-            *(output_ptr + x) = tmp_qua;
-        }
-    },
-    input1, input2, dst);
+                // Quantize dst, lrintf() has the same rounding mode as vcombine_s16
+                int32_t   tmp = lrintf(tmp_f / tmp_qua_info.scale);
+                qsymm16_t tmp_qua =
+                    static_cast<qsymm16_t>((tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp));
+                *(output_ptr + x) = tmp_qua;
+            }
+        },
+        input1, input2, dst);
 }
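
The QSYMM16 left-over path above dequantizes with each input's scale (symmetric quantization, so no offsets), multiplies, requantizes with the scale-adjusted output quantization, and clamps to the int16 range. A scalar sketch (hypothetical helper):

    #include <climits>
    #include <cmath>
    #include <cstdint>

    inline int16_t mul_qsymm16_scalar(int16_t a, float scale_a, int16_t b, float scale_b, float tmp_scale)
    {
        const float tmp_f = (static_cast<float>(a) * scale_a) * (static_cast<float>(b) * scale_b);
        const long  tmp   = std::lrintf(tmp_f / tmp_scale);
        return static_cast<int16_t>(tmp > SHRT_MAX ? SHRT_MAX : (tmp < SHRT_MIN ? SHRT_MIN : tmp));
    }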
 
 void mul_QSYMM16_QSYMM16_S32(const ITensor *src1, const ITensor *src2, ITensor *out, const Window &window, int scale)
@@ -665,74 +677,60 @@
     const auto window_end_x   = static_cast<int>(window.x().end());
 
     execute_window_loop(
-        win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
-        // Compute window_step_x elements per iteration
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        win,
+        [&](const Coordinates &)
         {
-            const qsymm16x8x2_t input1_q =
+            const auto input1_ptr = reinterpret_cast<const qsymm16_t *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const qsymm16_t *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+            // Compute window_step_x elements per iteration
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                {
+                const qsymm16x8x2_t input1_q = {{
                     vld1q_s16(input1_ptr + x),
                     vld1q_s16(input1_ptr + x + 8),
-                }
-            };
-            const qsymm16x8x2_t input2_q =
-            {
-                {
+                }};
+                const qsymm16x8x2_t input2_q = {{
                     vld1q_s16(input2_ptr + x),
                     vld1q_s16(input2_ptr + x + 8),
-                }
-            };
+                }};
 
-            const int32x4x4_t in1_s32 =
-            {
-                {
+                const int32x4x4_t in1_s32 = {{
                     vmovl_s16(vget_low_s16(input1_q.val[0])),
                     vmovl_s16(vget_high_s16(input1_q.val[0])),
                     vmovl_s16(vget_low_s16(input1_q.val[1])),
                     vmovl_s16(vget_high_s16(input1_q.val[1])),
-                }
-            };
-            const int32x4x4_t in2_s32 =
-            {
-                {
+                }};
+                const int32x4x4_t in2_s32 = {{
                     vmovl_s16(vget_low_s16(input2_q.val[0])),
                     vmovl_s16(vget_high_s16(input2_q.val[0])),
                     vmovl_s16(vget_low_s16(input2_q.val[1])),
                     vmovl_s16(vget_high_s16(input2_q.val[1])),
-                }
-            };
+                }};
 
-            const int32x4x4_t result =
-            {
-                {
+                const int32x4x4_t result = {{
                     vmulq_s32(in1_s32.val[0], in2_s32.val[0]),
                     vmulq_s32(in1_s32.val[1], in2_s32.val[1]),
                     vmulq_s32(in1_s32.val[2], in2_s32.val[2]),
                     vmulq_s32(in1_s32.val[3], in2_s32.val[3]),
-                }
-            };
+                }};
 
-            vst1q_s32(output_ptr + x, result.val[0]);
-            vst1q_s32(output_ptr + x + 4, result.val[1]);
-            vst1q_s32(output_ptr + x + 8, result.val[2]);
-            vst1q_s32(output_ptr + x + 12, result.val[3]);
-        }
+                vst1q_s32(output_ptr + x, result.val[0]);
+                vst1q_s32(output_ptr + x + 4, result.val[1]);
+                vst1q_s32(output_ptr + x + 8, result.val[2]);
+                vst1q_s32(output_ptr + x + 12, result.val[3]);
+            }
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            int32_t tmp       = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
-            *(output_ptr + x) = tmp;
-        }
-    },
-    input1, input2, dst);
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                int32_t tmp       = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+                *(output_ptr + x) = tmp;
+            }
+        },
+        input1, input2, dst);
 }
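
The S32-output variant above stays in integer arithmetic: each int16 lane is widened to int32 before the multiply, so even the worst case 32767 * 32767 cannot wrap. The same widen-then-multiply pattern in isolation (illustrative sketch, NEON only):

    #include <arm_neon.h>
    #include <cstdint>

    // Multiplies eight int16 lanes into eight int32 lanes without overflow.
    inline void widen_mul_s16x8(const int16_t *a, const int16_t *b, int32_t *out)
    {
        const int16x8_t va = vld1q_s16(a);
        const int16x8_t vb = vld1q_s16(b);
        vst1q_s32(out + 0, vmulq_s32(vmovl_s16(vget_low_s16(va)), vmovl_s16(vget_low_s16(vb))));
        vst1q_s32(out + 4, vmulq_s32(vmovl_s16(vget_high_s16(va)), vmovl_s16(vget_high_s16(vb))));
    }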
 
 template <bool is_scale255, bool is_sat>
@@ -757,79 +755,80 @@
     const auto window_end_x   = static_cast<int>(window.x().end());
 
     execute_window_loop(
-        win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
-        // Compute window_step_x elements per iteration
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        win,
+        [&](const Coordinates &)
         {
-            const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
-            const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);
+            const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
 
-            uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
-            const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
-            uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
-            const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));
-
-            tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
-            tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);
-
-            if(is_scale255)
+            // Compute window_step_x elements per iteration
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                tmp1_high = scale255_U16_U16(tmp1_high);
-                tmp1_low  = scale255_U16_U16(tmp1_low);
-            }
-            else
-            {
-                const int16x8_t vn = vdupq_n_s16(-n);
+                const uint8x16_t ta1 = wrapper::vloadq(input1_ptr + x);
+                const uint8x16_t ta2 = wrapper::vloadq(input2_ptr + x);
 
-                if(is_sat)
+                uint16x8_t       tmp1_high = vmovl_u8(vget_high_u8(ta1));
+                const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2));
+                uint16x8_t       tmp1_low  = vmovl_u8(vget_low_u8(ta1));
+                const uint16x8_t tmp2_low  = vmovl_u8(vget_low_u8(ta2));
+
+                tmp1_high = vmulq_u16(tmp1_high, tmp2_high);
+                tmp1_low  = vmulq_u16(tmp1_low, tmp2_low);
+
+                if (is_scale255)
                 {
-                    tmp1_high = vqshlq_u16(tmp1_high, vn);
-                    tmp1_low  = vqshlq_u16(tmp1_low, vn);
+                    tmp1_high = scale255_U16_U16(tmp1_high);
+                    tmp1_low  = scale255_U16_U16(tmp1_low);
                 }
                 else
                 {
-                    tmp1_high = vshlq_u16(tmp1_high, vn);
-                    tmp1_low  = vshlq_u16(tmp1_low, vn);
+                    const int16x8_t vn = vdupq_n_s16(-n);
+
+                    if (is_sat)
+                    {
+                        tmp1_high = vqshlq_u16(tmp1_high, vn);
+                        tmp1_low  = vqshlq_u16(tmp1_low, vn);
+                    }
+                    else
+                    {
+                        tmp1_high = vshlq_u16(tmp1_high, vn);
+                        tmp1_low  = vshlq_u16(tmp1_low, vn);
+                    }
+                }
+                if (is_sat)
+                {
+                    vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
+                }
+                else
+                {
+                    vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
                 }
             }
-            if(is_sat)
-            {
-                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high)));
-            }
-            else
-            {
-                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high)));
-            }
-        }
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                uint16_t tmp = static_cast<uint16_t>(*(input1_ptr + x)) * static_cast<uint16_t>(*(input2_ptr + x));
 
-            if(is_scale255)
-            {
-                float tmp_f = static_cast<float>(tmp) * scale255_constant;
-                tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
+                if (is_scale255)
+                {
+                    float tmp_f = static_cast<float>(tmp) * scale255_constant;
+                    tmp         = static_cast<uint16_t>(tmp_f + 0.5f);
+                }
+                else
+                {
+                    tmp >>= n;
+                }
+                if (is_sat && tmp > 255)
+                {
+                    tmp = 255;
+                }
+                *(output_ptr + x) = static_cast<uint8_t>(tmp);
             }
-            else
-            {
-                tmp >>= n;
-            }
-            if(is_sat && tmp > 255)
-            {
-                tmp = 255;
-            }
-            *(output_ptr + x) = static_cast<uint8_t>(tmp);
-        }
-    },
-    input1, input2, dst);
+        },
+        input1, input2, dst);
 }
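
The U8 left-over path above mirrors the vector code: widen to 16 bit, multiply, then either scale by 1/255 with round-to-nearest-up or shift right by n, saturating at 255 when requested. As a scalar sketch (hypothetical helper):

    #include <cstdint>

    inline uint8_t mul_u8_scalar(uint8_t a, uint8_t b, bool scale255, int n, bool sat)
    {
        uint16_t tmp = static_cast<uint16_t>(a) * static_cast<uint16_t>(b);
        if (scale255)
        {
            // +0.5f makes the 1/255 scaling round to nearest (up), as above.
            tmp = static_cast<uint16_t>(static_cast<float>(tmp) * (1.f / 255.f) + 0.5f);
        }
        else
        {
            tmp >>= n;
        }
        if (sat && tmp > 255)
        {
            tmp = 255;
        }
        return static_cast<uint8_t>(tmp);
    }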
 
 template <bool is_scale255, bool is_sat>
@@ -843,7 +842,7 @@
     tmp1_high = vmulq_s32(tmp1_high, tmp2_high);
     tmp1_low  = vmulq_s32(tmp1_low, tmp2_low);
 
-    if(is_scale255)
+    if (is_scale255)
     {
         tmp1_high = scale255_S32_S32(tmp1_high);
         tmp1_low  = scale255_S32_S32(tmp1_low);
@@ -863,7 +862,7 @@
         const int32x4_t  sign_low_s   = vreinterpretq_s32_u32(sign_low);
         const int32x4_t  convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s);
         const int32x4_t  convert_low  = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s);
-        if(is_sat)
+        if (is_sat)
         {
             tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn);
             tmp1_low  = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn);
@@ -875,7 +874,7 @@
         }
     }
 
-    if(is_sat)
+    if (is_sat)
     {
         return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high));
     }
@@ -888,15 +887,10 @@
 template <bool is_scale255, bool is_sat>
 inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &src1, const int16x8x2_t &src2, int n)
 {
-    const int16x8x2_t result =
-    {
-        {
-            // First 8 elements
-            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
-            // Second 8 elements
-            mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)
-        }
-    };
+    const int16x8x2_t result = {{// First 8 elements
+                                 mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[0], src2.val[0], n),
+                                 // Second 8 elements
+                                 mul_S16_S16_S16_n_loop<is_scale255, is_sat>(src1.val[1], src2.val[1], n)}};
 
     return result;
 }
@@ -923,67 +917,62 @@
     const auto window_end_x   = static_cast<int>(window.x().end());
 
     execute_window_loop(
-        win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
-        // Compute window_step_x elements per iteration
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        win,
+        [&](const Coordinates &)
         {
-            const int16x8x2_t ta1 =
+            const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
+
+            // Compute window_step_x elements per iteration
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                {
-                    vld1q_s16(input1_ptr + x),
-                    vld1q_s16(input1_ptr + x + 8),
-                }
-            };
-            const int16x8x2_t ta2 =
-            {
-                {
-                    vld1q_s16(input2_ptr + x),
-                    vld1q_s16(input2_ptr + x + 8),
-                }
-            };
-            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+                const int16x8x2_t ta1    = {{
+                       vld1q_s16(input1_ptr + x),
+                       vld1q_s16(input1_ptr + x + 8),
+                }};
+                const int16x8x2_t ta2    = {{
+                       vld1q_s16(input2_ptr + x),
+                       vld1q_s16(input2_ptr + x + 8),
+                }};
+                const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
 
-            vst1q_s16(output_ptr + x, result.val[0]);
-            vst1q_s16(output_ptr + x + 8, result.val[1]);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
-
-            if(is_scale255)
-            {
-                float tmp_f = static_cast<float>(tmp) * scale255_constant;
-
-                tmp = static_cast<int32_t>(tmp_f + 0.5f);
+                vst1q_s16(output_ptr + x, result.val[0]);
+                vst1q_s16(output_ptr + x + 8, result.val[1]);
             }
-            else
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
             {
-                if(tmp >= 0)
+                int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+
+                if (is_scale255)
                 {
-                    tmp >>= n;
+                    float tmp_f = static_cast<float>(tmp) * scale255_constant;
+
+                    tmp = static_cast<int32_t>(tmp_f + 0.5f);
                 }
                 else
                 {
-                    uint32_t mask = (1u << n) - 1;
-                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
+                    if (tmp >= 0)
+                    {
+                        tmp >>= n;
+                    }
+                    else
+                    {
+                        uint32_t mask = (1u << n) - 1;
+                        tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
+                    }
                 }
+                if (is_sat)
+                {
+                    tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
+                }
+                *(output_ptr + x) = static_cast<int16_t>(tmp);
             }
-            if(is_sat)
-            {
-                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
-            }
-            *(output_ptr + x) = static_cast<int16_t>(tmp);
-        }
-    },
-    input1, input2, dst);
+        },
+        input1, input2, dst);
 }
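
The signed left-over path above needs the mask correction because a plain arithmetic shift rounds toward negative infinity; adding (2^n - 1) to negative products makes the shift round toward zero, matching the non-negative branch. A small sketch:

    #include <cstdint>

    // Divides tmp by 2^n, rounding toward zero as the kernel does.
    inline int32_t shift_round_to_zero(int32_t tmp, int n)
    {
        if (tmp >= 0)
        {
            return tmp >> n;
        }
        const uint32_t mask = (1u << n) - 1;
        return (tmp + static_cast<int32_t>(mask)) >> n;
    }
    // e.g. shift_round_to_zero(-7, 1) == -3, whereas (-7 >> 1) == -4.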
 
 template <bool is_sat>
@@ -1012,7 +1001,7 @@
     const uint64x2_t sign_2    = vshrq_n_u64(tmp_2_u, 63);
     const int64x2_t  sign_2_s  = vreinterpretq_s64_u64(sign_2);
     const int64x2_t  convert_2 = vsubq_s64(vshlq_s64(sign_2_s, vnl), sign_2_s);
-    if(is_sat)
+    if (is_sat)
     {
         tmp_1 = vqshlq_s64(vaddq_s64(tmp_1, convert_1), vn);
         tmp_2 = vqshlq_s64(vaddq_s64(tmp_2, convert_2), vn);
@@ -1029,15 +1018,10 @@
 template <bool is_sat>
 inline int32x4x2_t mul_S32_S32_S32_n_k(const int32x4x2_t &src1, const int32x4x2_t &src2, int n)
 {
-    const int32x4x2_t result =
-    {
-        {
-            // First 4 elements
-            mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
-            // Second 4 elements
-            mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)
-        }
-    };
+    const int32x4x2_t result = {{// First 4 elements
+                                 mul_S32_S32_S32_n_loop<is_sat>(src1.val[0], src2.val[0], n),
+                                 // Second 4 elements
+                                 mul_S32_S32_S32_n_loop<is_sat>(src1.val[1], src2.val[1], n)}};
 
     return result;
 }
@@ -1058,7 +1042,7 @@
     const auto window_end_x          = static_cast<int>(window.x().end());
     const bool is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -1074,60 +1058,56 @@
         Iterator dst(out, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<int32_t *>(dst.ptr());
-
-            const int32_t broadcast_value     = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
-            const auto    broadcast_value_vec = vdupq_n_s32(broadcast_value);
-
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const int32x4x2_t broadcast_v =
+                const auto non_broadcast_input_ptr = reinterpret_cast<const int32_t *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<int32_t *>(dst.ptr());
+
+                const int32_t broadcast_value     = *reinterpret_cast<const int32_t *>(broadcast_input.ptr());
+                const auto    broadcast_value_vec = vdupq_n_s32(broadcast_value);
+
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
-                        broadcast_value_vec,
-                        broadcast_value_vec,
-                    }
-                };
-                const int32x4x2_t non_broadcast_v =
-                {
-                    {
+                    const int32x4x2_t broadcast_v     = {{
+                            broadcast_value_vec,
+                            broadcast_value_vec,
+                    }};
+                    const int32x4x2_t non_broadcast_v = {{
                         vld1q_s32(non_broadcast_input_ptr + x),
                         vld1q_s32(non_broadcast_input_ptr + x + 4),
+                    }};
+                    const int32x4x2_t result          = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
+
+                    vst1q_s32(output_ptr + x, result.val[0]);
+                    vst1q_s32(output_ptr + x + 4, result.val[1]);
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    int64_t tmp =
+                        static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
+
+                    if (tmp >= 0)
+                    {
+                        tmp >>= n;
                     }
-                };
-                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(broadcast_v, non_broadcast_v, n);
-
-                vst1q_s32(output_ptr + x, result.val[0]);
-                vst1q_s32(output_ptr + x + 4, result.val[1]);
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                int64_t tmp = static_cast<int64_t>(broadcast_value) * static_cast<int64_t>(*(non_broadcast_input_ptr + x));
-
-                if(tmp >= 0)
-                {
-                    tmp >>= n;
+                    else
+                    {
+                        uint64_t mask = ((uint64_t)1u << n) - 1;
+                        tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
+                    }
+                    if (is_sat)
+                    {
+                        tmp = utility::clamp<int64_t, int32_t>(tmp);
+                    }
+                    *(output_ptr + x) = static_cast<int32_t>(tmp);
                 }
-                else
-                {
-                    uint64_t mask = ((uint64_t)1u << n) - 1;
-                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
-                }
-                if(is_sat)
-                {
-                    tmp = utility::clamp<int64_t, int32_t>(tmp);
-                }
-                *(output_ptr + x) = static_cast<int32_t>(tmp);
-            }
-        },
-        broadcast_input, non_broadcast_input, dst);
+            },
+            broadcast_input, non_broadcast_input, dst);
     }
     else
     {
@@ -1140,58 +1120,53 @@
         Iterator dst(out, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const int32x4x2_t ta1 =
+                const auto input1_ptr = reinterpret_cast<const int32_t *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const int32_t *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
+                    const int32x4x2_t ta1    = {{
+                           vld1q_s32(input1_ptr + x),
+                           vld1q_s32(input1_ptr + x + 4),
+                    }};
+                    const int32x4x2_t ta2    = {{
+                           vld1q_s32(input2_ptr + x),
+                           vld1q_s32(input2_ptr + x + 4),
+                    }};
+                    const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
+
+                    vst1q_s32(output_ptr + x, result.val[0]);
+                    vst1q_s32(output_ptr + x + 4, result.val[1]);
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
+
+                    if (tmp >= 0)
                     {
-                        vld1q_s32(input1_ptr + x),
-                        vld1q_s32(input1_ptr + x + 4),
+                        tmp >>= n;
                     }
-                };
-                const int32x4x2_t ta2 =
-                {
+                    else
                     {
-                        vld1q_s32(input2_ptr + x),
-                        vld1q_s32(input2_ptr + x + 4),
+                        uint64_t mask = ((uint64_t)1u << n) - 1;
+                        tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
                     }
-                };
-                const int32x4x2_t result = mul_S32_S32_S32_n_k<is_sat>(ta1, ta2, n);
-
-                vst1q_s32(output_ptr + x, result.val[0]);
-                vst1q_s32(output_ptr + x + 4, result.val[1]);
-            }
-
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                int64_t tmp = static_cast<int64_t>(*(input1_ptr + x)) * static_cast<int64_t>(*(input2_ptr + x));
-
-                if(tmp >= 0)
-                {
-                    tmp >>= n;
+                    if (is_sat)
+                    {
+                        tmp = utility::clamp<int64_t, int32_t>(tmp);
+                    }
+                    *(output_ptr + x) = static_cast<int32_t>(tmp);
                 }
-                else
-                {
-                    uint64_t mask = ((uint64_t)1u << n) - 1;
-                    tmp           = (tmp + static_cast<int64_t>(mask)) >> n;
-                }
-                if(is_sat)
-                {
-                    tmp = utility::clamp<int64_t, int32_t>(tmp);
-                }
-                *(output_ptr + x) = static_cast<int32_t>(tmp);
-            }
-        },
-        input1, input2, dst);
+            },
+            input1, input2, dst);
     }
 }
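
Note: in the broadcast branch above, the input whose x-extent is 1 gets a
window step of 0, so the same scalar is re-read per row and splatted with
vdupq_n_s32 before the vector loop. A simplified sketch of that pattern
(plain 32-bit products, without the fixed-point shift the kernel applies;
assumes AArch64 NEON):

    #include <arm_neon.h>
    #include <cstdint>

    // One operand is a single value broadcast across x; the other is a row.
    void mul_broadcast_s32(const int32_t *in, int32_t scalar, int32_t *out, int len)
    {
        const int32x4_t bv = vdupq_n_s32(scalar); // splat once per row
        int x = 0;
        for (; x <= len - 4; x += 4) // vector body, 4 elements per step
        {
            vst1q_s32(out + x, vmulq_s32(vld1q_s32(in + x), bv));
        }
        for (; x < len; ++x) // scalar tail for the leftover elements
        {
            out[x] = in[x] * scalar;
        }
    }
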
 
@@ -1212,7 +1187,7 @@
 
     using ExactTagType = typename wrapper::traits::neon_vector<float, window_step_x>::tag_type;
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -1228,32 +1203,33 @@
         Iterator dst(out, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<float *>(dst.ptr());
-
-            const float broadcast_value     = *reinterpret_cast<const float *>(broadcast_input.ptr());
-            const auto  broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-            const auto  scale_vec           = wrapper::vdup_n(scale, ExactTagType{});
-
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
-                auto       res             = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
-                wrapper::vstore(output_ptr + x, res);
-            }
+                const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<float *>(dst.ptr());
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
-                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
-            }
-        },
-        broadcast_input, non_broadcast_input, dst);
+                const float broadcast_value     = *reinterpret_cast<const float *>(broadcast_input.ptr());
+                const auto  broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+                const auto  scale_vec           = wrapper::vdup_n(scale, ExactTagType{});
+
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+                    auto       res = wrapper::vmul(wrapper::vmul(broadcast_value_vec, non_broadcast_v), scale_vec);
+                    wrapper::vstore(output_ptr + x, res);
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+                    *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
+                }
+            },
+            broadcast_input, non_broadcast_input, dst);
     }
     else
     {
@@ -1266,32 +1242,33 @@
         Iterator dst(out, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const auto ta1       = wrapper::vloadq(input1_ptr + x);
-                const auto ta2       = wrapper::vloadq(input2_ptr + x);
-                const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
-                const auto res       = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
-                wrapper::vstore(output_ptr + x, res);
-            }
+                const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto ta1    = *(input1_ptr + x);
-                const auto ta2    = *(input2_ptr + x);
-                *(output_ptr + x) = ta1 * ta2 * scale;
-            }
-        },
-        input1, input2, dst);
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const auto ta1       = wrapper::vloadq(input1_ptr + x);
+                    const auto ta2       = wrapper::vloadq(input2_ptr + x);
+                    const auto scale_vec = wrapper::vdup_n(scale, ExactTagType{});
+                    const auto res       = wrapper::vmul(wrapper::vmul(ta1, ta2), scale_vec);
+                    wrapper::vstore(output_ptr + x, res);
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const auto ta1    = *(input1_ptr + x);
+                    const auto ta2    = *(input2_ptr + x);
+                    *(output_ptr + x) = ta1 * ta2 * scale;
+                }
+            },
+            input1, input2, dst);
     }
 }
 
@@ -1312,7 +1289,7 @@
 
     using ExactTagType = typename wrapper::traits::neon_vector<float, 2>::tag_type;
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -1328,48 +1305,49 @@
         Iterator dst(out, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<float *>(dst.ptr());
-
-            const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
-
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const auto  a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
-                float32x4_t b = vdupq_n_f32(broadcast_value);
+                const auto non_broadcast_input_ptr = reinterpret_cast<const float *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<float *>(dst.ptr());
 
-                const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
-                const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
-                const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
-                const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
-                const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
+                const float broadcast_value = *reinterpret_cast<const float *>(broadcast_input.ptr());
 
-                const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
-                const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const auto  a = wrapper::vloadq(non_broadcast_input_ptr + 2 * x);
+                    float32x4_t b = vdupq_n_f32(broadcast_value);
 
-                float32x4_t res = wrapper::vmul(tmp0, b);
-                b               = wrapper::vmul(b, mask);
+                    const float32x4_t mask  = {-1.0f, 1.0f, -1.0f, 1.0f};
+                    const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
+                    const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
+                    const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
+                    const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
 
-                res = wrapper::vmla(res, tmp1, b);
-                wrapper::vstore(output_ptr + 2 * x, res);
-            }
+                    const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
+                    const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
-                const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
-                auto       res1                 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
-                auto       res2                 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
-                *(output_ptr + 2 * x)           = res1;
-                *(output_ptr + 2 * x + 1)       = res2;
-            }
-        },
-        broadcast_input, non_broadcast_input, dst);
+                    float32x4_t res = wrapper::vmul(tmp0, b);
+                    b               = wrapper::vmul(b, mask);
+
+                    res = wrapper::vmla(res, tmp1, b);
+                    wrapper::vstore(output_ptr + 2 * x, res);
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const auto non_broadcast_value0 = *(non_broadcast_input_ptr + 2 * x);
+                    const auto non_broadcast_value1 = *(non_broadcast_input_ptr + 2 * x + 1);
+                    auto       res1                 = broadcast_value * (non_broadcast_value0 - non_broadcast_value1);
+                    auto       res2                 = broadcast_value * (non_broadcast_value1 + non_broadcast_value0);
+                    *(output_ptr + 2 * x)           = res1;
+                    *(output_ptr + 2 * x + 1)       = res2;
+                }
+            },
+            broadcast_input, non_broadcast_input, dst);
     }
     else
     {
@@ -1382,51 +1360,52 @@
         Iterator dst(out, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
-
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
-                float32x4_t       b = wrapper::vloadq(input2_ptr + 2 * x);
+                const auto input1_ptr = reinterpret_cast<const float *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const float *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<float *>(dst.ptr());
 
-                const float32x4_t mask  = { -1.0f, 1.0f, -1.0f, 1.0f };
-                const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
-                const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
-                const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
-                const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const float32x4_t a = wrapper::vloadq(input1_ptr + 2 * x);
+                    float32x4_t       b = wrapper::vloadq(input2_ptr + 2 * x);
 
-                const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
-                const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
+                    const float32x4_t mask  = {-1.0f, 1.0f, -1.0f, 1.0f};
+                    const float32x2_t tmp00 = wrapper::vdup_n(wrapper::vgetlane(a, 0), ExactTagType{});
+                    const float32x2_t tmp01 = wrapper::vdup_n(wrapper::vgetlane(a, 1), ExactTagType{});
+                    const float32x2_t tmp10 = wrapper::vdup_n(wrapper::vgetlane(a, 2), ExactTagType{});
+                    const float32x2_t tmp11 = wrapper::vdup_n(wrapper::vgetlane(a, 3), ExactTagType{});
 
-                float32x4_t res = wrapper::vmul(tmp0, b);
+                    const float32x4_t tmp0 = wrapper::vcombine(tmp00, tmp10);
+                    const float32x4_t tmp1 = wrapper::vcombine(tmp01, tmp11);
 
-                b = wrapper::vrev64(b);
-                b = wrapper::vmul(b, mask);
+                    float32x4_t res = wrapper::vmul(tmp0, b);
 
-                res = wrapper::vmla(res, tmp1, b);
-                wrapper::vstore(output_ptr + 2 * x, res);
-            }
+                    b = wrapper::vrev64(b);
+                    b = wrapper::vmul(b, mask);
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto a0             = *(input1_ptr + 2 * x);
-                const auto a1             = *(input1_ptr + 2 * x + 1);
-                const auto b0             = *(input2_ptr + 2 * x);
-                const auto b1             = *(input2_ptr + 2 * x + 1);
-                auto       res1           = a0 * b0 - a1 * b1;
-                auto       res2           = a0 * b1 + a1 * b0;
-                *(output_ptr + 2 * x)     = res1;
-                *(output_ptr + 2 * x + 1) = res2;
-            }
-        },
-        input1, input2, dst);
+                    res = wrapper::vmla(res, tmp1, b);
+                    wrapper::vstore(output_ptr + 2 * x, res);
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const auto a0             = *(input1_ptr + 2 * x);
+                    const auto a1             = *(input1_ptr + 2 * x + 1);
+                    const auto b0             = *(input2_ptr + 2 * x);
+                    const auto b1             = *(input2_ptr + 2 * x + 1);
+                    auto       res1           = a0 * b0 - a1 * b1;
+                    auto       res2           = a0 * b1 + a1 * b0;
+                    *(output_ptr + 2 * x)     = res1;
+                    *(output_ptr + 2 * x + 1) = res2;
+                }
+            },
+            input1, input2, dst);
     }
 }
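
Note: this kernel treats each consecutive pair of floats as one complex
number; the vrev64 plus {-1, 1, -1, 1} mask sequence computes a complex
product per pair, and the scalar tail spells out the identity directly.
A standalone check of that identity:

    #include <cassert>
    #include <complex>

    int main()
    {
        // (a0 + a1*i) * (b0 + b1*i) = (a0*b0 - a1*b1) + (a0*b1 + a1*b0)*i
        const float a0 = 1.f, a1 = 2.f, b0 = 3.f, b1 = 4.f;
        const float res1 = a0 * b0 - a1 * b1; // real part: -5
        const float res2 = a0 * b1 + a1 * b0; // imaginary part: 10
        const auto  ref  = std::complex<float>(a0, a1) * std::complex<float>(b0, b1);
        assert(res1 == ref.real() && res2 == ref.imag());
        return 0;
    }
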
 
@@ -1444,7 +1423,7 @@
     const auto    window_start_x        = static_cast<int>(window.x().start());
     const auto    window_end_x          = static_cast<int>(window.x().end());
     const bool    is_broadcast_across_x = src1->info()->tensor_shape().x() != src2->info()->tensor_shape().x();
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -1457,48 +1436,40 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator dst(out, win);
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto          non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
-            const auto          output_ptr              = reinterpret_cast<float16_t *>(dst.ptr());
-            const auto          broadcast_value         = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
-            const float16x8x2_t broadcast_value_vec     =
+            win,
+            [&](const Coordinates &)
             {
-                {
+                const auto non_broadcast_input_ptr = reinterpret_cast<const float16_t *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<float16_t *>(dst.ptr());
+                const auto broadcast_value         = *reinterpret_cast<const float16_t *>(broadcast_input.ptr());
+                const float16x8x2_t broadcast_value_vec = {{
                     vdupq_n_f16(broadcast_value),
                     vdupq_n_f16(broadcast_value),
-                }
-            };
-            const auto scale_vec = vdupq_n_f16(scale);
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const float16x8x2_t non_broadcast_v =
+                }};
+                const auto          scale_vec           = vdupq_n_f16(scale);
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
+                    const float16x8x2_t non_broadcast_v = {{
                         vld1q_f16(non_broadcast_input_ptr + x),
                         vld1q_f16(non_broadcast_input_ptr + x + 8),
-                    }
-                };
-                const float16x8x2_t result =
+                    }};
+                    const float16x8x2_t result          = {{
+                                 vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
+                                 vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
+                    }};
+                    vst1q_f16(output_ptr + x, result.val[0]);
+                    vst1q_f16(output_ptr + x + 8, result.val[1]);
+                }
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
                 {
-                    {
-                        vmulq_f16(vmulq_f16(broadcast_value_vec.val[0], non_broadcast_v.val[0]), scale_vec),
-                        vmulq_f16(vmulq_f16(broadcast_value_vec.val[1], non_broadcast_v.val[1]), scale_vec),
-                    }
-                };
-                vst1q_f16(output_ptr + x, result.val[0]);
-                vst1q_f16(output_ptr + x + 8, result.val[1]);
-            }
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
-                *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
-            }
-        },
-        broadcast_input, non_broadcast_input, dst);
+                    const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+                    *(output_ptr + x)          = broadcast_value * non_broadcast_v * scale;
+                }
+            },
+            broadcast_input, non_broadcast_input, dst);
     }
     else
     {
@@ -1508,49 +1479,41 @@
         Iterator input2(src2, input2_win);
         Iterator dst(out, win);
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-            // Compute window_step_x elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const float16x8x2_t ta1 =
+                const auto input1_ptr = reinterpret_cast<const float16_t *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const float16_t *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+                // Compute window_step_x elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
-                        vld1q_f16(input1_ptr + x),
-                        vld1q_f16(input1_ptr + x + 8),
-                    }
-                };
-                const float16x8x2_t ta2 =
+                    const float16x8x2_t ta1       = {{
+                              vld1q_f16(input1_ptr + x),
+                              vld1q_f16(input1_ptr + x + 8),
+                    }};
+                    const float16x8x2_t ta2       = {{
+                              vld1q_f16(input2_ptr + x),
+                              vld1q_f16(input2_ptr + x + 8),
+                    }};
+                    const float16x8_t   scale_vec = vdupq_n_f16(scale);
+                    const float16x8x2_t result    = {{
+                           vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
+                           vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
+                    }};
+                    vst1q_f16(output_ptr + x, result.val[0]);
+                    vst1q_f16(output_ptr + x + 8, result.val[1]);
+                }
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
                 {
-                    {
-                        vld1q_f16(input2_ptr + x),
-                        vld1q_f16(input2_ptr + x + 8),
-                    }
-                };
-                const float16x8_t   scale_vec = vdupq_n_f16(scale);
-                const float16x8x2_t result    =
-                {
-                    {
-                        vmulq_f16(vmulq_f16(ta1.val[0], ta2.val[0]), scale_vec),
-                        vmulq_f16(vmulq_f16(ta1.val[1], ta2.val[1]), scale_vec),
-                    }
-                };
-                vst1q_f16(output_ptr + x, result.val[0]);
-                vst1q_f16(output_ptr + x + 8, result.val[1]);
-            }
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto ta1    = *(input1_ptr + x);
-                const auto ta2    = *(input2_ptr + x);
-                *(output_ptr + x) = ta1 * ta2 * scale;
-            }
-        },
-        input1, input2, dst);
+                    const auto ta1    = *(input1_ptr + x);
+                    const auto ta2    = *(input2_ptr + x);
+                    *(output_ptr + x) = ta1 * ta2 * scale;
+                }
+            },
+            input1, input2, dst);
     }
 }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
@@ -1577,81 +1540,82 @@
     const auto window_end_x   = static_cast<int>(window.x().end());
 
     execute_window_loop(
-        win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
-        // Compute window_step_x elements per iteration
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        win,
+        [&](const Coordinates &)
         {
-            const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
-            const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
+            const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
 
-            uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
-            uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
-            tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
-            tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
-
-            if(is_scale255)
+            // Compute window_step_x elements per iteration
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                tmp_low  = scale255_U16_U16(tmp_low);
-                tmp_high = scale255_U16_U16(tmp_high);
-            }
-            else
-            {
-                const int16x8_t vn = vdupq_n_s16(-n);
+                const uint8x16_t bv = wrapper::vloadq(input2_ptr + x);
+                const uint8x16_t av = wrapper::vloadq(input1_ptr + x);
 
-                if(is_sat)
+                uint16x8_t tmp_low  = vmovl_u8(vget_low_u8(av));
+                uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av));
+                tmp_low             = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv)));
+                tmp_high            = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv)));
+
+                if (is_scale255)
                 {
-                    tmp_low  = vqshlq_u16(tmp_low, vn);
-                    tmp_high = vqshlq_u16(tmp_high, vn);
+                    tmp_low  = scale255_U16_U16(tmp_low);
+                    tmp_high = scale255_U16_U16(tmp_high);
                 }
                 else
                 {
-                    tmp_low  = vshlq_u16(tmp_low, vn);
-                    tmp_high = vshlq_u16(tmp_high, vn);
+                    const int16x8_t vn = vdupq_n_s16(-n);
+
+                    if (is_sat)
+                    {
+                        tmp_low  = vqshlq_u16(tmp_low, vn);
+                        tmp_high = vqshlq_u16(tmp_high, vn);
+                    }
+                    else
+                    {
+                        tmp_low  = vshlq_u16(tmp_low, vn);
+                        tmp_high = vshlq_u16(tmp_high, vn);
+                    }
                 }
+
+                if (is_sat)
+                {
+                    static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
+
+                    tmp_low  = vminq_u16(tmp_low, max);
+                    tmp_high = vminq_u16(tmp_high, max);
+                }
+
+                vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
+                vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
             }
 
-            if(is_sat)
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
             {
-                static const uint16x8_t max = vdupq_n_u16(SHRT_MAX);
+                int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
 
-                tmp_low  = vminq_u16(tmp_low, max);
-                tmp_high = vminq_u16(tmp_high, max);
+                if (is_scale255)
+                {
+                    float tmp_f = static_cast<float>(tmp) * scale255_constant;
+                    tmp         = static_cast<int32_t>(tmp_f + 0.5f);
+                }
+                else
+                {
+                    tmp >>= n;
+                }
+
+                if (is_sat)
+                {
+                    tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
+                }
+
+                *(output_ptr + x) = static_cast<int16_t>(tmp);
             }
-
-            vst1q_s16(output_ptr + x, vreinterpretq_s16_u16(tmp_low));
-            vst1q_s16(output_ptr + x + 8, vreinterpretq_s16_u16(tmp_high));
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
-
-            if(is_scale255)
-            {
-                float tmp_f = static_cast<float>(tmp) * scale255_constant;
-                tmp         = static_cast<int32_t>(tmp_f + 0.5f);
-            }
-            else
-            {
-                tmp >>= n;
-            }
-
-            if(is_sat)
-            {
-                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : tmp;
-            }
-
-            *(output_ptr + x) = static_cast<int16_t>(tmp);
-        }
-    },
-    input1, input2, dst);
+        },
+        input1, input2, dst);
 }
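
Note: in the scale == 1/255 fast path above, the U8 x U8 product is mapped
back down by a round-half-up division by 255 (assuming scale255_constant
is 1.f / 255.f, as its name suggests). A scalar model of that rounding:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        const float   scale255_constant = 1.f / 255.f; // assumed value
        const int32_t tmp   = 200 * 150;               // widened U8 x U8 product
        const float   tmp_f = static_cast<float>(tmp) * scale255_constant;
        const int32_t res   = static_cast<int32_t>(tmp_f + 0.5f); // round half up
        assert(res == 118 && res == (tmp + 127) / 255); // matches integer rounding
        return 0;
    }
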
 
 template <bool is_scale255, bool is_sat>
@@ -1676,75 +1640,65 @@
     const auto window_end_x   = static_cast<int>(window.x().end());
 
     execute_window_loop(
-        win, [&](const Coordinates &)
-    {
-        const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
-        const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
-
-        // Compute window_step_x elements per iteration
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        win,
+        [&](const Coordinates &)
         {
-            const int16x8x2_t ta1 =
+            const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+            const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+            const auto output_ptr = reinterpret_cast<int16_t *>(dst.ptr());
+
+            // Compute window_step_x elements per iteration
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                {
-                    vld1q_s16(input1_ptr + x),
-                    vld1q_s16(input1_ptr + x + 8),
-                }
-            };
-            const uint8x8x2_t ta2u =
-            {
-                {
+                const int16x8x2_t ta1  = {{
+                     vld1q_s16(input1_ptr + x),
+                     vld1q_s16(input1_ptr + x + 8),
+                }};
+                const uint8x8x2_t ta2u = {{
                     vld1_u8(input2_ptr + x),
                     vld1_u8(input2_ptr + x + 8),
-                }
-            };
-            const int16x8x2_t ta2 =
-            {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])),
-                    vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))
-                }
-            };
+                }};
+                const int16x8x2_t ta2  = {
+                     {vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1]))}};
 
-            const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
+                const int16x8x2_t result = mul_S16_S16_S16_n_k<is_scale255, is_sat>(ta1, ta2, n);
 
-            vst1q_s16(output_ptr + x, result.val[0]);
-            vst1q_s16(output_ptr + x + 8, result.val[1]);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
-
-            if(is_scale255)
-            {
-                float tmp_f = static_cast<float>(tmp) * scale255_constant;
-
-                tmp = static_cast<int32_t>(tmp_f + 0.5f);
+                vst1q_s16(output_ptr + x, result.val[0]);
+                vst1q_s16(output_ptr + x + 8, result.val[1]);
             }
-            else
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
             {
-                if(tmp >= 0)
+                int32_t tmp = static_cast<int32_t>(*(input1_ptr + x)) * static_cast<int32_t>(*(input2_ptr + x));
+
+                if (is_scale255)
                 {
-                    tmp >>= n;
+                    float tmp_f = static_cast<float>(tmp) * scale255_constant;
+
+                    tmp = static_cast<int32_t>(tmp_f + 0.5f);
                 }
                 else
                 {
-                    uint32_t mask = (1u << n) - 1;
-                    tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
+                    if (tmp >= 0)
+                    {
+                        tmp >>= n;
+                    }
+                    else
+                    {
+                        uint32_t mask = (1u << n) - 1;
+                        tmp           = (tmp + static_cast<int32_t>(mask)) >> n;
+                    }
                 }
+                if (is_sat)
+                {
+                    tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
+                }
+                *(output_ptr + x) = static_cast<int16_t>(tmp);
             }
-            if(is_sat)
-            {
-                tmp = (tmp > SHRT_MAX) ? SHRT_MAX : ((tmp < SHRT_MIN) ? SHRT_MIN : tmp);
-            }
-            *(output_ptr + x) = static_cast<int16_t>(tmp);
-        }
-    },
-    input1, input2, dst);
+        },
+        input1, input2, dst);
 }
 
 template <bool is_scale255, bool is_sat>
@@ -1755,7 +1709,12 @@
 }
 } // namespace
 
-void CpuMulKernel::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy)
+void CpuMulKernel::configure(ITensorInfo   *src1,
+                             ITensorInfo   *src2,
+                             ITensorInfo   *dst,
+                             float          scale,
+                             ConvertPolicy  overflow_policy,
+                             RoundingPolicy rounding_policy)
 {
     ARM_COMPUTE_UNUSED(rounding_policy);
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
@@ -1775,7 +1734,7 @@
 
     bool is_scale_255 = false;
     // Check and validate scaling factor
-    if(std::abs(scale - scale255_constant) < 0.00001f)
+    if (std::abs(scale - scale255_constant) < 0.00001f)
     {
         is_scale_255 = true;
     }
@@ -1795,12 +1754,12 @@
     const DataType dt_output = dst->data_type();
     const bool     is_sat    = (overflow_policy == ConvertPolicy::SATURATE);
 
-    switch(dt_input1)
+    switch (dt_input1)
     {
         case DataType::QASYMM8:
-            if(dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
+            if (dt_input2 == DataType::QASYMM8 && dt_output == DataType::QASYMM8)
             {
-                if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
+                if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
                 {
                     _func_quantized = &mul_q8_neon_fixedpoint<uint8_t>;
                 }
@@ -1811,9 +1770,9 @@
             }
             break;
         case DataType::QASYMM8_SIGNED:
-            if(dt_input2 == DataType::QASYMM8_SIGNED)
+            if (dt_input2 == DataType::QASYMM8_SIGNED)
             {
-                if(mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
+                if (mul_q8_neon_fixedpoint_possible(src1, src2, dst, scale))
                 {
                     _func_quantized = &mul_q8_neon_fixedpoint<int8_t>;
                 }
@@ -1824,19 +1783,19 @@
             }
             break;
         case DataType::QSYMM16:
-            if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
+            if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::QSYMM16)
             {
                 _func_quantized = &mul_saturate_QSYMM16_QSYMM16_QSYMM16;
             }
-            else if(dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
+            else if (dt_input2 == DataType::QSYMM16 && dt_output == DataType::S32)
             {
                 _func_int = &mul_QSYMM16_QSYMM16_S32;
             }
             break;
         case DataType::S16:
-            if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+            if (DataType::U8 == dt_input2 && DataType::S16 == dt_output)
             {
-                if(is_scale_255)
+                if (is_scale_255)
                 {
                     _func_int = is_sat ? &mul_S16_U8_S16<true, true> : &mul_S16_U8_S16<true, false>;
                 }
@@ -1845,9 +1804,9 @@
                     _func_int = is_sat ? &mul_S16_U8_S16<false, true> : &mul_S16_U8_S16<false, false>;
                 }
             }
-            if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+            if (DataType::S16 == dt_input2 && DataType::S16 == dt_output)
             {
-                if(is_scale_255)
+                if (is_scale_255)
                 {
                     _func_int = is_sat ? &mul_S16_S16_S16<true, true> : &mul_S16_S16_S16<true, false>;
                 }
@@ -1858,15 +1817,15 @@
             }
             break;
         case DataType::S32:
-            if(DataType::S32 == dt_input2 && DataType::S32 == dt_output)
+            if (DataType::S32 == dt_input2 && DataType::S32 == dt_output)
             {
                 _func_int = is_sat ? &mul_S32_S32_S32<true> : &mul_S32_S32_S32<false>;
             }
             break;
         case DataType::U8:
-            if(DataType::U8 == dt_input2 && DataType::U8 == dt_output)
+            if (DataType::U8 == dt_input2 && DataType::U8 == dt_output)
             {
-                if(is_scale_255)
+                if (is_scale_255)
                 {
                     _func_int = is_sat ? &mul_U8_U8_U8<true, true> : &mul_U8_U8_U8<true, false>;
                 }
@@ -1875,9 +1834,9 @@
                     _func_int = is_sat ? &mul_U8_U8_U8<false, true> : &mul_U8_U8_U8<false, false>;
                 }
             }
-            else if(DataType::U8 == dt_input2 && DataType::S16 == dt_output)
+            else if (DataType::U8 == dt_input2 && DataType::S16 == dt_output)
             {
-                if(is_scale_255)
+                if (is_scale_255)
                 {
                     _func_int = is_sat ? &mul_U8_U8_S16<true, true> : &mul_U8_U8_S16<true, false>;
                 }
@@ -1886,9 +1845,9 @@
                     _func_int = is_sat ? &mul_U8_U8_S16<false, true> : &mul_U8_U8_S16<false, false>;
                 }
             }
-            else if(DataType::S16 == dt_input2 && DataType::S16 == dt_output)
+            else if (DataType::S16 == dt_input2 && DataType::S16 == dt_output)
             {
-                if(is_scale_255)
+                if (is_scale_255)
                 {
                     _func_int = is_sat ? &mul_U8_S16_S16<true, true> : &mul_U8_S16_S16<true, false>;
                 }
@@ -1922,20 +1881,20 @@
     ARM_COMPUTE_UNUSED(thread_count);
 
 #if defined(ENABLE_FP32_KERNELS)
-    if(this->_func_float == &mul_F32_F32_F32)
+    if (this->_func_float == &mul_F32_F32_F32)
     {
         size_t mws = ICPPKernel::default_mws;
-        if(platform.get_cpu_model() == CPUModel::N1)
+        if (platform.get_cpu_model() == CPUModel::N1)
         {
             mws = default_mws_N1_fp32_neon;
         }
-        else if(platform.get_cpu_model() == CPUModel::V1)
+        else if (platform.get_cpu_model() == CPUModel::V1)
         {
             mws = default_mws_V1_fp32_neon;
         }
         else
         {
-            if(_split_dimension == Window::DimX)
+            if (_split_dimension == Window::DimX)
             {
                 // Don't split the work load too small if the tensor has been reinterpreted as 1D.
                 // This number is loosely chosen as threading overhead in each platform varies wildly.
@@ -1945,7 +1904,7 @@
         }
 
         // tensor is 1D or was re-interpreted as 1D
-        if(this->window().shape().num_dimensions() == 1)
+        if (this->window().shape().num_dimensions() == 1)
         {
             return mws;
         }
@@ -1958,10 +1917,10 @@
             return std::max(static_cast<size_t>(1), mws);
         }
     }
-#else /* ENABLE_FP32_KERNELS */
+#else  /* ENABLE_FP32_KERNELS */
     ARM_COMPUTE_UNUSED(platform);
 #endif /* ENABLE_FP32_KERNELS */
-    if(_split_dimension == Window::DimX)
+    if (_split_dimension == Window::DimX)
     {
         // Don't split the work load too small if the tensor has been reinterpreted as 1D.
         // This number is loosely chosen as threading overhead in each platform varies wildly.
@@ -1970,8 +1929,12 @@
     return default_mws;
 }
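
Note: get_mws() above returns a minimum workload size so the scheduler
never splits a tensor into chunks too small to amortise threading
overhead. An illustrative (hypothetical) use of such a value when picking
a chunk count:

    #include <algorithm>
    #include <cstddef>

    // Cap the number of chunks by both the thread count and total / mws,
    // so every chunk keeps at least mws elements of work.
    std::size_t num_chunks(std::size_t total, std::size_t mws, std::size_t max_threads)
    {
        const std::size_t by_mws = std::max<std::size_t>(1, total / mws);
        return std::min(by_mws, max_threads);
    }
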
 
-Status CpuMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy,
-                              RoundingPolicy rounding_policy)
+Status CpuMulKernel::validate(const ITensorInfo *src1,
+                              const ITensorInfo *src2,
+                              const ITensorInfo *dst,
+                              float              scale,
+                              ConvertPolicy      overflow_policy,
+                              RoundingPolicy     rounding_policy)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy));
@@ -1989,11 +1952,11 @@
     auto src2 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
     auto dst  = tensors.get_tensor(TensorType::ACL_DST);
 
-    if(_func_quantized != nullptr)
+    if (_func_quantized != nullptr)
     {
         (*_func_quantized)(src1, src2, dst, window, _scale);
     }
-    else if(_func_int != nullptr)
+    else if (_func_int != nullptr)
     {
         (*_func_int)(src1, src2, dst, window, _scale_exponent);
     }
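
Note: configure() resolves the DataType switch once and stores a plain
function pointer; run_op() above then just calls through whichever of
_func_quantized, _func_int or _func_float was set. A minimal sketch of
that configure/run split (names and signatures are illustrative):

    #include <cstdio>

    using MulFn = void(int lhs, int rhs); // a function type, as in the kernel

    static void mul_s16(int lhs, int rhs) { std::printf("s16: %d\n", lhs * rhs); }
    static void mul_u8(int lhs, int rhs) { std::printf("u8: %d\n", lhs * rhs); }

    int main()
    {
        const bool is_s16   = true;                        // stands in for the switch
        MulFn     *func_int = is_s16 ? &mul_s16 : &mul_u8; // "configure"
        (*func_int)(6, 7);                                 // "run_op": prints s16: 42
        return 0;
    }
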
@@ -2021,10 +1984,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
     // Validate in case of configured dst
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+                                        "Wrong shape for dst");
     }
 
     return Status{};
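
Note: the "Inputs are not broadcast compatible" check above relies on the
broadcast shape collapsing to a total size of 0 when two extents cannot be
reconciled. A sketch of the usual per-dimension rule (extents match if
equal or if one of them is 1), written as a hypothetical helper:

    #include <cassert>
    #include <cstddef>

    static std::size_t broadcast_dim(std::size_t a, std::size_t b)
    {
        if (a == b || b == 1) return a;
        if (a == 1)           return b;
        return 0; // incompatible: yields an empty output shape
    }

    int main()
    {
        assert(broadcast_dim(8, 1) == 8); // scalar column against a full row
        assert(broadcast_dim(8, 8) == 8);
        assert(broadcast_dim(8, 3) == 0); // rejected by the validate step
        return 0;
    }
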
diff --git a/src/cpu/kernels/CpuMulKernel.h b/src/cpu/kernels/CpuMulKernel.h
index 9e4a371..7eaf287 100644
--- a/src/cpu/kernels/CpuMulKernel.h
+++ b/src/cpu/kernels/CpuMulKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_MUL_KERNEL_H
 
 #include "arm_compute/core/Rounding.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -68,17 +69,27 @@
      * @param[in]  overflow_policy Overflow policy. ConvertPolicy cannot be WRAP if any of the inputs is of quantized datatype
      * @param[in]  rounding_policy Rounding policy.
      */
-    void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+    void configure(ITensorInfo   *src1,
+                   ITensorInfo   *src2,
+                   ITensorInfo   *dst,
+                   float          scale,
+                   ConvertPolicy  overflow_policy,
+                   RoundingPolicy rounding_policy);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref CpuMulKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy);
+    static Status validate(const ITensorInfo *src1,
+                           const ITensorInfo *src2,
+                           const ITensorInfo *dst,
+                           float              scale,
+                           ConvertPolicy      overflow_policy,
+                           RoundingPolicy     rounding_policy);
 
     // Inherited methods overridden
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     /** Return minimum workload size of the relevant kernel
@@ -108,7 +119,8 @@
      * @param[in]  window Region on which to execute the kernel
      * @param[in]  scale  Integer scale factor.
      */
-    using MulFunctionInt = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale);
+    using MulFunctionInt =
+        void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, int scale);
     /** Common signature for all the specialised multiplication functions with float scaling factor
      *
      * @param[in]  src1   Src1 tensor object.
@@ -117,7 +129,8 @@
      * @param[in]  window Region on which to execute the kernel
      * @param[in]  scale  Float scale factor.
      */
-    using MulFunctionFloat = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
+    using MulFunctionFloat =
+        void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
     /** Common signature for all the specialised QASYMM8 multiplication functions with float scaling factor
      *
      * @param[in]  src1   Src1 tensor object.
@@ -127,14 +140,15 @@
      * @param[in]  scale  Float scale factor.
      *
      */
-    using MulFunctionQuantized = void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
+    using MulFunctionQuantized =
+        void(const ITensor *src1, const ITensor *src2, ITensor *dst, const Window &window, float scale);
 
-    MulFunctionFloat     *_func_float{ nullptr };
-    MulFunctionInt       *_func_int{ nullptr };
-    MulFunctionQuantized *_func_quantized{ nullptr };
-    float                 _scale{ 0 };
-    int                   _scale_exponent{ 0 };
-    size_t                _split_dimension{ Window::DimY };
+    MulFunctionFloat     *_func_float{nullptr};
+    MulFunctionInt       *_func_int{nullptr};
+    MulFunctionQuantized *_func_quantized{nullptr};
+    float                 _scale{0};
+    int                   _scale_exponent{0};
+    size_t                _split_dimension{Window::DimY};
 };
 
 /** Interface for the complex pixelwise multiplication kernel. */
@@ -159,7 +173,7 @@
     static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 };
 } // namespace kernels
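The hunks above are typical of the revised style: parameter lists that exceed the column limit are broken one parameter per line, consecutive declarations are aligned on the declarator (hence `void        run_op` next to `const char *name()`), and braced initializers lose their inner padding ({ nullptr } becomes {nullptr}). A .clang-format excerpt consistent with this output is sketched below; the shipped configuration is not part of this delivery, so every key here is an inference from the diff, not the actual file.

    # Hypothetical .clang-format excerpt inferred from the reformatted output;
    # the real configuration file is not included in this change.
    BasedOnStyle:                 LLVM
    ColumnLimit:                  120
    PointerAlignment:             Right
    BinPackParameters:            false
    AlignConsecutiveDeclarations: true
    AlignConsecutiveAssignments:  true
    Cpp11BracedListStyle:         true
    SpaceBeforeParens:            ControlStatements
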
diff --git a/src/cpu/kernels/CpuPermuteKernel.cpp b/src/cpu/kernels/CpuPermuteKernel.cpp
index d65e011..b444a25 100644
--- a/src/cpu/kernels/CpuPermuteKernel.cpp
+++ b/src/cpu/kernels/CpuPermuteKernel.cpp
@@ -28,8 +28,9 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -48,56 +49,31 @@
 {
 inline bool is_permutation_supported(const PermutationVector &v)
 {
-    static const std::array<PermutationVector, 2> permutations2 =
-    {
-        {
-            PermutationVector(0U, 1U),
-            PermutationVector(1U, 0U),
-        }
-    };
-    static const std::array<PermutationVector, 6> permutations3 =
-    {
-        {
-            PermutationVector(2U, 0U, 1U),
-            PermutationVector(1U, 2U, 0U),
-            PermutationVector(0U, 1U, 2U),
-            PermutationVector(0U, 2U, 1U),
-            PermutationVector(1U, 0U, 2U),
-            PermutationVector(2U, 1U, 0U),
-        }
-    };
-    static const std::array<PermutationVector, 24> permutations4 =
-    {
-        {
-            PermutationVector(0U, 1U, 2U, 3U),
-            PermutationVector(1U, 0U, 2U, 3U),
-            PermutationVector(2U, 0U, 1U, 3U),
-            PermutationVector(0U, 2U, 1U, 3U),
-            PermutationVector(1U, 2U, 0U, 3U),
-            PermutationVector(2U, 1U, 0U, 3U),
-            PermutationVector(2U, 1U, 3U, 0U),
-            PermutationVector(1U, 2U, 3U, 0U),
-            PermutationVector(3U, 2U, 1U, 0U),
-            PermutationVector(2U, 3U, 1U, 0U),
-            PermutationVector(1U, 3U, 2U, 0U),
-            PermutationVector(3U, 1U, 2U, 0U),
-            PermutationVector(3U, 0U, 2U, 1U),
-            PermutationVector(0U, 3U, 2U, 1U),
-            PermutationVector(2U, 3U, 0U, 1U),
-            PermutationVector(3U, 2U, 0U, 1U),
-            PermutationVector(0U, 2U, 3U, 1U),
-            PermutationVector(2U, 0U, 3U, 1U),
-            PermutationVector(1U, 0U, 3U, 2U),
-            PermutationVector(0U, 1U, 3U, 2U),
-            PermutationVector(3U, 1U, 0U, 2U),
-            PermutationVector(1U, 3U, 0U, 2U),
-            PermutationVector(0U, 3U, 1U, 2U),
-            PermutationVector(3U, 0U, 1U, 2U)
-        }
-    };
+    static const std::array<PermutationVector, 2>  permutations2 = {{
+         PermutationVector(0U, 1U),
+         PermutationVector(1U, 0U),
+    }};
+    static const std::array<PermutationVector, 6>  permutations3 = {{
+         PermutationVector(2U, 0U, 1U),
+         PermutationVector(1U, 2U, 0U),
+         PermutationVector(0U, 1U, 2U),
+         PermutationVector(0U, 2U, 1U),
+         PermutationVector(1U, 0U, 2U),
+         PermutationVector(2U, 1U, 0U),
+    }};
+    static const std::array<PermutationVector, 24> permutations4 = {
+        {PermutationVector(0U, 1U, 2U, 3U), PermutationVector(1U, 0U, 2U, 3U), PermutationVector(2U, 0U, 1U, 3U),
+         PermutationVector(0U, 2U, 1U, 3U), PermutationVector(1U, 2U, 0U, 3U), PermutationVector(2U, 1U, 0U, 3U),
+         PermutationVector(2U, 1U, 3U, 0U), PermutationVector(1U, 2U, 3U, 0U), PermutationVector(3U, 2U, 1U, 0U),
+         PermutationVector(2U, 3U, 1U, 0U), PermutationVector(1U, 3U, 2U, 0U), PermutationVector(3U, 1U, 2U, 0U),
+         PermutationVector(3U, 0U, 2U, 1U), PermutationVector(0U, 3U, 2U, 1U), PermutationVector(2U, 3U, 0U, 1U),
+         PermutationVector(3U, 2U, 0U, 1U), PermutationVector(0U, 2U, 3U, 1U), PermutationVector(2U, 0U, 3U, 1U),
+         PermutationVector(1U, 0U, 3U, 2U), PermutationVector(0U, 1U, 3U, 2U), PermutationVector(3U, 1U, 0U, 2U),
+         PermutationVector(1U, 3U, 0U, 2U), PermutationVector(0U, 3U, 1U, 2U), PermutationVector(3U, 0U, 1U, 2U)}};
 
-    return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) || (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v))
-           || (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v));
+    return (permutations2.end() != std::find(permutations2.begin(), permutations2.end(), v)) ||
+           (permutations3.end() != std::find(permutations3.begin(), permutations3.end(), v)) ||
+           (permutations4.end() != std::find(permutations4.begin(), permutations4.end(), v));
 }
 
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
@@ -108,7 +84,7 @@
     const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
 
     // Validate configured destination
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
@@ -128,18 +104,22 @@
 
     // we only support these two configs in src/core/NEON/kernels/convolution/common/shims.hpp, for all others
     // we have to fall back to C++
-    if((src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U }) || (src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U }))
+    if ((src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U}) ||
+        (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U}))
     {
-        window_src.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
-        window_src.set(Window::DimY, Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
-        window_src.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
+        window_src.set(Window::DimX,
+                       Window::Dimension(window.x().start(), window.x().end(), window.x().end() - window.x().start()));
+        window_src.set(Window::DimY,
+                       Window::Dimension(window.y().start(), window.y().end(), window.y().end() - window.y().start()));
+        window_src.set(Window::DimZ,
+                       Window::Dimension(window.z().start(), window.z().end(), window.z().end() - window.z().start()));
         window_src.set(3, Window::Dimension(window[3].start(), window[3].end(), window[3].end() - window[3].start()));
     }
 
     // Destination window
     Window                  window_dst(window);
     const Window::Dimension zero_window = Window::Dimension(0, 0, 0);
-    for(size_t d = 0; d <= dst->info()->num_dimensions(); ++d)
+    for (size_t d = 0; d <= dst->info()->num_dimensions(); ++d)
     {
         window_dst.set(d, zero_window);
     }
@@ -157,7 +137,7 @@
     int n_channels        = 0;
     int n_batches         = 0;
 
-    switch(src_layout)
+    switch (src_layout)
     {
         case DataLayout::NCHW:
         {
@@ -189,38 +169,42 @@
     }
 
     // CHW -> HWC
-    if(src_layout == DataLayout::NCHW && perm == PermutationVector{ 2U, 0U, 1U })
+    if (src_layout == DataLayout::NCHW && perm == PermutationVector{2U, 0U, 1U})
     {
         const int out_channel_stride = dst->info()->strides_in_bytes().x() / sizeof(T);
         const int out_col_stride     = dst->info()->strides_in_bytes().y() / sizeof(T);
         const int out_row_stride     = dst->info()->strides_in_bytes().z() / sizeof(T);
         const int out_batch_stride   = dst->info()->strides_in_bytes()[3] / sizeof(T);
-        execute_window_loop(window_src, [&](const Coordinates & id)
-        {
-            const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride;
-            reorder::nchw_to_nhwc(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx,
-                                  n_batches, n_channels, n_rows, n_cols,
-                                  in_batch_stride, in_channel_stride, in_row_stride,
-                                  out_batch_stride, out_row_stride, out_col_stride);
-        },
-        src_it, dst_it);
+        execute_window_loop(
+            window_src,
+            [&](const Coordinates &id)
+            {
+                const int idx = id[0] * out_col_stride + id[1] * out_row_stride + id[2] * out_channel_stride;
+                reorder::nchw_to_nhwc(reinterpret_cast<const T *>(src_it.ptr()),
+                                      reinterpret_cast<T *>(dst_it.ptr()) + idx, n_batches, n_channels, n_rows, n_cols,
+                                      in_batch_stride, in_channel_stride, in_row_stride, out_batch_stride,
+                                      out_row_stride, out_col_stride);
+            },
+            src_it, dst_it);
     }
     // HWC -> CHW
-    else if(src_layout == DataLayout::NHWC && perm == PermutationVector{ 1U, 2U, 0U })
+    else if (src_layout == DataLayout::NHWC && perm == PermutationVector{1U, 2U, 0U})
     {
         const int out_col_stride     = dst->info()->strides_in_bytes().x() / sizeof(T);
         const int out_row_stride     = dst->info()->strides_in_bytes().y() / sizeof(T);
         const int out_channel_stride = dst->info()->strides_in_bytes().z() / sizeof(T);
         const int out_batch_stride   = dst->info()->strides_in_bytes()[3] / sizeof(T);
-        execute_window_loop(window_src, [&](const Coordinates & id)
-        {
-            const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride;
-            reorder::nhwc_to_nchw(reinterpret_cast<const T *>(src_it.ptr()), reinterpret_cast<T *>(dst_it.ptr()) + idx,
-                                  n_batches, n_rows, n_cols, n_channels,
-                                  in_batch_stride, in_row_stride, in_col_stride,
-                                  out_batch_stride, out_channel_stride, out_row_stride);
-        },
-        src_it, dst_it);
+        execute_window_loop(
+            window_src,
+            [&](const Coordinates &id)
+            {
+                const int idx = id[0] * out_channel_stride + id[1] * out_col_stride + id[2] * out_row_stride;
+                reorder::nhwc_to_nchw(reinterpret_cast<const T *>(src_it.ptr()),
+                                      reinterpret_cast<T *>(dst_it.ptr()) + idx, n_batches, n_rows, n_cols, n_channels,
+                                      in_batch_stride, in_row_stride, in_col_stride, out_batch_stride,
+                                      out_channel_stride, out_row_stride);
+            },
+            src_it, dst_it);
     }
     else
     {
@@ -230,12 +214,15 @@
         Strides perm_strides = strides;
         permute_strides(perm_strides, perm);
         const int perm_stride_3 = src->info()->num_dimensions() >= 4 ? perm_strides[3] : 0;
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const int idx                                = id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3;
-            *(reinterpret_cast<T *>(dst_it.ptr() + idx)) = *(reinterpret_cast<const T *>(src_it.ptr()));
-        },
-        src_it, dst_it);
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const int idx =
+                    id[0] * perm_strides[0] + id[1] * perm_strides[1] + id[2] * perm_strides[2] + id[3] * perm_stride_3;
+                *(reinterpret_cast<T *>(dst_it.ptr() + idx)) = *(reinterpret_cast<const T *>(src_it.ptr()));
+            },
+            src_it, dst_it);
     }
 }
 } // namespace
@@ -275,7 +262,7 @@
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
 
-    switch(src->info()->element_size())
+    switch (src->info()->element_size())
     {
         case 1:
             run_permute<uint8_t>(window, src, dst, _perm);
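The execute_window_loop changes in this file show the two lambda treatments: a one-expression lambda stays on a single line, while a multi-statement lambda now gets its opening brace on its own line and the enclosing call is reflowed around it. A minimal, self-contained C++ sketch of both shapes follows, assuming options along the lines of AllowShortLambdasOnASingleLine: All plus BreakBeforeBraces: Custom with BraceWrapping.BeforeLambdaBody: true; window_loop, Coordinates, and the strides are illustrative stand-ins, not Compute Library API.

    #include <cstdio>
    #include <functional>

    struct Coordinates
    {
        int v[4];
        int operator[](int i) const { return v[i]; }
    };

    // Stand-in for execute_window_loop: invokes the visitor once.
    void window_loop(const std::function<void(const Coordinates &)> &fn)
    {
        fn(Coordinates{{1, 2, 0, 0}});
    }

    int main()
    {
        const int stride0 = 4, stride1 = 16;

        // Short one-expression lambda: kept on a single line.
        auto is_even = [](int x) { return x % 2 == 0; };

        // Multi-statement lambda: the brace now opens on its own line.
        window_loop(
            [&](const Coordinates &id)
            {
                const int idx = id[0] * stride0 + id[1] * stride1;
                std::printf("idx=%d even=%d\n", idx, is_even(idx));
            });
        return 0;
    }
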
diff --git a/src/cpu/kernels/CpuPermuteKernel.h b/src/cpu/kernels/CpuPermuteKernel.h
index 9e1b933..0cb2faf 100644
--- a/src/cpu/kernels/CpuPermuteKernel.h
+++ b/src/cpu/kernels/CpuPermuteKernel.h
@@ -57,7 +57,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
diff --git a/src/cpu/kernels/CpuPool2dKernel.cpp b/src/cpu/kernels/CpuPool2dKernel.cpp
index d72a41c..9308d86 100644
--- a/src/cpu/kernels/CpuPool2dKernel.cpp
+++ b/src/cpu/kernels/CpuPool2dKernel.cpp
@@ -25,15 +25,17 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-#include "src/cpu/kernels/pool2d/neon/list.h"
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/pool2d/neon/list.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
@@ -46,99 +48,111 @@
 {
 using namespace misc::shape_calculator;
 
-static const std::vector<CpuPool2dKernel::PoolingKernel> available_kernels =
-{
-    {
-        "neon_qu8_nhwc_poolMxN",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)
-    },
-    {
-        "neon_qs8_nhwc_poolMxN",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)
-    },
-    {
-        "neon_f16_nhwc_poolMxN",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)
-    },
-    {
-        "neon_fp32_nhwc_poolMxN",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); },
-        REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)
-    },
+static const std::vector<CpuPool2dKernel::PoolingKernel> available_kernels = {
+    {"neon_qu8_nhwc_poolMxN",
+     [](const PoolDataTypeISASelectorData &data)
+     { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8)); },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_qasymm8_neon_nhwc)},
+    {"neon_qs8_nhwc_poolMxN",
+     [](const PoolDataTypeISASelectorData &data)
+     { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::QASYMM8_SIGNED)); },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_qasymm8_signed_neon_nhwc)},
+    {"neon_f16_nhwc_poolMxN",
+     [](const PoolDataTypeISASelectorData &data)
+     { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F16)) && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nhwc)},
+    {"neon_fp32_nhwc_poolMxN",
+     [](const PoolDataTypeISASelectorData &data)
+     { return ((data.dl == DataLayout::NHWC) && (data.dt == DataType::F32)); },
+     REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nhwc)},
 #if defined(ENABLE_NCHW_KERNELS)
-    {
-        "neon_qu8_nchw_pool2",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>)
-    },
-    {
-        "neon_qu8_nchw_pool3",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>)
-    },
-    {
-        "neon_qu8_nchw_poolMxN",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>)
-    },
-    {
-        "neon_qs8_nchw_pool2",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3)); },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>)
-    },
-    {
-        "neon_qs8_nchw_pool3",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3)); },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>)
-    },
-    {
-        "neon_qs8_nchw_poolMxN",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>)
-    },
-    {
-        "neon_fp16_nchw_pool2",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
-        REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)
-    },
-    {
-        "neon_fp16_nchw_pool3",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
-        REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)
-    },
-    {
-        "neon_fp16_nchw_poolMxN",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); },
-        REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)
-    },
-    {
-        "neon_fp32_nchw_pool2",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2)); },
-        REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)
-    },
-    {
-        "neon_fp32_nchw_pool3",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3)); },
-        REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)
-    },
-    {
-        "neon_fp32_nchw_pool7",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) && (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7)); },
-        REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)
-    },
-    {
-        "neon_fp32_nchw_poolMxN",
-        [](const PoolDataTypeISASelectorData & data) { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); },
-        REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)
-    },
+    {"neon_qu8_nchw_pool2",
+     [](const PoolDataTypeISASelectorData &data)
+     {
+         return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) &&
+                 (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3));
+     },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<uint8_t>)},
+    {"neon_qu8_nchw_pool3",
+     [](const PoolDataTypeISASelectorData &data)
+     {
+         return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8) &&
+                 (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3));
+     },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<uint8_t>)},
+    {"neon_qu8_nchw_poolMxN",
+     [](const PoolDataTypeISASelectorData &data)
+     { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8)); },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<uint8_t>)},
+    {"neon_qs8_nchw_pool2",
+     [](const PoolDataTypeISASelectorData &data)
+     {
+         return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) &&
+                 (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2) && (data.pool_stride_x < 3));
+     },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling2_quantized_neon_nchw<int8_t>)},
+    {"neon_qs8_nchw_pool3",
+     [](const PoolDataTypeISASelectorData &data)
+     {
+         return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED) &&
+                 (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3) && (data.pool_stride_x < 3));
+     },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::pooling3_quantized_neon_nchw<int8_t>)},
+    {"neon_qs8_nchw_poolMxN",
+     [](const PoolDataTypeISASelectorData &data)
+     { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::QASYMM8_SIGNED)); },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::poolingMxN_quantized_neon_nchw<int8_t>)},
+    {"neon_fp16_nchw_pool2",
+     [](const PoolDataTypeISASelectorData &data)
+     {
+         return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) &&
+                 (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2));
+     },
+     REGISTER_FP16_NEON(arm_compute::cpu::pooling2_fp16_neon_nchw)},
+    {"neon_fp16_nchw_pool3",
+     [](const PoolDataTypeISASelectorData &data)
+     {
+         return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16) &&
+                 (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3));
+     },
+     REGISTER_FP16_NEON(arm_compute::cpu::pooling3_fp16_neon_nchw)},
+    {"neon_fp16_nchw_poolMxN",
+     [](const PoolDataTypeISASelectorData &data)
+     { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F16 && data.isa.fp16)); },
+     REGISTER_FP16_NEON(arm_compute::cpu::poolingMxN_fp16_neon_nchw)},
+    {"neon_fp32_nchw_pool2",
+     [](const PoolDataTypeISASelectorData &data)
+     {
+         return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) &&
+                 (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 2));
+     },
+     REGISTER_FP32_NEON(arm_compute::cpu::pooling2_fp32_neon_nchw)},
+    {"neon_fp32_nchw_pool3",
+     [](const PoolDataTypeISASelectorData &data)
+     {
+         return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) &&
+                 (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 3));
+     },
+     REGISTER_FP32_NEON(arm_compute::cpu::pooling3_fp32_neon_nchw)},
+    {"neon_fp32_nchw_pool7",
+     [](const PoolDataTypeISASelectorData &data)
+     {
+         return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32) &&
+                 (data.pool_size.x() == data.pool_size.y()) && (data.pool_size.x() == 7));
+     },
+     REGISTER_FP32_NEON(arm_compute::cpu::pooling7_fp32_neon_nchw)},
+    {"neon_fp32_nchw_poolMxN",
+     [](const PoolDataTypeISASelectorData &data)
+     { return ((data.dl == DataLayout::NCHW) && (data.dt == DataType::F32)); },
+     REGISTER_FP32_NEON(arm_compute::cpu::poolingMxN_fp32_neon_nchw)},
 #endif /* defined(ENABLE_NCHW_KERNELS) */
 };
 
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info,
-                          const ITensorInfo *indices, Size2D pool_size)
+Status validate_arguments(const ITensorInfo      *src,
+                          const ITensorInfo      *dst,
+                          const PoolingLayerInfo &pool_info,
+                          const ITensorInfo      *indices,
+                          Size2D                  pool_size)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(pool_size.x() == 0);
@@ -150,65 +164,78 @@
     int                 output_height   = 0;
     PoolingType         pool_type       = pool_info.pool_type;
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
-    const auto          data_layout     = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
-    const int           idx_width       = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int           idx_height      = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type()))
-                                    && (is_pool_region_entirely_outside_input(pool_info)),
-                                    "Pooling region that is entirely outside input tensor is unsupported for non-float types");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (!is_data_type_float(src->data_type())) && (is_pool_region_entirely_outside_input(pool_info)),
+        "Pooling region that is entirely outside input tensor is unsupported for non-float types");
 
-    std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
-                                                                     pool_size.x(), pool_size.y(), pool_info.pad_stride_info);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid");
+    std::tie(output_width, output_height) =
+        scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size.x(),
+                                 pool_size.y(), pool_info.pad_stride_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1),
+                                    "Calculated output dimension size is invalid");
 
     TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type()));
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
 
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
-    if(indices)
+    if (indices)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32, DataType::F16);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indices, 1, DataType::U32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_type != PoolingType::MAX,
+                                        "Pooling indices only supported for MAX pooling method");
     }
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(pool_type == PoolingType::L2 && is_data_type_quantized(src->data_type()));
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding()
-                                    && (src->data_layout() == DataLayout::NHWC),
-                                    "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        is_data_type_quantized(src->data_type()) && !pool_info.exclude_padding &&
+            (pool_info.pool_type == PoolingType::AVG) && pool_info.pad_stride_info.has_padding() &&
+            (src->data_layout() == DataLayout::NHWC),
+        "exclude_padding equal false is not supported for AVG Pooling with padding on quantized types");
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
-        if(indices)
+        if (indices)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices), "Pooling indices returning source tensor coordinates is only supported for pool size 2x2");
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC), "Pooling kernel indices only supported for NHWC");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                ((pool_size != Size2D(2, 2)) && !pool_info.use_kernel_indices),
+                "Pooling indices returning source tensor coordinates is only supported for pool size 2x2");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.use_kernel_indices && (src->data_layout() != DataLayout::NHWC),
+                                            "Pooling kernel indices only supported for NHWC");
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &out_info);
         }
     }
 
-    const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa() });
+    const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{
+        src->data_type(), src->data_layout(), pool_stride_x, pool_size, CPUInfo::get().get_isa()});
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, const PoolingLayerInfo &pool_info,
-                                                        unsigned int &num_elems_processed_per_iteration,
-                                                        int pool_size_x, int pool_size_y)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo            *src,
+                                                        ITensorInfo            *dst,
+                                                        ITensorInfo            *indices,
+                                                        const PoolingLayerInfo &pool_info,
+                                                        unsigned int           &num_elems_processed_per_iteration,
+                                                        int                     pool_size_x,
+                                                        int                     pool_size_y)
 {
     // dst auto initialization if not yet initialized
     auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info)));
-    if(indices)
+    if (indices)
     {
         // Indices auto initialization if not yet initialized
-        auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src,
-                                                                                        pool_info)))
-                           .set_data_type(DataType::U32) /* we store the offset to the element */);
+        auto_init_if_empty(*indices, (src->clone()->set_tensor_shape(compute_pool_shape(*src, pool_info)))
+                                         .set_data_type(DataType::U32) /* we store the offset to the element */);
     }
     const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
 
@@ -219,20 +246,20 @@
     const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
 
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-    const bool         is_square = pool_size_x == pool_size_y;
-    const unsigned int pooled_w  = dst->dimension(idx_width);
-    const unsigned int pooled_h  = dst->dimension(idx_height);
+    const bool         is_square           = pool_size_x == pool_size_y;
+    const unsigned int pooled_w            = dst->dimension(idx_width);
+    const unsigned int pooled_h            = dst->dimension(idx_height);
 
     // If the pool size is not square, the generic MxN implementation is executed
     num_elems_processed_per_iteration = 1;
 
-    if(is_square)
+    if (is_square)
     {
-        switch(src->data_type())
+        switch (src->data_type())
         {
             case DataType::QASYMM8:
             case DataType::QASYMM8_SIGNED:
-                switch(pool_size_x)
+                switch (pool_size_x)
                 {
                     case 2:
                         num_elems_processed_per_iteration = (pool_stride_x == 2) ? 8 : 15;
@@ -261,18 +288,22 @@
     bool   window_changed = false;
     Window win{};
     // Upper limit for the number of right/bottom border elements that are accessed
-    TensorShape dst_shape{ src->tensor_shape() };
+    TensorShape dst_shape{src->tensor_shape()};
     dst_shape.set(0, pooled_w);
     dst_shape.set(1, pooled_h);
     TensorInfo dst_info(src->clone()->set_tensor_shape(dst_shape));
     win = calculate_max_window(dst_info, Steps(num_elems_processed_per_iteration));
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
 
-void CpuPool2dKernel::configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+void CpuPool2dKernel::configure(ITensorInfo            *src,
+                                ITensorInfo            *dst,
+                                const PoolingLayerInfo &pool_info,
+                                ITensorInfo            *indices)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     const PadStrideInfo pad_stride_info   = pool_info.pad_stride_info;
@@ -284,14 +315,15 @@
     const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     // Update pool size in case of global pooling
-    const Size2D pool_size(
-        is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
-        is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height);
+    const Size2D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
+                           is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height);
 
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices, pool_size));
 
-    const auto *uk = CpuPool2dKernel::get_implementation(PoolDataTypeISASelectorData{ src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first, pool_size, CPUInfo::get().get_isa() });
+    const auto *uk = CpuPool2dKernel::get_implementation(
+        PoolDataTypeISASelectorData{src->data_type(), src->data_layout(), (int)pad_stride_info.stride().first,
+                                    pool_size, CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr);
 
     // Set instance variables
@@ -302,7 +334,7 @@
     _run_method    = uk->ukernel;
     _name          = std::string("CpuPool2dKernel").append("/").append(uk->name);
 
-    if(_data_layout == DataLayout::NHWC)
+    if (_data_layout == DataLayout::NHWC)
     {
         // Configure kernel window
         Window win = calculate_max_window(*dst, Steps());
@@ -311,14 +343,17 @@
     else
     {
         // Configure kernel window
-        auto win_config = validate_and_configure_window(src, dst, indices, pool_info, _num_elems_processed_per_iteration,
-                                                        pool_size.x(), pool_size.y());
+        auto win_config = validate_and_configure_window(
+            src, dst, indices, pool_info, _num_elems_processed_per_iteration, pool_size.x(), pool_size.y());
         ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
         ICpuKernel::configure(win_config.second);
     }
 }
 
-Status CpuPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CpuPool2dKernel::validate(const ITensorInfo      *src,
+                                 const ITensorInfo      *dst,
+                                 const PoolingLayerInfo &pool_info,
+                                 const ITensorInfo      *indices)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
 
@@ -336,9 +371,10 @@
 
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices, Size2D(pool_size_x, pool_size_y)));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(),
-                                                              (indices) ? indices->clone().get() : nullptr, pool_info, num_elems_processed_per_iteration,
-                                                              pool_size_x, pool_size_y)
-                                .first);
+                                                              (indices) ? indices->clone().get() : nullptr, pool_info,
+                                                              num_elems_processed_per_iteration, pool_size_x,
+                                                              pool_size_y)
+                                    .first);
 
     return Status{};
 }
@@ -359,19 +395,20 @@
     const unsigned int pool_size     = _pool_info.pool_size.width;
 
     Window window_src(window);
-    if(_data_layout == DataLayout::NCHW)
+    if (_data_layout == DataLayout::NCHW)
     {
         // Set step for src in x and y direction for the src
         unsigned int window_x_inc = 0;
-        switch(src->info()->data_type())
+        switch (src->info()->data_type())
         {
             case DataType::QASYMM8:
             case DataType::QASYMM8_SIGNED:
             {
                 window_x_inc = pool_stride_x;
-                if((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
+                if ((pool_size == 2 || pool_size == 3) && pool_stride_x < 3)
                 {
-                    window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration;
+                    window_x_inc = (pool_stride_x == 2) ? _num_elems_processed_per_iteration * 2
+                                                        : _num_elems_processed_per_iteration;
                 }
                 break;
             }
@@ -387,8 +424,10 @@
                 ARM_COMPUTE_ERROR("Not supported");
             }
         }
-        window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc));
-        window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y));
+        window_src.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x,
+                                                       window.x().end() * pool_stride_x, window_x_inc));
+        window_src.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y,
+                                                       window.y().end() * pool_stride_y, pool_stride_y));
     }
     else
     {
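The include hunks in CpuPermuteKernel.cpp and CpuPool2dKernel.cpp show the new header ordering: arm_compute/ headers first, then src/ headers, then system headers, with a blank line between groups and case-insensitive sorting inside each group (so utils/misc/ShapeCalculator.h now precedes Validate.h, and common/Registrars.h precedes CPP/Validate.h). The resulting block at the top of CpuPool2dKernel.cpp, reproduced for reference:

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"
    #include "arm_compute/core/Validate.h"
    #include "arm_compute/core/Window.h"

    #include "src/core/common/Registrars.h"
    #include "src/core/CPP/Validate.h"
    #include "src/core/helpers/AutoConfiguration.h"
    #include "src/core/helpers/WindowHelpers.h"
    #include "src/core/NEON/wrapper/wrapper.h"
    #include "src/cpu/kernels/pool2d/neon/list.h"

    #include <arm_neon.h>

In clang-format terms this behaviour matches SortIncludes: CaseInsensitive with IncludeBlocks/IncludeCategories configured per path prefix, though the exact keys are an assumption since the configuration file is not part of this delivery.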
diff --git a/src/cpu/kernels/CpuPool2dKernel.h b/src/cpu/kernels/CpuPool2dKernel.h
index c952ea8..859de8c 100644
--- a/src/cpu/kernels/CpuPool2dKernel.h
+++ b/src/cpu/kernels/CpuPool2dKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_POOL2D_KERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -38,7 +39,8 @@
 class CpuPool2dKernel : public ICpuKernel<CpuPool2dKernel>
 {
 private:
-    using PoolingKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
+    using PoolingKernelPtr = std::add_pointer<void(
+        const ITensor *, ITensor *, ITensor *, PoolingLayerInfo &, const Window &, const Window &)>::type;
 
 public:
     CpuPool2dKernel() = default;
@@ -52,17 +54,21 @@
      * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      * @param[out] indices   (optional) The indices of the maximal values. Data type supported: U32.
      */
-    void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+    void
+    configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuPool2dKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+    static Status validate(const ITensorInfo      *src,
+                           const ITensorInfo      *dst,
+                           const PoolingLayerInfo &pool_info,
+                           const ITensorInfo      *indices = nullptr);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     struct PoolingKernel
@@ -76,11 +82,11 @@
 
 private:
     PoolingLayerInfo _pool_info{};
-    DataLayout       _data_layout{ DataLayout::UNKNOWN };
-    unsigned int     _num_elems_processed_per_iteration{ 0 };
+    DataLayout       _data_layout{DataLayout::UNKNOWN};
+    unsigned int     _num_elems_processed_per_iteration{0};
     Size2D           _pool_size{};
     int              _pool_stride_x{};
-    PoolingKernelPtr _run_method{ nullptr };
+    PoolingKernelPtr _run_method{nullptr};
     std::string      _name{};
 };
 } // namespace kernels
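Each kernel class keeps a static table pairing a kernel name, a selector predicate over data type/layout/ISA, and a function pointer wrapped in a REGISTER_* macro; get_implementation walks the table and returns the first matching entry. A condensed C++ sketch of that dispatch idiom, with hypothetical stand-in types (SelectorData, KernelFn) in place of the library's selector structs:

    #include <cstdio>
    #include <vector>

    struct SelectorData { int dt; };  // stand-in for PoolDataTypeISASelectorData
    using KernelFn = void (*)();      // stand-in for the registered ukernel type

    struct Kernel
    {
        const char *name;
        bool (*is_selected)(const SelectorData &);
        KernelFn    ukernel;
    };

    void fp32_impl() { std::puts("fp32 kernel"); }

    // Captureless lambdas convert to plain function pointers, which is what
    // lets the selectors live in a static table of POD entries.
    static const std::vector<Kernel> kernels = {
        {"fp32_kernel", [](const SelectorData &d) { return d.dt == 0; }, fp32_impl},
    };

    const Kernel *get_implementation(const SelectorData &d)
    {
        for (const auto &k : kernels)
            if (k.is_selected(d))
                return &k;
        return nullptr;
    }

    int main()
    {
        const Kernel *k = get_implementation(SelectorData{0});
        if (k != nullptr)
            k->ukernel(); // prints "fp32 kernel"
        return 0;
    }
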
diff --git a/src/cpu/kernels/CpuPool3dKernel.cpp b/src/cpu/kernels/CpuPool3dKernel.cpp
index 4504f3f..8b484d4 100644
--- a/src/cpu/kernels/CpuPool3dKernel.cpp
+++ b/src/cpu/kernels/CpuPool3dKernel.cpp
@@ -25,8 +25,9 @@
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/CPP/Validate.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/pool3d/list.h"
@@ -41,39 +42,28 @@
 {
 using namespace misc::shape_calculator;
 
-static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels =
-{
-    {
-        "neon_qu8_ndhwc_poolMxNxD",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d)
-    },
-    {
-        "neon_qs8_ndhwc_poolMxNxD",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d)
-    },
-    {
-        "neon_fp16_ndhwc_poolMxNxD",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16 && data.isa.fp16); },
-        REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)
-    },
-    {
-        "neon_fp32_ndhwc_poolMxNxD",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
-        REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)
-    }
-};
+static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels = {
+    {"neon_qu8_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_q8_pool3d)},
+    {"neon_qs8_ndhwc_poolMxNxD",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_q8_signed_pool3d)},
+    {"neon_fp16_ndhwc_poolMxNxD",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16 && data.isa.fp16); },
+     REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)},
+    {"neon_fp32_ndhwc_poolMxNxD", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+     REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)}};
 
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding
-                                                                                && (pool_info.pool_type == PoolingType::AVG)),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) &&
+                                        (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)),
                                     "Exclude padding is unsupported for non-float types for Avg op");
 
     const auto data_layout = src->data_layout();
@@ -97,21 +87,26 @@
     int output_height = 0;
     int output_depth  = 0;
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info),
+                                    "Pooling region that is entirely outside input tensor is unsupported");
 
-    std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], src->tensor_shape()[idx_depth],
-                                                                                      pool_size_x, pool_size_y, pool_size_z, pool_info);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid");
+    std::tie(output_width, output_height, output_depth) =
+        scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
+                                    src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1),
+                                    "Calculated output dimension size is invalid");
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
-        TensorInfo out_info(TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC));
+        TensorInfo out_info(
+            TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
     }
 
-    const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+    const auto *uk =
+        CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     return Status{};
@@ -136,12 +131,12 @@
 
     // Update pool size in case of global pooling
     const bool   is_global_pooling = pool_info.is_global_pooling;
-    const Size3D pool_size(
-        is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
-        is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height,
-        is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth);
+    const Size3D pool_size(is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
+                           is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height,
+                           is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth);
 
-    const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+    const auto *uk =
+        CpuPool3dKernel::get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr);
 
     // Set instance variables
@@ -188,4 +183,4 @@
 
 } // namespace kernels
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
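The REGISTER_QASYMM8_NEON / REGISTER_FP32_NEON wrappers seen throughout these tables come from src/core/common/Registrars.h and let a table entry compile even when that data-type backend is disabled. A plausible sketch of the idiom (an assumption; the real macro definitions are not shown in this diff):

    #include <cstdio>

    // Hypothetical sketch: when the backend is compiled out, the macro
    // collapses to nullptr so the table entry still compiles, which is why
    // callers check uk->ukernel != nullptr before dispatching.
    #if defined(ENABLE_FP32_KERNELS)
    #define REGISTER_FP32_NEON(func_name) &(func_name)
    #else
    #define REGISTER_FP32_NEON(func_name) nullptr
    #endif

    void pooling_fp32() { std::puts("fp32 pooling"); }

    void (*entry)() = REGISTER_FP32_NEON(pooling_fp32);
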
diff --git a/src/cpu/kernels/CpuPool3dKernel.h b/src/cpu/kernels/CpuPool3dKernel.h
index 437f2af..bd1ff61 100644
--- a/src/cpu/kernels/CpuPool3dKernel.h
+++ b/src/cpu/kernels/CpuPool3dKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_POOL3D_KERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -39,7 +40,8 @@
 {
 private:
     /* Template function for Pooling 3D NDHWC */
-    using Pooling3dKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, Pooling3dLayerInfo &, const Window &)>::type;
+    using Pooling3dKernelPtr =
+        std::add_pointer<void(const ITensor *, ITensor *, Pooling3dLayerInfo &, const Window &)>::type;
 
 public:
     CpuPool3dKernel() = default;
@@ -68,7 +70,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     struct Pooling3dKernel
@@ -82,11 +84,11 @@
 
 private:
     Pooling3dLayerInfo _pool_info{};
-    Pooling3dKernelPtr _run_method{ nullptr };
+    Pooling3dKernelPtr _run_method{nullptr};
     std::string        _name{};
 };
 
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */
\ No newline at end of file
+#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */
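Both pool kernel headers declare their run methods through std::add_pointer over a function type, which reads more uniformly than spelled-out pointer syntax once clang-format wraps the alias across lines. The two forms are exactly equivalent:

    #include <type_traits>

    // Identical types: pointer to void(const int *, int *).
    using FnA = std::add_pointer<void(const int *, int *)>::type;
    using FnB = void (*)(const int *, int *);
    static_assert(std::is_same<FnA, FnB>::value, "same function-pointer type");
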
diff --git a/src/cpu/kernels/CpuQuantizeKernel.cpp b/src/cpu/kernels/CpuQuantizeKernel.cpp
index 9700c62..5dde680 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.cpp
+++ b/src/cpu/kernels/CpuQuantizeKernel.cpp
@@ -28,13 +28,13 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
+#include "src/core/CPP/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/CPP/Validate.h"
 
 #include <arm_neon.h>
 #include <map>
@@ -53,9 +53,11 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QSYMM8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::QASYMM16);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
 
     return Status{};
@@ -71,19 +73,15 @@
 template <>
 inline float32x4x4_t load_value(const float *input_ptr)
 {
-    return { wrapper::vloadq(input_ptr),
-             wrapper::vloadq(input_ptr + 4),
-             wrapper::vloadq(input_ptr + 8),
-             wrapper::vloadq(input_ptr + 12) };
+    return {wrapper::vloadq(input_ptr), wrapper::vloadq(input_ptr + 4), wrapper::vloadq(input_ptr + 8),
+            wrapper::vloadq(input_ptr + 12)};
 }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 template <>
 inline float32x4x4_t load_value(const float16_t *input_ptr)
 {
-    return { vcvt_f32_f16(wrapper::vload(input_ptr)),
-             vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
-             vcvt_f32_f16(wrapper::vload(input_ptr + 8)),
-             vcvt_f32_f16(wrapper::vload(input_ptr + 12)) };
+    return {vcvt_f32_f16(wrapper::vload(input_ptr)), vcvt_f32_f16(wrapper::vload(input_ptr + 4)),
+            vcvt_f32_f16(wrapper::vload(input_ptr + 8)), vcvt_f32_f16(wrapper::vload(input_ptr + 12))};
 }
 
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
@@ -113,26 +111,25 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
 
-    static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map =
-    {
-        { "op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t> },
-        { "op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t> },
-        { "op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t> },
+    static const std::map<std::string, QuantizeFunctionExecutorPtr> quant_map = {
+        {"op_QASYMM8_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, uint8_t>},
+        {"op_QASYMM8_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<uint8_t, int8_t>},
+        {"op_QASYMM8_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<uint8_t>},
 
-        { "op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t> },
-        { "op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t> },
-        { "op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t> },
+        {"op_QASYMM8_SIGNED_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, uint8_t>},
+        {"op_QASYMM8_SIGNED_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<int8_t, int8_t>},
+        {"op_QASYMM8_SIGNED_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<int8_t>},
 
-        { "op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t> },
+        {"op_F32_QSYMM8", &CpuQuantizeKernel::run_quantize_qsymm8<float, int8_t>},
 
-        { "op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t> },
-        { "op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t> },
-        { "op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float> },
+        {"op_F32_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float, uint8_t>},
+        {"op_F32_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float, int8_t>},
+        {"op_F32_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float>},
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-        { "op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t> },
-        { "op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t> },
-        { "op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t> },
+        {"op_F16_QASYMM8", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, uint8_t>},
+        {"op_F16_QASYMM8_SIGNED", &CpuQuantizeKernel::run_quantize_qasymm8<float16_t, int8_t>},
+        {"op_F16_QASYMM16", &CpuQuantizeKernel::run_quantize_qasymm16<float16_t>},
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
     };
 
@@ -142,7 +139,7 @@
 
     auto it = quant_map.find(function_to_call);
 
-    if(it == quant_map.end())
+    if (it == quant_map.end())
     {
         ARM_COMPUTE_ERROR("Unsupported combination of input and output data types");
     }
@@ -167,7 +164,7 @@
 
     const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
     UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
-    if(is_data_type_quantized_asymmetric(src->info()->data_type()))
+    if (is_data_type_quantized_asymmetric(src->info()->data_type()))
     {
         uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
     }
@@ -177,22 +174,24 @@
 
     Iterator input(src, win_collapsed);
     Iterator output(dst, win_collapsed);
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
-        auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step); x += window_step)
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
-        }
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info());
-        }
-    },
-    input, output);
+            auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
+            auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+            int  x          = window_start_x;
+            for (; x <= (window_end_x - window_step); x += window_step)
+            {
+                wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+            }
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                output_ptr[x] = quantize_qsymm8(input_ptr[x], dst->info()->quantization_info());
+            }
+        },
+        input, output);
 }
 
 template <typename TIn, typename TOut>
@@ -203,7 +202,7 @@
 
     const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
     UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
-    if(is_data_type_quantized_asymmetric(src->info()->data_type()))
+    if (is_data_type_quantized_asymmetric(src->info()->data_type()))
     {
         uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
     }
@@ -219,23 +218,25 @@
 
     Iterator input(src, win_collapsed);
     Iterator output(dst, win_collapsed);
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
-        auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
+        {
+            auto input_ptr  = reinterpret_cast<const TIn *>(input.ptr());
+            auto output_ptr = reinterpret_cast<TOut *>(output.ptr());
 
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step); x += window_step)
-        {
-            wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
-        }
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
-        }
-    },
-    input, output);
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step); x += window_step)
+            {
+                wrapper::vstore(&output_ptr[x], vquantize_qasymm8<TOut>(load_value(&input_ptr[x]), uqinfo));
+            }
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                output_ptr[x] = Qasymm8QuantizationHelper<TOut>::quantize(input_ptr[x], uqinfo, rounding_policy);
+            }
+        },
+        input, output);
 }
 
 template <typename T>
@@ -246,7 +247,7 @@
 
     const UniformQuantizationInfo uqinfo_in = src->info()->quantization_info().uniform();
     UniformQuantizationInfo       uqinfo    = dst->info()->quantization_info().uniform();
-    if(is_data_type_quantized_asymmetric(src->info()->data_type()))
+    if (is_data_type_quantized_asymmetric(src->info()->data_type()))
     {
         uqinfo = compute_requantization_scale_offset(uqinfo_in, uqinfo);
     }
@@ -262,25 +263,27 @@
 
     Iterator input(src, win_collapsed);
     Iterator output(dst, win_collapsed);
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
-        auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
+        {
+            auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+            auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());
 
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step); x += window_step)
-        {
-            uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
-            vst1q_u16(&output_ptr[x], tmp.val[0]);
-            vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
-        }
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
-        }
-    },
-    input, output);
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step); x += window_step)
+            {
+                uint16x8x2_t tmp = vquantize_qasymm16(load_value(&input_ptr[x]), uqinfo);
+                vst1q_u16(&output_ptr[x], tmp.val[0]);
+                vst1q_u16(&output_ptr[x + 8], tmp.val[1]);
+            }
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                output_ptr[x] = quantize_qasymm16(input_ptr[x], uqinfo, rounding_policy);
+            }
+        },
+        input, output);
 }
 
 void CpuQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
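The CpuQuantizeKernel.cpp hunks above show the recurring themes of this reformat: braced initializer lists lose their inner padding ({"op_QASYMM8_QASYMM8", ...} instead of { "op_QASYMM8_QASYMM8", ... }), control statements gain a space before the parenthesis (if (...), for (...), switch (...)), and long macro invocations wrap at a 120-column limit with continuation arguments aligned under the first. A hypothetical .clang-format fragment consistent with these reflows — the keys are real clang-format options, but the values are assumptions rather than the delivered configuration:

    Language:                     Cpp
    ColumnLimit:                  120               # long ARM_COMPUTE_* macro calls wrap here
    SpaceBeforeParens:            ControlStatements # 'if (', 'for (', 'switch ('
    Cpp11BracedListStyle:         true              # no spaces just inside braced lists
    PointerAlignment:             Right             # 'ITensor *dst', 'const ITensorInfo *src'
    AlignConsecutiveDeclarations: Consecutive       # 'void        run_op' aligns with 'const char *name'
    BinPackParameters:            false             # one parameter per line once a signature wraps
    SortIncludes:                 CaseInsensitive   # 'helpers/' sorts before 'NEON/'

The include reordering in CpuReshapeKernel.cpp below (src/core/helpers/... moving ahead of src/core/NEON/...) is what a case-insensitive include sort produces.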
diff --git a/src/cpu/kernels/CpuQuantizeKernel.h b/src/cpu/kernels/CpuQuantizeKernel.h
index 2bc8105..d671413 100644
--- a/src/cpu/kernels/CpuQuantizeKernel.h
+++ b/src/cpu/kernels/CpuQuantizeKernel.h
@@ -59,7 +59,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
 private:
@@ -67,7 +67,9 @@
      *
      * @param[in] window Region on which to execute the kernel.
      */
-    using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
+    using QuantizeFunctionExecutorPtr = void (CpuQuantizeKernel::*)(const ITensor *src,
+                                                                    ITensor       *dst,
+                                                                    const Window  &window);
     /** Function to apply QASYMM8 or QASYMM8_SIGNED quantization on a tensor.
      *
      * @param[in] window Region on which to execute the kernel.
@@ -84,7 +86,7 @@
     template <typename TIn, typename TOut>
     void run_quantize_qsymm8(const ITensor *src, ITensor *dst, const Window &window);
 
-    QuantizeFunctionExecutorPtr _func{ nullptr };
+    QuantizeFunctionExecutorPtr _func{nullptr};
 };
 } // namespace kernels
 } // namespace cpu
diff --git a/src/cpu/kernels/CpuReshapeKernel.cpp b/src/cpu/kernels/CpuReshapeKernel.cpp
index a9672a8..241e58f 100644
--- a/src/cpu/kernels/CpuReshapeKernel.cpp
+++ b/src/cpu/kernels/CpuReshapeKernel.cpp
@@ -29,9 +29,11 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/NEON/INEKernel.h"
+
 #include "src/core/helpers/Utils.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/INEKernel.h"
+
 #include <cstdint>
 
 /** [NEReshapeLayerKernel Kernel] **/
@@ -49,7 +51,7 @@
     // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
     ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
 
-    if(dst->tensor_shape().total_size() != 0)
+    if (dst->tensor_shape().total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
@@ -59,29 +61,30 @@
     return Status{};
 }
 
-
 template <typename T>
 void reshape_tensor_per_element(const Window &window, const ITensor *src, ITensor *dst)
 {
     const TensorShape &src_shape = src->info()->tensor_shape();
     const TensorShape &dst_shape = dst->info()->tensor_shape();
 
-    Iterator    dst_it(dst, window);
+    Iterator dst_it(dst, window);
 
-    execute_window_loop(window, [&](const Coordinates & dst_coord)
-    {
-        Coordinates src_coord = index2coords(src_shape, coords2index(dst_shape, dst_coord));
-        const auto output_ptr = dst->ptr_to_element(dst_coord);
-        const auto input_ptr  = src->ptr_to_element(src_coord);
+    execute_window_loop(
+        window,
+        [&](const Coordinates &dst_coord)
+        {
+            Coordinates src_coord  = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+            const auto  output_ptr = dst->ptr_to_element(dst_coord);
+            const auto  input_ptr  = src->ptr_to_element(src_coord);
 
-        *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr);
-    },
-    dst_it);
+            *reinterpret_cast<T *>(output_ptr) = *reinterpret_cast<T *>(input_ptr);
+        },
+        dst_it);
 }
 
-void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst )
+void reshape_tensor_per_element_selector(const Window &window, const ITensor *src, ITensor *dst)
 {
-    switch(src->info()->data_type())
+    switch (src->info()->data_type())
     {
         case DataType::U8:
         case DataType::S8:
@@ -131,22 +134,24 @@
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     Iterator dst_it(dst, win);
-    execute_window_loop(win, [&]( Coordinates & id)
-    {
-        dst_coord = id;
-
-        for(int x = window_start_x; x < window_end_x; x += src_row_size)
+    execute_window_loop(
+        win,
+        [&](Coordinates &id)
         {
-            src_coord  = index2coords(src_shape, coords2index(dst_shape, dst_coord));
-            output_ptr = dst->ptr_to_element(dst_coord);
-            input_ptr  = src->ptr_to_element(src_coord);
+            dst_coord = id;
 
-            std::memcpy(output_ptr, input_ptr, row_size_in_bytes);
+            for (int x = window_start_x; x < window_end_x; x += src_row_size)
+            {
+                src_coord  = index2coords(src_shape, coords2index(dst_shape, dst_coord));
+                output_ptr = dst->ptr_to_element(dst_coord);
+                input_ptr  = src->ptr_to_element(src_coord);
 
-            dst_coord.increment(Window::DimX, src_row_size);
-        }
-    },
-    dst_it);
+                std::memcpy(output_ptr, input_ptr, row_size_in_bytes);
+
+                dst_coord.increment(Window::DimX, src_row_size);
+            }
+        },
+        dst_it);
 }
 
 void reshape_tensor_per_window(const Window &window, const ITensor *src, ITensor *dst)
@@ -213,8 +218,8 @@
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
 
-    const ITensorInfo* src_info = src->info();
-    const ITensorInfo* dst_info = dst->info();
+    const ITensorInfo *src_info = src->info();
+    const ITensorInfo *dst_info = dst->info();
 
     // Calculate kernel window based on the padding info
     Window win;
@@ -226,7 +231,7 @@
     const auto src_row_size       = static_cast<int>(src_info->tensor_shape()[0]);
     const auto dst_row_size       = static_cast<int>(dst_info->tensor_shape()[0]);
 
-    if(!src_has_holes && !dst_has_holes)
+    if (!src_has_holes && !dst_has_holes)
     {
         std::tie(win, _split_dimension) = calculate_squashed_or_max_window(*dst_info);
         /*
diff --git a/src/cpu/kernels/CpuReshapeKernel.h b/src/cpu/kernels/CpuReshapeKernel.h
index eddbbf7..ce566fd 100644
--- a/src/cpu/kernels/CpuReshapeKernel.h
+++ b/src/cpu/kernels/CpuReshapeKernel.h
@@ -55,7 +55,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     /** Prepare the reshape kernel for execution (Only executed once) by calculating max or squashed window and selecting the _reshape_tensor_fn based on the presence of holes
@@ -84,10 +84,9 @@
     }
 
 private:
-    size_t               _split_dimension{ Window::DimY };
+    size_t _split_dimension{Window::DimY};
 
-    std::function<void(const Window &window, const ITensor *src, ITensor *dst )>  _reshape_tensor_fn{};
-
+    std::function<void(const Window &window, const ITensor *src, ITensor *dst)> _reshape_tensor_fn{};
 };
 } // namespace kernels
 } // namespace cpu
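In CpuScaleKernel.cpp below, most of the churn comes from lambda reflow: the selector lambdas in available_kernels and the execute_window_loop bodies move to signature-relative indentation, short selectors collapse onto a single line, and the opening brace of a multi-line lambda lands on its own line. Continuing the hypothetical fragment above (again, the option names are real clang-format keys; the values are assumptions, not the delivered configuration):

    LambdaBodyIndentation:          Signature  # indent lambda bodies relative to the signature
    AllowShortLambdasOnASingleLine: All        # e.g. '... { return data.dt == DataType::F32; }'
    BreakBeforeBraces:              Custom
    BraceWrapping:                  # further keys would be needed for the function/control braces
      BeforeLambdaBody:             true       # '{' of a multi-line lambda on its own line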
diff --git a/src/cpu/kernels/CpuScaleKernel.cpp b/src/cpu/kernels/CpuScaleKernel.cpp
index 3323045..702e0a8 100644
--- a/src/cpu/kernels/CpuScaleKernel.cpp
+++ b/src/cpu/kernels/CpuScaleKernel.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/InterpolationPolicyUtils.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/common/Registrars.h"
 #include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -44,104 +45,74 @@
 {
 namespace
 {
-static const std::vector<CpuScaleKernel::ScaleKernel> available_kernels =
-{
-    {
-        "sve_fp16_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 && data.interpolation_policy != InterpolationPolicy::BILINEAR;
-        },
-        REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)
-    },
-    {
-        "sve_fp32_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
-        },
-        REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)
-    },
-    {
-        "sve_qu8_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::QASYMM8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
-        },
-        REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)
-    },
-    {
-        "sve_qs8_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
-        },
-        REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)
-    },
-    {
-        "sve_u8_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
-        },
-        REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)
-    },
-    {
-        "sve_s16_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data)
-        {
-            return data.dt == DataType::S16 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR;
-        },
-        REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)
-    },
-    {
-        "neon_fp16_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::F16 && data.isa.fp16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale<float16_t>)
-    },
-    {
-        "neon_fp32_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::F32; },
-        REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>)
-    },
-    {
-        "neon_qu8_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8; },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)
-    },
-    {
-        "neon_qs8_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::QASYMM8_SIGNED; },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)
-    },
-    {
-        "neon_u8_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::U8; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)
-    },
-    {
-        "neon_s8_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S8; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale)
-    },
-    {
-        "neon_s16_scale",
-        [](const ScaleKernelDataTypeISASelectorData & data) { return data.dt == DataType::S16; },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale)
-    },
+static const std::vector<CpuScaleKernel::ScaleKernel> available_kernels = {
+    {"sve_fp16_scale",
+     [](const ScaleKernelDataTypeISASelectorData &data)
+     {
+         return data.dt == DataType::F16 && data.isa.sve && data.isa.fp16 &&
+                data.interpolation_policy != InterpolationPolicy::BILINEAR;
+     },
+     REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_scale)},
+    {"sve_fp32_scale",
+     [](const ScaleKernelDataTypeISASelectorData &data)
+     { return data.dt == DataType::F32 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; },
+     REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_scale)},
+    {"sve_qu8_scale",
+     [](const ScaleKernelDataTypeISASelectorData &data) {
+         return data.dt == DataType::QASYMM8 && data.isa.sve &&
+                data.interpolation_policy != InterpolationPolicy::BILINEAR;
+     },
+     REGISTER_QASYMM8_SVE(arm_compute::cpu::qasymm8_sve_scale)},
+    {"sve_qs8_scale",
+     [](const ScaleKernelDataTypeISASelectorData &data)
+     {
+         return data.dt == DataType::QASYMM8_SIGNED && data.isa.sve &&
+                data.interpolation_policy != InterpolationPolicy::BILINEAR;
+     },
+     REGISTER_QASYMM8_SIGNED_SVE(arm_compute::cpu::qasymm8_signed_sve_scale)},
+    {"sve_u8_scale",
+     [](const ScaleKernelDataTypeISASelectorData &data)
+     { return data.dt == DataType::U8 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; },
+     REGISTER_INTEGER_SVE(arm_compute::cpu::u8_sve_scale)},
+    {"sve_s16_scale",
+     [](const ScaleKernelDataTypeISASelectorData &data)
+     { return data.dt == DataType::S16 && data.isa.sve && data.interpolation_policy != InterpolationPolicy::BILINEAR; },
+     REGISTER_INTEGER_SVE(arm_compute::cpu::s16_sve_scale)},
+    {"neon_fp16_scale",
+     [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F16 && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::common_neon_scale<float16_t>)},
+    {"neon_fp32_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::F32; },
+     REGISTER_FP32_NEON(arm_compute::cpu::common_neon_scale<float>)},
+    {"neon_qu8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8; },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::qasymm8_neon_scale)},
+    {"neon_qs8_scale",
+     [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::QASYMM8_SIGNED; },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::qasymm8_signed_neon_scale)},
+    {"neon_u8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::U8; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::u8_neon_scale)},
+    {"neon_s8_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S8; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::s8_neon_scale)},
+    {"neon_s16_scale", [](const ScaleKernelDataTypeISASelectorData &data) { return data.dt == DataType::S16; },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::s16_neon_scale)},
 };
 
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy,
-                          const ITensorInfo *offsets, ITensorInfo *dst, const ScaleKernelInfo &info)
+Status validate_arguments(const ITensorInfo     *src,
+                          const ITensorInfo     *dx,
+                          const ITensorInfo     *dy,
+                          const ITensorInfo     *offsets,
+                          ITensorInfo           *dst,
+                          const ScaleKernelInfo &info)
 {
-    const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy });
+    const auto *uk = CpuScaleKernel::get_implementation(
+        ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy});
 
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(dst == src);
-    ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels()!=1);
-    ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER &&
+                                info.sampling_policy != SamplingPolicy::TOP_LEFT);
     ARM_COMPUTE_UNUSED(info.constant_border_value);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.use_padding, "Padding is not supported");
 
@@ -153,27 +124,30 @@
     ARM_COMPUTE_RETURN_ERROR_ON(output_width == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(output_height == 0);
 
-    ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) && (data_layout != DataLayout::NHWC || info.interpolation_policy != InterpolationPolicy::BILINEAR
-                                                                       || info.border_mode != BorderMode::REPLICATE));
+    ARM_COMPUTE_RETURN_ERROR_ON((src->data_type() == DataType::S8) &&
+                                (data_layout != DataLayout::NHWC ||
+                                 info.interpolation_policy != InterpolationPolicy::BILINEAR ||
+                                 info.border_mode != BorderMode::REPLICATE));
 
-    if(info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr)
+    if (info.interpolation_policy == InterpolationPolicy::NEAREST_NEIGHBOR && offsets != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
     }
 
-    if(info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr)
+    if (info.interpolation_policy == InterpolationPolicy::BILINEAR && offsets != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32);
-        if(dx != nullptr && dy != nullptr)
+        if (dx != nullptr && dy != nullptr)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32);
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32);
         }
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
+    ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners &&
+                                !scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
 
-    if(info.interpolation_policy == InterpolationPolicy::AREA)
+    if (info.interpolation_policy == InterpolationPolicy::AREA)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(data_layout != DataLayout::NCHW);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8);
@@ -183,24 +157,28 @@
 }
 } // namespace
 
-void CpuScaleKernel::configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets,
-                               ITensorInfo *dst, const ScaleKernelInfo &info)
+void CpuScaleKernel::configure(const ITensorInfo     *src,
+                               const ITensorInfo     *dx,
+                               const ITensorInfo     *dy,
+                               const ITensorInfo     *offsets,
+                               ITensorInfo           *dst,
+                               const ScaleKernelInfo &info)
 {
     ARM_COMPUTE_UNUSED(dx, dy, offsets);
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
-                                                  dx,
-                                                  dy,
-                                                  offsets,
-                                                  dst,
-                                                  info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dx, dy, offsets, dst, info));
 
-    const auto *uk = CpuScaleKernel::get_implementation(ScaleKernelDataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy });
+    const auto *uk = CpuScaleKernel::get_implementation(
+        ScaleKernelDataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa(), info.interpolation_policy});
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
 
     _run_method = uk->ukernel;
-    _name       = std::string("CpuScaleKernel").append("/").append(uk->name).append("_").append(string_from_interpolation_policy(info.interpolation_policy));
+    _name       = std::string("CpuScaleKernel")
+                .append("/")
+                .append(uk->name)
+                .append("_")
+                .append(string_from_interpolation_policy(info.interpolation_policy));
 
     // Get data layout and width/height indices
     _data_layout         = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
@@ -212,19 +190,22 @@
     _constant_border_value = info.constant_border_value;
     _align_corners         = info.align_corners;
 
-    if(info.sampling_policy == SamplingPolicy::CENTER)
+    if (info.sampling_policy == SamplingPolicy::CENTER)
     {
         _sampling_offset = 0.5f;
     }
 
     // Compute the ratio between source width/height and destination width/height
-    const auto wr = scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners);
-    const auto hr = scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners);
+    const auto wr =
+        scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), _align_corners);
+    const auto hr =
+        scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), _align_corners);
 
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
-    _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : _policy;
+    _policy = (_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR
+                                                                               : _policy;
 
-    if(_border_mode == BorderMode::UNDEFINED)
+    if (_border_mode == BorderMode::UNDEFINED)
     {
         _border_mode           = BorderMode::CONSTANT;
         _constant_border_value = PixelValue();
@@ -232,39 +213,38 @@
 
 #ifdef ENABLE_NCHW_KERNELS
     // Configure scale function to run
-    if(_data_layout == DataLayout::NCHW)
+    if (_data_layout == DataLayout::NCHW)
     {
         std::string function_to_call("scale_");
         function_to_call += string_from_data_type(src->data_type()) + "_";
         function_to_call += string_from_data_layout(_data_layout) + "_";
         function_to_call += string_from_interpolation_policy(_policy);
 
-        static std::map<std::string, ScaleFunctionPtr> map_function =
-        {
-            { "scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8 },
+        static std::map<std::string, ScaleFunctionPtr> map_function = {
+            {"scale_U8_NCHW_AREA_CONSTANT", &CpuScaleKernel::scale_area_nchw_u8},
 
-            { "scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<uint8_t> },
-            { "scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> },
+            {"scale_U8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<uint8_t>},
+            {"scale_U8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t>},
 
-            { "scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<uint8_t> },
-            { "scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t> },
+            {"scale_QASYMM8_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<uint8_t>},
+            {"scale_QASYMM8_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<uint8_t>},
 
-            { "scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<int8_t> },
-            { "scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int8_t> },
+            {"scale_QASYMM8_SIGNED_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_qasymm<int8_t>},
+            {"scale_QASYMM8_SIGNED_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int8_t>},
 
-            { "scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<int16_t> },
-            { "scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int16_t> },
+            {"scale_S16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<int16_t>},
+            {"scale_S16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<int16_t>},
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-            { "scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float16_t> },
-            { "scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float16_t> },
+            {"scale_F16_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float16_t>},
+            {"scale_F16_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float16_t>},
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
-            { "scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float> },
-            { "scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float> },
+            {"scale_F32_NCHW_BILINEAR", &CpuScaleKernel::scale_bilinear_nchw<float>},
+            {"scale_F32_NCHW_NEAREST_NEIGHBOUR", &CpuScaleKernel::scale_nearest_nchw<float>},
         };
         auto it = map_function.find(function_to_call);
-        if(it != map_function.end())
+        if (it != map_function.end())
         {
             _func = it->second;
         }
@@ -278,13 +258,19 @@
 
 #ifdef ENABLE_NCHW_KERNELS
 template <typename T>
-void CpuScaleKernel::scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
+void CpuScaleKernel::scale_nearest_nchw(const ITensor *src,
+                                        ITensor       *dst,
+                                        const ITensor *dx,
+                                        const ITensor *dy,
+                                        const ITensor *offsets,
+                                        const Window  &window)
 {
     ARM_COMPUTE_UNUSED(dx, dy);
     const size_t in_stride_x = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
 
     // Compute the ratio between source height and destination height
-    const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
+    const auto hr =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
 
     // Don't increment in X and Y direction for the input tensor
     // A pointer to the start of this plane is needed as base for the precomputed offsets
@@ -296,7 +282,7 @@
     Window win_off;
     win_off.set(Window::DimX, window[Window::DimX]);
     win_off.set(Window::DimY, window[Window::DimY]);
-    for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+    for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
     {
         win_off.set(d, Window::Dimension(0, 0, 0));
     }
@@ -305,24 +291,33 @@
     Iterator src_i(src, win_in);
     Iterator dst_i(dst, window);
     Iterator offsets_i(offsets, win_off);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets_i.ptr());
-        const auto in_yi       = static_cast<int32_t>(_align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr) : std::floor((
-                                                          id.y() + _sampling_offset)
-                                                      * hr));
-        const int32_t offset_row            = in_yi * in_stride_x;
-        *reinterpret_cast<T *>(dst_i.ptr()) = *(reinterpret_cast<const T *>(src_i.ptr()) + offsets_ptr[0] + offset_row);
-    },
-    src_i, offsets_i, dst_i);
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
+            const auto offsets_ptr = reinterpret_cast<const int32_t *>(offsets_i.ptr());
+            const auto in_yi       = static_cast<int32_t>(
+                _align_corners ? utils::rounding::round_half_away_from_zero((id.y() + _sampling_offset) * hr)
+                                     : std::floor((id.y() + _sampling_offset) * hr));
+            const int32_t offset_row = in_yi * in_stride_x;
+            *reinterpret_cast<T *>(dst_i.ptr()) =
+                *(reinterpret_cast<const T *>(src_i.ptr()) + offsets_ptr[0] + offset_row);
+        },
+        src_i, offsets_i, dst_i);
 }
 
 template <typename T>
-void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
+void CpuScaleKernel::scale_bilinear_nchw(const ITensor *src,
+                                         ITensor       *dst,
+                                         const ITensor *dx,
+                                         const ITensor *dy,
+                                         const ITensor *offsets,
+                                         const Window  &window)
 {
     // Compute the ratio between source height and destination height
-    const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
-    Window     win_off;
+    const auto hr =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
+    Window win_off;
     win_off.set(Window::DimX, window.x());
     win_off.set(Window::DimY, window.y());
 
@@ -332,7 +327,7 @@
     win_in.set(Window::DimX, Window::Dimension(0, 0, 0));
     win_in.set(Window::DimY, Window::Dimension(0, 0, 0));
 
-    for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+    for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
     {
         win_off.set(d, Window::Dimension(0, 0, 0));
     }
@@ -347,7 +342,7 @@
     const int32_t in_dim_h    = src->info()->dimension(1);
     const int32_t in_stride_w = in_dim_w + src->info()->padding().left + src->info()->padding().right;
 
-    if(_border_mode == BorderMode::CONSTANT)
+    if (_border_mode == BorderMode::CONSTANT)
     {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
@@ -355,52 +350,60 @@
         using ConstType = T;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const int32_t index_h       = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
-            const auto    index_w       = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
-            const auto    dx_val        = *(reinterpret_cast<const float *>(dx_i.ptr()));
-            const auto    dy_val        = *(reinterpret_cast<const float *>(dy_i.ptr()));
-            const auto    pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const int32_t index_h       = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
+                const auto    index_w       = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
+                const auto    dx_val        = *(reinterpret_cast<const float *>(dx_i.ptr()));
+                const auto    dy_val        = *(reinterpret_cast<const float *>(dy_i.ptr()));
+                const auto    pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
 
-            const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + index_h * in_stride_w)) : const_border_value;
-            const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w)) : const_border_value;
-            const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h
-                              && index_h < in_dim_h - 1) ?
-                             (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w)) :
-                             const_border_value;
-            const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h
-                              && index_h < in_dim_h - 1) ?
-                             (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w)) :
-                             const_border_value;
+                const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h)
+                                     ? (*(pixel_row_ptr + index_w + index_h * in_stride_w))
+                                     : const_border_value;
+                const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h)
+                                     ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w))
+                                     : const_border_value;
+                const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1)
+                                     ? (*(pixel_row_ptr + index_w + index_h * in_stride_w + in_stride_w))
+                                     : const_border_value;
+                const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1)
+                                     ? (*(pixel_row_ptr + index_w + 1 + index_h * in_stride_w + in_stride_w))
+                                     : const_border_value;
 
-            *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
-        },
-        src_i, offsets_i, dx_i, dy_i, dst_i);
+                *reinterpret_cast<T *>(dst_i.ptr()) =
+                    static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+            },
+            src_i, offsets_i, dx_i, dy_i, dst_i);
     }
-    else if(_border_mode == BorderMode::REPLICATE)
+    else if (_border_mode == BorderMode::REPLICATE)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const int  index_h       = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
-            const auto index_w       = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
-            const auto dx_val        = *(reinterpret_cast<const float *>(dx_i.ptr()));
-            const auto dy_val        = *(reinterpret_cast<const float *>(dy_i.ptr()));
-            const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const int  index_h       = std::floor((id.y() + _sampling_offset) * hr - _sampling_offset);
+                const auto index_w       = *(reinterpret_cast<const int32_t *>(offsets_i.ptr()));
+                const auto dx_val        = *(reinterpret_cast<const float *>(dx_i.ptr()));
+                const auto dy_val        = *(reinterpret_cast<const float *>(dy_i.ptr()));
+                const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
 
-            auto clamped_x  = utility::clamp<int>(index_w, 0, in_dim_w - 1);
-            auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
-            auto clamped_y  = utility::clamp<int>(index_h, 0, in_dim_h - 1);
-            auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
+                auto clamped_x  = utility::clamp<int>(index_w, 0, in_dim_w - 1);
+                auto clamped_x1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
+                auto clamped_y  = utility::clamp<int>(index_h, 0, in_dim_h - 1);
+                auto clamped_y1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
 
-            const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w);
-            const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w);
-            const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w);
-            const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w);
+                const auto a00 = *(pixel_row_ptr + clamped_x + clamped_y * in_stride_w);
+                const auto a01 = *(pixel_row_ptr + clamped_x1 + clamped_y * in_stride_w);
+                const auto a10 = *(pixel_row_ptr + clamped_x + clamped_y1 * in_stride_w);
+                const auto a11 = *(pixel_row_ptr + clamped_x1 + clamped_y1 * in_stride_w);
 
-            *reinterpret_cast<T *>(dst_i.ptr()) = static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
-        },
-        src_i, offsets_i, dx_i, dy_i, dst_i);
+                *reinterpret_cast<T *>(dst_i.ptr()) =
+                    static_cast<T>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+            },
+            src_i, offsets_i, dx_i, dy_i, dst_i);
     }
     else
     {
@@ -408,7 +411,12 @@
     }
 }
 
-void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
+void CpuScaleKernel::scale_area_nchw_u8(const ITensor *src,
+                                        ITensor       *dst,
+                                        const ITensor *dx,
+                                        const ITensor *dy,
+                                        const ITensor *offsets,
+                                        const Window  &window)
 {
     ARM_COMPUTE_UNUSED(dx, dy, offsets);
     using namespace scale_helpers;
@@ -425,50 +433,60 @@
     Iterator src_i(src, win_in);
     Iterator dst_i(dst, window);
 
-    const auto   wr        = scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners);
-    const auto   hr        = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
+    const auto wr =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(0), dst->info()->dimension(0), _align_corners);
+    const auto hr =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), _align_corners);
     const auto   w         = src->info()->dimension(0);
     const auto   h         = src->info()->dimension(1);
     const size_t in_stride = src->info()->strides_in_bytes()[1];
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto in_ptr = reinterpret_cast<const uint8_t *>(src_i.ptr());
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
+            const auto in_ptr = reinterpret_cast<const uint8_t *>(src_i.ptr());
 
-        uint8x8_t tmp0 = vdup_n_u8(0);
-        tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0);
-        tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1);
-        tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2);
-        tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3);
-        tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4);
-        tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5);
-        tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6);
-        tmp0           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7);
+            uint8x8_t tmp0 = vdup_n_u8(0);
+            tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0);
+            tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1);
+            tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2);
+            tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3);
+            tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4);
+            tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5);
+            tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6);
+            tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7);
 
-        uint8x8_t tmp1 = vdup_n_u8(0);
-        tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0);
-        tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1);
-        tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2);
-        tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3);
-        tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4);
-        tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5);
-        tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6);
-        tmp1           = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7);
+            uint8x8_t tmp1 = vdup_n_u8(0);
+            tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0);
+            tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1);
+            tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2);
+            tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3);
+            tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4);
+            tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5);
+            tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6);
+            tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7);
 
-        vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1));
-    },
-    src_i, dst_i);
+            vst1q_u8(dst_i.ptr(), vcombine_u8(tmp0, tmp1));
+        },
+        src_i, dst_i);
 }
 
 template <typename T>
-void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window)
+void CpuScaleKernel::scale_bilinear_qasymm(const ITensor *src,
+                                           ITensor       *dst,
+                                           const ITensor *dx,
+                                           const ITensor *dy,
+                                           const ITensor *offsets,
+                                           const Window  &window)
 {
     // Get data layout and width/height indices
     const int idx_width  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
     const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
 
     // Compute the ratio between source height and destination height
-    const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), _align_corners);
+    const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height),
+                                                        dst->info()->dimension(idx_height), _align_corners);
     Window     win_off;
     win_off.set(Window::DimX, Window::Dimension(0, 0, 0));
     win_off.set(Window::DimY, Window::Dimension(0, 0, 0));
@@ -479,7 +497,7 @@
     win_in.set(idx_width, Window::Dimension(0, 0, 0));
     win_in.set(idx_height, Window::Dimension(0, 0, 0));
 
-    for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+    for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
     {
         win_off.set(d, Window::Dimension(0, 0, 0));
     }
@@ -495,7 +513,7 @@
     const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
     const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
 
-    if(_border_mode == BorderMode::CONSTANT)
+    if (_border_mode == BorderMode::CONSTANT)
     {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
@@ -503,62 +521,74 @@
         using ConstType = T;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         const T const_border_value = static_cast<T>(_constant_border_value.get<ConstType>());
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const int32_t index_h       = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
-            const int32_t index_w       = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
-            const auto    dx_val        = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
-            const auto    dy_val        = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
-            const auto    pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const int32_t index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
+                const int32_t index_w = *(reinterpret_cast<const int32_t *>(
+                    offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+                const auto    dx_val =
+                    *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+                const auto dy_val =
+                    *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+                const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
 
-            const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h) ?
-                             (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h)) :
-                             const_border_value;
-            const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h) ?
-                             (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h)) :
-                             const_border_value;
-            const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1) ?
-                             (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h)) :
-                             const_border_value;
-            const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1) ?
-                             (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h)) :
-                             const_border_value;
+                const auto a00 = (0 <= index_w && index_w < in_dim_w && 0 <= index_h && index_h < in_dim_h)
+                                     ? (*(pixel_row_ptr + index_w * stride_w + index_h * stride_h))
+                                     : const_border_value;
+                const auto a01 = (-1 <= index_w && index_w < in_dim_w - 1 && 0 <= index_h && index_h < in_dim_h)
+                                     ? (*(pixel_row_ptr + (index_w + 1) * stride_w + index_h * stride_h))
+                                     : const_border_value;
+                const auto a10 = (0 <= index_w && index_w < in_dim_w && -1 <= index_h && index_h < in_dim_h - 1)
+                                     ? (*(pixel_row_ptr + index_w * stride_w + (index_h + 1) * stride_h))
+                                     : const_border_value;
+                const auto a11 = (-1 <= index_w && index_w < in_dim_w - 1 && -1 <= index_h && index_h < in_dim_h - 1)
+                                     ? (*(pixel_row_ptr + (index_w + 1) * stride_w + (index_h + 1) * stride_h))
+                                     : const_border_value;
 
-            const float inp00                   = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
-            const float inp01                   = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
-            const float inp10                   = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
-            const float inp11                   = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
-            *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
-        },
-        src_i, dst_i);
+                const float inp00                   = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
+                const float inp01                   = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
+                const float inp10                   = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
+                const float inp11                   = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
+                *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(
+                    scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+            },
+            src_i, dst_i);
     }
-    else if(_border_mode == BorderMode::REPLICATE)
+    else if (_border_mode == BorderMode::REPLICATE)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const int     index_h       = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
-            const int32_t index_w       = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
-            const auto    dx_val        = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
-            const auto    dy_val        = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
-            const auto    pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const int     index_h = std::floor((id[idx_height] + _sampling_offset) * hr - _sampling_offset);
+                const int32_t index_w = *(reinterpret_cast<const int32_t *>(
+                    offsets->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+                const auto    dx_val =
+                    *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+                const auto dy_val =
+                    *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[idx_width], id[idx_height]))));
+                const auto pixel_row_ptr = reinterpret_cast<const T *>(src_i.ptr());
 
-            auto clamped_w  = utility::clamp<int>(index_w, 0, in_dim_w - 1);
-            auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
-            auto clamped_h  = utility::clamp<int>(index_h, 0, in_dim_h - 1);
-            auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
+                auto clamped_w  = utility::clamp<int>(index_w, 0, in_dim_w - 1);
+                auto clamped_w1 = utility::clamp<int>(index_w + 1, 0, in_dim_w - 1);
+                auto clamped_h  = utility::clamp<int>(index_h, 0, in_dim_h - 1);
+                auto clamped_h1 = utility::clamp<int>(index_h + 1, 0, in_dim_h - 1);
 
-            const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
-            const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
-            const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
-            const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
+                const auto a00 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h * stride_h);
+                const auto a01 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h * stride_h);
+                const auto a10 = *(pixel_row_ptr + clamped_w * stride_w + clamped_h1 * stride_h);
+                const auto a11 = *(pixel_row_ptr + clamped_w1 * stride_w + clamped_h1 * stride_h);
 
-            const float inp00                   = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
-            const float inp01                   = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
-            const float inp10                   = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
-            const float inp11                   = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
-            *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
-        },
-        src_i, dst_i);
+                const float inp00                   = Qasymm8QuantizationHelper<T>::dequantize(a00, iq_info);
+                const float inp01                   = Qasymm8QuantizationHelper<T>::dequantize(a01, iq_info);
+                const float inp10                   = Qasymm8QuantizationHelper<T>::dequantize(a10, iq_info);
+                const float inp11                   = Qasymm8QuantizationHelper<T>::dequantize(a11, iq_info);
+                *reinterpret_cast<T *>(dst_i.ptr()) = Qasymm8QuantizationHelper<T>::quantize(
+                    scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+            },
+            src_i, dst_i);
     }
     else
     {
@@ -567,8 +597,12 @@
 }
 #endif // ENABLE_NCHW_KERNELS
 
-Status CpuScaleKernel::validate(const ITensorInfo *input, const ITensorInfo *dx, const ITensorInfo *dy,
-                                const ITensorInfo *offsets, ITensorInfo *output, const ScaleKernelInfo &info)
+Status CpuScaleKernel::validate(const ITensorInfo     *input,
+                                const ITensorInfo     *dx,
+                                const ITensorInfo     *dy,
+                                const ITensorInfo     *offsets,
+                                ITensorInfo           *output,
+                                const ScaleKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, dx, dy, offsets, output, info));
     return Status{};
@@ -588,13 +622,14 @@
     const auto dy      = tensors.get_const_tensor(TensorType::ACL_INT_1);
     const auto offsets = tensors.get_const_tensor(TensorType::ACL_INT_2);
 
-    if(_data_layout == DataLayout::NCHW)
+    if (_data_layout == DataLayout::NCHW)
     {
         (this->*_func)(src, dst, dx, dy, offsets, window);
     }
     else
     {
-        _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset, _align_corners, window);
+        _run_method(src, dst, offsets, dx, dy, _policy, _border_mode, _constant_border_value, _sampling_offset,
+                    _align_corners, window);
     }
 }
 
diff --git a/src/cpu/kernels/CpuScaleKernel.h b/src/cpu/kernels/CpuScaleKernel.h
index 8102142..38142df 100644
--- a/src/cpu/kernels/CpuScaleKernel.h
+++ b/src/cpu/kernels/CpuScaleKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_SCALEKERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 
@@ -39,9 +40,19 @@
 {
 private:
     /** Scale function to use for the particular configuration */
-    using ScaleFunctionPtr = void (CpuScaleKernel::*)(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window);
-    using ScaleKernelPtr   = std::add_pointer<void(const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *,
-                                                   InterpolationPolicy, BorderMode, PixelValue, float, bool, const Window &)>::type;
+    using ScaleFunctionPtr = void (CpuScaleKernel::*)(
+        const ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const Window &window);
+    using ScaleKernelPtr = std::add_pointer<void(const ITensor *,
+                                                 ITensor *,
+                                                 const ITensor *,
+                                                 const ITensor *,
+                                                 const ITensor *,
+                                                 InterpolationPolicy,
+                                                 BorderMode,
+                                                 PixelValue,
+                                                 float,
+                                                 bool,
+                                                 const Window &)>::type;
 
 public:
     CpuScaleKernel() = default;
@@ -59,7 +70,11 @@
      * @param[out] dst     Destination tensor info. Data types supported: Same as @p src. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]  info    @ref ScaleKernelInfo to use for configuration
      */
-    void configure(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst,
+    void configure(const ITensorInfo     *src,
+                   const ITensorInfo     *dx,
+                   const ITensorInfo     *dy,
+                   const ITensorInfo     *offsets,
+                   ITensorInfo           *dst,
                    const ScaleKernelInfo &info);
     /** Static function to check if given info will lead to a valid configuration
      *
@@ -67,11 +82,15 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dx, const ITensorInfo *dy, const ITensorInfo *offsets, ITensorInfo *dst,
+    static Status validate(const ITensorInfo     *src,
+                           const ITensorInfo     *dx,
+                           const ITensorInfo     *dy,
+                           const ITensorInfo     *offsets,
+                           ITensorInfo           *dst,
                            const ScaleKernelInfo &info);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     struct ScaleKernel
@@ -89,28 +108,48 @@
      *
      *  @note Used only in the case of down-sampling.
      */
-    void scale_area_nchw_u8(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
+    void scale_area_nchw_u8(const ITensor *src,
+                            ITensor       *dst,
+                            const ITensor *dx,
+                            const ITensor *dy,
+                            const ITensor *offsets,
+                            const Window  &window);
 
     /** function to perform scale using bilinear interpolation on the given window */
     template <typename T>
-    void scale_bilinear_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
+    void scale_bilinear_nchw(const ITensor *src,
+                             ITensor       *dst,
+                             const ITensor *dx,
+                             const ITensor *dy,
+                             const ITensor *offsets,
+                             const Window  &window);
     /** function to perform scale using bilinear interpolation on the given window */
     template <typename T>
-    void scale_bilinear_qasymm(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
+    void scale_bilinear_qasymm(const ITensor *src,
+                               ITensor       *dst,
+                               const ITensor *dx,
+                               const ITensor *dy,
+                               const ITensor *offsets,
+                               const Window  &window);
 
     /** function to perform scale using nearest neighbour on the given window */
     template <typename T>
-    void scale_nearest_nchw(const ITensor *src, ITensor *dst, const ITensor *dx, const ITensor *dy, const ITensor *offsets, const Window &window);
+    void scale_nearest_nchw(const ITensor *src,
+                            ITensor       *dst,
+                            const ITensor *dx,
+                            const ITensor *dy,
+                            const ITensor *offsets,
+                            const Window  &window);
 #endif // ENABLE_NCHW_KERNELS
 
-    ScaleFunctionPtr    _func{ nullptr };
+    ScaleFunctionPtr    _func{nullptr};
     InterpolationPolicy _policy{};
     BorderMode          _border_mode{};
     PixelValue          _constant_border_value{};
-    float               _sampling_offset{ 0 };
-    bool                _align_corners{ false };
-    DataLayout          _data_layout{ DataLayout::UNKNOWN };
-    ScaleKernelPtr      _run_method{ nullptr };
+    float               _sampling_offset{0};
+    bool                _align_corners{false};
+    DataLayout          _data_layout{DataLayout::UNKNOWN};
+    ScaleKernelPtr      _run_method{nullptr};
     std::string         _name{};
 };
 } // namespace kernels
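A minimal, hypothetical sketch of clang-format options that would reproduce the transformations in the hunks above: the option names are real clang-format options, but the values are inferred from the output and are not the configuration actually used.

    # Hypothetical fragment, inferred from the hunks above; not the delivered file.
    Language:                       Cpp
    ColumnLimit:                    120               # long expressions wrap as shown above
    SpaceBeforeParens:              ControlStatements # 'if (', 'switch (', 'for ('
    Cpp11BracedListStyle:           true              # '{nullptr}' instead of '{ nullptr }'
    PointerAlignment:               Right             # 'const ITensorInfo *input'
    AlignConsecutiveAssignments:    Consecutive       # keeps '=' columns aligned
    AlignConsecutiveDeclarations:   Consecutive       # 'void        run_op(...)' alignment
    BinPackParameters:              false             # one parameter per line when wrapping
    AllowShortLambdasOnASingleLine: All               # selector lambdas stay on one line

The lambda re-indentation and argument rewrapping seen in the execute_window_loop calls follow mechanically from a hard column limit combined with unpacked parameter lists; the hunks imply no hand edits.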
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.cpp b/src/cpu/kernels/CpuSoftmaxKernel.cpp
index e06ab99..ce14435 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.cpp
+++ b/src/cpu/kernels/CpuSoftmaxKernel.cpp
@@ -30,11 +30,11 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
+
+#include "src/core/common/Registrars.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
-#include "src/core/common/Registrars.h"
 #include "src/cpu/kernels/softmax/list.h"
 
 namespace arm_compute
@@ -46,61 +46,44 @@
 namespace
 {
 /* Softmax Logits 1D Max - identifying the max value of 1D logits */
-static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits =
-{
-    {
-        "sve_fp32_logits_1d_max",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; },
-        REGISTER_FP32_SVE(sve_fp32_logits)
-    },
-    {
-        "sve_fp16_logits_1d_max",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
-        REGISTER_FP16_SVE(sve_fp16_logits)
-    },
-    {
-        "sve_qu8_logits_1d_max",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; },
-        REGISTER_QASYMM8_SVE(sve_qasymm8_logits)
-    },
-    {
-        "sve_qs8_logits_1d_max",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; },
-        REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)
-    },
-    {
-        "neon_fp32_logits_1d_max",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
-        REGISTER_FP32_NEON(neon_fp32_logits)
-    },
-    {
-        "neon_fp16_logits_1d_max",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; },
-        REGISTER_FP16_NEON(neon_fp16_logits)
-    },
-    {
-        "neon_qu8_logits_1d_max",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
-        REGISTER_QASYMM8_NEON(neon_qasymm8_logits)
-    },
-    {
-        "neon_qs8_logits_1d_max",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
-        REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)
-    },
+static const std::vector<CpuLogits1DMaxKernel::SoftmaxLogits1DMaxKernel> available_kernels_max_logits = {
+    {"sve_fp32_logits_1d_max",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
+     REGISTER_FP32_SVE(sve_fp32_logits)},
+    {"sve_fp16_logits_1d_max",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
+     REGISTER_FP16_SVE(sve_fp16_logits)},
+    {"sve_qu8_logits_1d_max",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve; },
+     REGISTER_QASYMM8_SVE(sve_qasymm8_logits)},
+    {"sve_qs8_logits_1d_max",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve; },
+     REGISTER_QASYMM8_SIGNED_SVE(sve_qasymm8_signed_logits)},
+    {"neon_fp32_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+     REGISTER_FP32_NEON(neon_fp32_logits)},
+    {"neon_fp16_logits_1d_max",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+     REGISTER_FP16_NEON(neon_fp16_logits)},
+    {"neon_qu8_logits_1d_max", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+     REGISTER_QASYMM8_NEON(neon_qasymm8_logits)},
+    {"neon_qs8_logits_1d_max",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+     REGISTER_QASYMM8_SIGNED_NEON(neon_qasymm8_singed_logits)},
 };
 
 Status validate_arguments_logits_1d_max(const ITensorInfo &input, const ITensorInfo &output)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
 
     // Validate in case of configured output
-    if(output.total_size() != 0)
+    if (output.total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input, &output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input, &output);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(), TensorShape(input.tensor_shape()).set(0, 1));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output.tensor_shape(),
+                                                           TensorShape(input.tensor_shape()).set(0, 1));
     }
 
     return Status{};
@@ -121,7 +104,7 @@
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
 
-    const auto *uk = get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+    const auto *uk = get_implementation(DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
     _run_method = uk->ukernel;
@@ -158,60 +141,46 @@
 }
 
 /* Softmax Logits 1D - computation using a pre-computed max (F32/F16 and quantized variants). */
-template <bool                                                                             IS_LOG>
-static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits =
-{
-    {
-        "sve2_qu8_softmax_logits_1d",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
-        REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)
-    },
-    {
-        "sve2_qs8_softmax_logits_1d",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
-        REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)
-    },
-    {
-        "sve_fp32_softmax_logits_1d",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32) && data.isa.sve; },
-        REGISTER_FP32_SVE(sve_fp32_softmax)
-    },
-    {
-        "sve_fp16_softmax_logits_1d",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
-        REGISTER_FP16_SVE(sve_fp16_softmax)
-    },
+template <bool IS_LOG>
+static const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> available_kernels_logits = {
+    {"sve2_qu8_softmax_logits_1d",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8) && data.isa.sve2; },
+     REGISTER_QASYMM8_SVE2(sve2_qasymm8_softmax)},
+    {"sve2_qs8_softmax_logits_1d",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED) && data.isa.sve2; },
+     REGISTER_QASYMM8_SIGNED_SVE2(sve2_qasymm8_signed_softmax)},
+    {"sve_fp32_softmax_logits_1d",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32) && data.isa.sve; },
+     REGISTER_FP32_SVE(sve_fp32_softmax)},
+    {"sve_fp16_softmax_logits_1d",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.sve && data.isa.fp16; },
+     REGISTER_FP16_SVE(sve_fp16_softmax)},
 
-    {
-        "neon_fp32_softmax_logits_1d",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
-        REGISTER_FP32_NEON(neon_fp32_softmax)
-    },
-    {
-        "neon_fp16_softmax_logits_1d",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; },
-        REGISTER_FP16_NEON(neon_fp16_softmax)
-    },
-    {
-        "neon_qu8_softmax_logits_1d",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)
-    },
-    {
-        "neon_qs8_softmax_logits_1d",
-        [](const DataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)
-    },
+    {"neon_fp32_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+     REGISTER_FP32_NEON(neon_fp32_softmax)},
+    {"neon_fp16_softmax_logits_1d",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+     REGISTER_FP16_NEON(neon_fp16_softmax)},
+    {"neon_qu8_softmax_logits_1d", [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::neon_qasymm8_softmax)},
+    {"neon_qs8_softmax_logits_1d",
+     [](const DataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::neon_qasymm8_signed_softmax)},
 };
 namespace
 {
-Status validate_arguments_logits_softmax(const ITensorInfo &src, const ITensorInfo &max,
-                                         const ITensorInfo &dst, const float beta, const ITensorInfo &tmp, bool is_log)
+Status validate_arguments_logits_softmax(const ITensorInfo &src,
+                                         const ITensorInfo &max,
+                                         const ITensorInfo &dst,
+                                         const float        beta,
+                                         const ITensorInfo &tmp,
+                                         bool               is_log)
 {
     ARM_COMPUTE_UNUSED(beta);
     // Check input
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
 
     const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
 
@@ -221,16 +190,18 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&src, &max);
 
     // Check output if configured
-    if(dst.total_size() != 0)
+    if (dst.total_size() != 0)
     {
-        const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log) : dst.quantization_info();
+        const QuantizationInfo output_quantization =
+            is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src.data_type(), is_log)
+                                    : dst.quantization_info();
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
         ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != output_quantization);
     }
 
     // Check tmp if configured
-    if(tmp.total_size() != 0)
+    if (tmp.total_size() != 0)
     {
         const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src.data_type();
         ARM_COMPUTE_RETURN_ERROR_ON(tmp.data_type() != tmp_data_type);
@@ -243,14 +214,16 @@
 }
 } // namespace
 
-template <bool                                                                       IS_LOG>
-const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> &CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels()
+template <bool IS_LOG>
+const std::vector<typename CpuLogits1DSoftmaxKernel<IS_LOG>::SoftmaxLogits1DKernel> &
+CpuLogits1DSoftmaxKernel<IS_LOG>::get_available_kernels()
 {
     return available_kernels_logits<IS_LOG>;
 }
 
 template <bool IS_LOG>
-void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
+void CpuLogits1DSoftmaxKernel<IS_LOG>::configure(
+    const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
@@ -259,17 +232,21 @@
     const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src->data_type());
 
     // Output auto initialization if not yet initialized
-    const QuantizationInfo output_quantization = is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG) : dst->quantization_info();
+    const QuantizationInfo output_quantization =
+        is_quantized_asymmetric ? arm_compute::get_softmax_output_quantization_info(src->data_type(), IS_LOG)
+                                : dst->quantization_info();
     auto_init_if_empty(*dst, TensorInfo(*src).set_quantization_info(output_quantization).reset_padding());
 
     // Tmp auto initialization if not yet initialized
     const DataType tmp_data_type = is_quantized_asymmetric ? DataType::F32 : src->data_type();
     auto_init_if_empty(*tmp, TensorInfo(*src).set_data_type(tmp_data_type).reset_padding());
 
-    const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+    const auto *uk = CpuLogits1DSoftmaxKernel<IS_LOG>::get_implementation(
+        DataTypeISASelectorData{src->data_type(), CPUInfo::get().get_isa()});
     ARM_COMPUTE_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
-    std::string kernel_name = IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
+    std::string kernel_name =
+        IS_LOG ? std::string("CpuLogits1DLogSoftmaxKernel") : std::string("CpuLogits1DSoftmaxKernel");
 
     _beta       = beta;
     _run_method = uk->ukernel;
@@ -282,8 +259,8 @@
 }
 
 template <bool IS_LOG>
-Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(const ITensorInfo *src, const ITensorInfo *max,
-                                                  const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
+Status CpuLogits1DSoftmaxKernel<IS_LOG>::validate(
+    const ITensorInfo *src, const ITensorInfo *max, const ITensorInfo *dst, const float beta, const ITensorInfo *tmp)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, max, dst, tmp);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_logits_softmax(*src, *max, *dst, beta, *tmp, IS_LOG));
@@ -305,7 +282,7 @@
     auto       tmp = tensors.get_tensor(TensorType::ACL_DST_1);
 
     const unsigned int num_elems_processed_per_iteration = src->info()->valid_region().shape.x();
-    const unsigned int tmp_size_for_thread               = tmp->info()->element_size() * num_elems_processed_per_iteration;
+    const unsigned int tmp_size_for_thread = tmp->info()->element_size() * num_elems_processed_per_iteration;
 
     ARM_COMPUTE_ERROR_ON(tmp->info()->total_size() < (info.num_threads * tmp_size_for_thread));
 
@@ -314,7 +291,7 @@
 }
 
 template <bool IS_LOG>
-const char    *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
+const char *CpuLogits1DSoftmaxKernel<IS_LOG>::name() const
 {
     return _name.c_str();
 }
diff --git a/src/cpu/kernels/CpuSoftmaxKernel.h b/src/cpu/kernels/CpuSoftmaxKernel.h
index 59f43bd..5d28817 100644
--- a/src/cpu/kernels/CpuSoftmaxKernel.h
+++ b/src/cpu/kernels/CpuSoftmaxKernel.h
@@ -57,7 +57,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     struct SoftmaxLogits1DMaxKernel
@@ -70,7 +70,7 @@
     static const std::vector<SoftmaxLogits1DMaxKernel> &get_available_kernels();
 
 private:
-    SoftmaxLogits1DMaxKernelPtr _run_method{ nullptr };
+    SoftmaxLogits1DMaxKernelPtr _run_method{nullptr};
     std::string                 _name{};
 };
 
@@ -79,7 +79,8 @@
 class CpuLogits1DSoftmaxKernel : public ICpuKernel<CpuLogits1DSoftmaxKernel<IS_LOG>>
 {
 private:
-    using SoftmaxLogits1DKernelPtr = std::add_pointer<void(const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
+    using SoftmaxLogits1DKernelPtr = std::add_pointer<void(
+        const ITensor *, const ITensor *, void *const, ITensor *, float, bool, const Window &)>::type;
 
 public:
     CpuLogits1DSoftmaxKernel() = default;
@@ -95,18 +96,22 @@
      *
      * @param      tmp    Auxiliary tensor info. Same shape as the input; F32 for quantized inputs, otherwise the same data type as the input.
      */
-    void configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp);
+    void
+    configure(const ITensorInfo *src, const ITensorInfo *max, ITensorInfo *dst, const float beta, ITensorInfo *tmp);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuLogits1DSoftmaxKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *max,
-                           const ITensorInfo *dst, const float beta, const ITensorInfo *tmp);
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *max,
+                           const ITensorInfo *dst,
+                           const float        beta,
+                           const ITensorInfo *tmp);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     struct SoftmaxLogits1DKernel
@@ -119,8 +124,8 @@
     static const std::vector<SoftmaxLogits1DKernel> &get_available_kernels();
 
 private:
-    float                    _beta{ 1.0f };
-    SoftmaxLogits1DKernelPtr _run_method{ nullptr };
+    float                    _beta{1.0f};
+    SoftmaxLogits1DKernelPtr _run_method{nullptr};
     std::string              _name{};
 };
 } // namespace kernels
diff --git a/src/cpu/kernels/CpuSubKernel.cpp b/src/cpu/kernels/CpuSubKernel.cpp
index 875d613..2b2c6f2 100644
--- a/src/cpu/kernels/CpuSubKernel.cpp
+++ b/src/cpu/kernels/CpuSubKernel.cpp
@@ -25,8 +25,9 @@
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CPP/Validate.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/add/generic/neon/impl.h"
@@ -51,70 +52,48 @@
 using CpuSubKernelDataTypeISASelectorData    = CpuAddKernelDataTypeISASelectorData;
 using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr;
 
-static const std::vector<CpuSubKernel::SubKernel> available_kernels =
-{
-    {
-        "neon_fp32_sub",
-        [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
-        REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)
-    },
-    {
-        "neon_fp16_sub",
-        [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::F16) && data.isa.fp16; },
-        REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>)
-    },
-    {
-        "neon_u8_sub",
-        [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::U8); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)
-    },
-    {
-        "neon_s16_sub",
-        [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S16); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)
-    },
-    {
-        "neon_s32_sub",
-        [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::S32); },
-        REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)
-    },
-    {
-        "neon_qu8_sub_fixedpoint",
-        [](const CpuSubKernelDataTypeISASelectorData & data) { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint)
-    },
-    {
-        "neon_qs8_sub_fixedpoint",
-        [](const CpuSubKernelDataTypeISASelectorData & data) { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint)
-    },
-    {
-        "neon_qu8_sub",
-        [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8); },
-        REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)
-    },
-    {
-        "neon_qs8_sub",
-        [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QASYMM8_SIGNED); },
-        REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)
-    },
-    {
-        "neon_qs16_sub",
-        [](const CpuSubKernelDataTypeISASelectorData & data) { return (data.dt == DataType::QSYMM16); },
-        REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)
-    },
+static const std::vector<CpuSubKernel::SubKernel> available_kernels = {
+    {"neon_fp32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F32); },
+     REGISTER_FP32_NEON(arm_compute::cpu::sub_same_neon<float>)},
+    {"neon_fp16_sub",
+     [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::F16) && data.isa.fp16; },
+     REGISTER_FP16_NEON(arm_compute::cpu::sub_same_neon<float16_t>)},
+    {"neon_u8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::U8); },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<uint8_t>)},
+    {"neon_s16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S16); },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int16_t>)},
+    {"neon_s32_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::S32); },
+     REGISTER_INTEGER_NEON(arm_compute::cpu::sub_same_neon<int32_t>)},
+    {"neon_qu8_sub_fixedpoint",
+     [](const CpuSubKernelDataTypeISASelectorData &data)
+     { return ((data.dt == DataType::QASYMM8) && data.can_use_fixedpoint); },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon_fixedpoint)},
+    {"neon_qs8_sub_fixedpoint",
+     [](const CpuSubKernelDataTypeISASelectorData &data)
+     { return ((data.dt == DataType::QASYMM8_SIGNED) && data.can_use_fixedpoint); },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon_fixedpoint)},
+    {"neon_qu8_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8); },
+     REGISTER_QASYMM8_NEON(arm_compute::cpu::sub_qasymm8_neon)},
+    {"neon_qs8_sub",
+     [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QASYMM8_SIGNED); },
+     REGISTER_QASYMM8_SIGNED_NEON(arm_compute::cpu::sub_qasymm8_signed_neon)},
+    {"neon_qs16_sub", [](const CpuSubKernelDataTypeISASelectorData &data) { return (data.dt == DataType::QSYMM16); },
+     REGISTER_QSYMM16_NEON(arm_compute::cpu::sub_qsymm16_neon)},
 };
 
-inline Status validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
+inline Status
+validate_arguments(const ITensorInfo &src0, const ITensorInfo &src1, const ITensorInfo &dst, ConvertPolicy policy)
 {
     ARM_COMPUTE_UNUSED(policy);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(&src0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16, DataType::S32, DataType::F16,
-                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src0, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::S16,
+                                                         DataType::S32, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &src1);
 
     const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(&src0, &src1, &dst);
-    const auto uk                 = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(CpuSubKernelDataTypeISASelectorData{ src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint });
+    const auto uk                 = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(
+        CpuSubKernelDataTypeISASelectorData{src0.data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
 
     ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
 
@@ -125,7 +104,7 @@
                                     "Convert policy cannot be WRAP if datatype is quantized");
 
     // Validate in case of configured dst
-    if(dst.total_size() > 0)
+    if (dst.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src0, &dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
@@ -147,7 +126,8 @@
     set_data_type_if_unknown(*dst, src0->data_type());
 
     const auto can_use_fixedpoint = sub_q8_neon_fixedpoint_possible(src0, src1, dst);
-    const auto uk                 = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(CpuSubKernelDataTypeISASelectorData{ src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint });
+    const auto uk                 = CpuSubKernel::get_implementation<CpuSubKernelDataTypeISASelectorData>(
+        CpuSubKernelDataTypeISASelectorData{src0->data_type(), CPUInfo::get().get_isa(), can_use_fixedpoint});
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(uk);
 
@@ -167,14 +147,14 @@
     ARM_COMPUTE_UNUSED(thread_count);
 
 #if defined(ENABLE_FP32_KERNELS)
-    if(this->_run_method == &sub_same_neon<float>)
+    if (this->_run_method == &sub_same_neon<float>)
     {
         size_t mws = ICPPKernel::default_mws;
-        if(platform.get_cpu_model() == CPUModel::N1)
+        if (platform.get_cpu_model() == CPUModel::N1)
         {
             mws = default_mws_N1_fp32_neon;
         }
-        else if(platform.get_cpu_model() == CPUModel::V1)
+        else if (platform.get_cpu_model() == CPUModel::V1)
         {
             mws = default_mws_V1_fp32_neon;
         }
@@ -184,7 +164,7 @@
         }
 
         // tensor is 1D or was re-interpreted as 1D
-        if(this->window().shape().num_dimensions() == 1)
+        if (this->window().shape().num_dimensions() == 1)
         {
             return mws;
         }
@@ -203,7 +183,8 @@
     return ICPPKernel::default_mws;
 }
 
-Status CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
+Status
+CpuSubKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src0, *src1, *dst, policy));
diff --git a/src/cpu/kernels/CpuSubKernel.h b/src/cpu/kernels/CpuSubKernel.h
index cd209d1..5fa0dc4 100644
--- a/src/cpu/kernels/CpuSubKernel.h
+++ b/src/cpu/kernels/CpuSubKernel.h
@@ -37,7 +37,8 @@
 class CpuSubKernel : public ICpuKernel<CpuSubKernel>
 {
 private:
-    using SubKernelPtr                           = std::add_pointer<void(const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
+    using SubKernelPtr                           = std::add_pointer<void(
+        const ITensor *, const ITensor *, ITensor *, const ConvertPolicy &, const Window &)>::type;
     using CpuSubKernelDataTypeISASelectorDataPtr = CpuAddKernelDataTypeISASelectorDataPtr;
 
 public:
@@ -68,7 +69,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
+    static Status
+    validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy);
 
     // Inherited methods overridden:
     void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -99,9 +101,9 @@
 
 private:
     ConvertPolicy _policy{};
-    SubKernelPtr  _run_method{ nullptr };
+    SubKernelPtr  _run_method{nullptr};
     std::string   _name{};
-    size_t        _split_dimension{ Window::DimY };
+    size_t        _split_dimension{Window::DimY};
 };
 } // namespace kernels
 } // namespace cpu
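The include reshuffling throughout these files (for example "src/core/common/Registrars.h" now sorting ahead of "src/core/CPP/Validate.h", and the blank line separating the "arm_compute/" block from the "src/" block) is consistent with case-insensitive include sorting plus regrouping. Again a hypothetical sketch, inferred from the observed ordering rather than taken from the delivered configuration:

    # Hypothetical fragment; inferred from the include reordering, not the delivered file.
    SortIncludes:  CaseInsensitive   # 'common' sorts before 'CPP' once case is ignored
    IncludeBlocks: Regroup           # merge adjacent blocks, then regroup by category
    IncludeCategories:
      - Regex:    '^"arm_compute/'   # public API headers first
        Priority: 1
      - Regex:    '^"src/'           # internal headers next
        Priority: 2

The same rule explains "arm_compute/core/utils/misc/ShapeCalculator.h" moving ahead of "arm_compute/core/Validate.h" in the next file ('u' precedes 'v' regardless of case).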
diff --git a/src/cpu/kernels/CpuTransposeKernel.cpp b/src/cpu/kernels/CpuTransposeKernel.cpp
index b2cebc4..615bc6c 100644
--- a/src/cpu/kernels/CpuTransposeKernel.cpp
+++ b/src/cpu/kernels/CpuTransposeKernel.cpp
@@ -28,8 +28,9 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -45,7 +46,7 @@
 {
 unsigned int num_elems_processed(size_t element_size)
 {
-    switch(element_size)
+    switch (element_size)
     {
         case 1:
             return 8;
@@ -81,10 +82,10 @@
 
     Window window_in(window);
     window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
-    if(left_over_loop_y)
+    if (left_over_loop_y)
     {
         // Check if window_end_y_multiple_of is greater than window_start_y
-        if(window_end_y_multiple_of > window_start_y)
+        if (window_end_y_multiple_of > window_start_y)
         {
             window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
         }
@@ -101,87 +102,121 @@
     Iterator output(out, window_out);
 
     // Run the SIMD path if and only if the input is not a row-vector
-    if(in->info()->dimension(1) != 1)
+    if (in->info()->dimension(1) != 1)
     {
         Iterator input(in, window_in);
-        execute_window_loop(window_in, [&](const Coordinates & id)
-        {
-            // Compute 8x8 elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            window_in,
+            [&](const Coordinates &id)
             {
-                const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes));
-                const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes));
-                const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes));
-                const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes));
-                const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes));
-                const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes));
-                const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes));
-                const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes));
+                // Compute 8x8 elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const uint8x8_t row0 =
+                        vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes));
+                    const uint8x8_t row1 =
+                        vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes));
+                    const uint8x8_t row2 =
+                        vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes));
+                    const uint8x8_t row3 =
+                        vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes));
+                    const uint8x8_t row4 =
+                        vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes));
+                    const uint8x8_t row5 =
+                        vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes));
+                    const uint8x8_t row6 =
+                        vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes));
+                    const uint8x8_t row7 =
+                        vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes));
 
-                // Transpose 2x2
-                const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
-                const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
-                const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
-                const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
+                    // Transpose 2x2
+                    const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
+                    const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
+                    const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
+                    const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
 
-                // Transpose 4x4
-                const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
-                const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
-                const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
-                const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
+                    // Transpose 4x4
+                    const uint16x4x2_t k0_u16 =
+                        vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
+                    const uint16x4x2_t k1_u16 =
+                        vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
+                    const uint16x4x2_t k2_u16 =
+                        vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
+                    const uint16x4x2_t k3_u16 =
+                        vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
 
-                // Transpose 8x8
-                const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
-                const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
-                const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
-                const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
+                    // Transpose 8x8
+                    const uint32x2x2_t k0_u32 =
+                        vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
+                    const uint32x2x2_t k1_u32 =
+                        vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
+                    const uint32x2x2_t k2_u32 =
+                        vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
+                    const uint32x2x2_t k3_u32 =
+                        vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
 
-                // Compute destination address
-                const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
+                    // Compute destination address
+                    const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
 
-                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
-                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
-                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
-                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
-                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
-                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
-                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
-                vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
-            }
+                    vst1_u8(
+                        reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+                        vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
+                    vst1_u8(
+                        reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+                        vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
+                    vst1_u8(
+                        reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+                        vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
+                    vst1_u8(
+                        reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+                        vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
+                    vst1_u8(
+                        reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes),
+                        vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
+                    vst1_u8(
+                        reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes),
+                        vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
+                    vst1_u8(
+                        reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes),
+                        vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
+                    vst1_u8(
+                        reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes),
+                        vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
+                }
 
-            // Compute left-over elements along the x dimension (1x8)
-            for(; x < window_end_x; ++x)
-            {
-                const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes);
-                const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes);
-                const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes);
-                const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes);
-                const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes);
-                const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes);
-                const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes);
-                const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes);
+                // Compute left-over elements along the x dimension (1x8)
+                for (; x < window_end_x; ++x)
+                {
+                    const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes);
+                    const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes);
+                    const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes);
+                    const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes);
+                    const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes);
+                    const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes);
+                    const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes);
+                    const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes);
 
-                uint8x8_t result = vdup_n_u8(0);
-                result           = vset_lane_u8(val0, result, 0);
-                result           = vset_lane_u8(val1, result, 1);
-                result           = vset_lane_u8(val2, result, 2);
-                result           = vset_lane_u8(val3, result, 3);
-                result           = vset_lane_u8(val4, result, 4);
-                result           = vset_lane_u8(val5, result, 5);
-                result           = vset_lane_u8(val6, result, 6);
-                result           = vset_lane_u8(val7, result, 7);
+                    uint8x8_t result = vdup_n_u8(0);
+                    result           = vset_lane_u8(val0, result, 0);
+                    result           = vset_lane_u8(val1, result, 1);
+                    result           = vset_lane_u8(val2, result, 2);
+                    result           = vset_lane_u8(val3, result, 3);
+                    result           = vset_lane_u8(val4, result, 4);
+                    result           = vset_lane_u8(val5, result, 5);
+                    result           = vset_lane_u8(val6, result, 6);
+                    result           = vset_lane_u8(val7, result, 7);
 
-                // Compute destination address
-                const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
+                    // Compute destination address
+                    const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
 
-                vst1_u8(output.ptr() + dst_offset_in_bytes, result);
-            }
-        },
-        input, output);
+                    vst1_u8(output.ptr() + dst_offset_in_bytes, result);
+                }
+            },
+            input, output);
     }
 
-    if(left_over_loop_y)
+    if (left_over_loop_y)
     {
         window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
         window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
@@ -190,16 +225,18 @@
         Iterator output(out, window_out);
 
         // Compute left-over elements along the y dimension (1x1)
-        execute_window_loop(window_in, [&](const Coordinates & id)
-        {
-            const uint8_t val0 = *input.ptr();
+        execute_window_loop(
+            window_in,
+            [&](const Coordinates &id)
+            {
+                const uint8_t val0 = *input.ptr();
 
-            // Compute destination address
-            const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
+                // Compute destination address
+                const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
 
-            *(output.ptr() + dst_offset_in_bytes) = val0;
-        },
-        input, output);
+                *(output.ptr() + dst_offset_in_bytes) = val0;
+            },
+            input, output);
     }
 }
 
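The 8x8 byte transpose whose store sequence appears above is built as a three-level block swap: vtrn at u8 granularity transposes the 1x1 elements of every 2x2 tile, vtrn on u16-reinterpreted vectors swaps the 2x2 blocks of every 4x4 tile, and vtrn on u32-reinterpreted vectors swaps the 4x4 blocks of the full 8x8 tile. A minimal scalar model of that idea (illustrative only, not Compute Library code):

    #include <array>
    #include <cstdint>
    #include <cstdio>
    #include <utility>

    int main()
    {
        // Fill an 8x8 matrix with distinct values.
        std::array<std::array<uint8_t, 8>, 8> m{};
        for (int r = 0; r < 8; ++r)
            for (int c = 0; c < 8; ++c)
                m[r][c] = static_cast<uint8_t>(8 * r + c);

        // Each pass swaps the off-diagonal blk x blk blocks of every
        // 2*blk x 2*blk tile; blk = 1, 2, 4 composes a full transpose,
        // mirroring what vtrn does at u8/u16/u32 granularity in registers.
        for (int blk = 1; blk < 8; blk *= 2)
            for (int r = 0; r < 8; r += 2 * blk)
                for (int c = 0; c < 8; c += 2 * blk)
                    for (int i = 0; i < blk; ++i)
                        for (int j = 0; j < blk; ++j)
                            std::swap(m[r + i][c + blk + j], m[r + blk + i][c + j]);

        // Verify: m must now be the transpose of the original.
        for (int r = 0; r < 8; ++r)
            for (int c = 0; c < 8; ++c)
                if (m[r][c] != static_cast<uint8_t>(8 * c + r))
                    return 1;
        std::puts("8x8 block-swap transpose OK");
        return 0;
    }

The vector code performs the same three passes entirely in registers, which is why the stores above interleave the k0..k3 values rather than writing rows in order.
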
@@ -220,10 +257,10 @@
 
     Window window_in(window);
     window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
-    if(left_over_loop_y)
+    if (left_over_loop_y)
     {
         // Check if window_end_y_multiple_of is greater than window_start_y
-        if(window_end_y_multiple_of > window_start_y)
+        if (window_end_y_multiple_of > window_start_y)
         {
             window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
         }
@@ -240,61 +277,77 @@
     Iterator output(out, window_out);
 
     // Run the SIMD path if and only if the input is not a row-vector
-    if(in->info()->dimension(1) != 1)
+    if (in->info()->dimension(1) != 1)
     {
         Iterator input(in, window_in);
-        execute_window_loop(window_in, [&](const Coordinates & id)
-        {
-            // Compute 4x4 elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            window_in,
+            [&](const Coordinates &id)
             {
-                const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
-                const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
-                const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
-                const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+                // Compute 4x4 elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const uint16x4_t row0 =
+                        vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+                    const uint16x4_t row1 =
+                        vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+                    const uint16x4_t row2 =
+                        vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+                    const uint16x4_t row3 =
+                        vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
 
-                // Transpose 2x2
-                const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
-                const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
+                    // Transpose 2x2
+                    const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
+                    const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
 
-                // Transpose 4x4
-                const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
-                const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
+                    // Transpose 4x4
+                    const uint32x2x2_t k0_u32 =
+                        vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
+                    const uint32x2x2_t k1_u32 =
+                        vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
 
-                // Compute destination address
-                const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
+                    // Compute destination address
+                    const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
 
-                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));
-                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));
-                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));
-                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));
-            }
+                    vst1_u16(
+                        reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+                        vreinterpret_u16_u32(k0_u32.val[0]));
+                    vst1_u16(
+                        reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+                        vreinterpret_u16_u32(k1_u32.val[0]));
+                    vst1_u16(
+                        reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+                        vreinterpret_u16_u32(k0_u32.val[1]));
+                    vst1_u16(
+                        reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+                        vreinterpret_u16_u32(k1_u32.val[1]));
+                }
 
-            // Compute left-over elements (1x4)
-            for(; x < window_end_x; ++x)
-            {
-                const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
-                const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
-                const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
-                const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+                // Compute left-over elements (1x4)
+                for (; x < window_end_x; ++x)
+                {
+                    const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+                    const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+                    const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+                    const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
 
-                uint16x4_t result = vdup_n_u16(0);
-                result            = vset_lane_u16(val0, result, 0);
-                result            = vset_lane_u16(val1, result, 1);
-                result            = vset_lane_u16(val2, result, 2);
-                result            = vset_lane_u16(val3, result, 3);
+                    uint16x4_t result = vdup_n_u16(0);
+                    result            = vset_lane_u16(val0, result, 0);
+                    result            = vset_lane_u16(val1, result, 1);
+                    result            = vset_lane_u16(val2, result, 2);
+                    result            = vset_lane_u16(val3, result, 3);
 
-                // Compute destination address
-                const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
+                    // Compute destination address
+                    const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
 
-                vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
-            }
-        },
-        input, output);
+                    vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
+                }
+            },
+            input, output);
     }
 
-    if(left_over_loop_y)
+    if (left_over_loop_y)
     {
         window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
         window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
@@ -303,16 +356,18 @@
         Iterator output(out, window_out);
 
         // Compute left-over elements along the y dimension (1x1)
-        execute_window_loop(window_in, [&](const Coordinates & id)
-        {
-            const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr()));
+        execute_window_loop(
+            window_in,
+            [&](const Coordinates &id)
+            {
+                const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr()));
 
-            // Compute destination address
-            const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
+                // Compute destination address
+                const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
 
-            *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
-        },
-        input, output);
+                *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+            },
+            input, output);
     }
 }
 
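transpose_16bit_elements, formatted above, applies the same decomposition at 16-bit granularity: vtrn_u16 transposes the 2x2 sub-tiles of a 4x4 block, then vtrn_u32 on reinterpreted vectors swaps the off-diagonal 2x2 blocks. A self-contained sketch of that core step for Arm targets (the helper name is illustrative; requires a toolchain providing arm_neon.h):

    #include <arm_neon.h>
    #include <cstdint>
    #include <cstdio>

    static void transpose4x4_u16(const uint16_t *in, uint16_t *out)
    {
        const uint16x4_t row0 = vld1_u16(in + 0);
        const uint16x4_t row1 = vld1_u16(in + 4);
        const uint16x4_t row2 = vld1_u16(in + 8);
        const uint16x4_t row3 = vld1_u16(in + 12);

        // Transpose the 2x2 sub-tiles
        const uint16x4x2_t k0 = vtrn_u16(row0, row1);
        const uint16x4x2_t k1 = vtrn_u16(row2, row3);

        // Swap the off-diagonal 2x2 blocks via a u32 reinterpretation
        const uint32x2x2_t t0 = vtrn_u32(vreinterpret_u32_u16(k0.val[0]), vreinterpret_u32_u16(k1.val[0]));
        const uint32x2x2_t t1 = vtrn_u32(vreinterpret_u32_u16(k0.val[1]), vreinterpret_u32_u16(k1.val[1]));

        vst1_u16(out + 0, vreinterpret_u16_u32(t0.val[0]));
        vst1_u16(out + 4, vreinterpret_u16_u32(t1.val[0]));
        vst1_u16(out + 8, vreinterpret_u16_u32(t0.val[1]));
        vst1_u16(out + 12, vreinterpret_u16_u32(t1.val[1]));
    }

    int main()
    {
        uint16_t in[16], out[16];
        for (int i = 0; i < 16; ++i) in[i] = static_cast<uint16_t>(i);
        transpose4x4_u16(in, out);
        for (int r = 0; r < 4; ++r)
            for (int c = 0; c < 4; ++c)
                if (out[4 * r + c] != in[4 * c + r]) return 1;
        std::puts("4x4 u16 transpose OK");
        return 0;
    }

The store order (t0.val[0], t1.val[0], t0.val[1], t1.val[1]) matches the k0_u32/k1_u32 pattern the kernel writes out above.
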
@@ -347,10 +402,10 @@
 
     Window window_in(window);
     window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
-    if(left_over_loop_y)
+    if (left_over_loop_y)
     {
         // Check if window_end_y_multiple_of is greater than window_start_y
-        if(window_end_y_multiple_of > window_start_y)
+        if (window_end_y_multiple_of > window_start_y)
         {
             window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
         }
@@ -367,102 +422,160 @@
     Iterator output(out, window_out);
 
     // Run the SIMD path if and only if the input is not a row-vector
-    if(in->info()->dimension(1) != 1)
+    if (in->info()->dimension(1) != 1)
     {
         Iterator input(in, window_in);
-        execute_window_loop(window_in, [&](const Coordinates & id)
-        {
-            // Compute 8x8 elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            window_in,
+            [&](const Coordinates &id)
             {
-                // Load
-                const uint32x4x2_t row0 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
-                const uint32x4x2_t row1 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
-                const uint32x4x2_t row2 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
-                const uint32x4x2_t row3 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-                const uint32x4x2_t row4 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
-                const uint32x4x2_t row5 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
-                const uint32x4x2_t row6 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
-                const uint32x4x2_t row7 = vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
+                // Compute 8x8 elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    // Load
+                    const uint32x4x2_t row0 =
+                        vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+                    const uint32x4x2_t row1 =
+                        vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+                    const uint32x4x2_t row2 =
+                        vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+                    const uint32x4x2_t row3 =
+                        vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+                    const uint32x4x2_t row4 =
+                        vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
+                    const uint32x4x2_t row5 =
+                        vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
+                    const uint32x4x2_t row6 =
+                        vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
+                    const uint32x4x2_t row7 =
+                        vld1q_u32_x2_(reinterpret_cast<const uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
 
-                // Transpose 2x4
-                const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]), vtrn2q_u32(row0.val[0], row1.val[0])};
-                const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]), vtrn2q_u32(row0.val[1], row1.val[1])};
-                const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]), vtrn2q_u32(row2.val[0], row3.val[0])};
-                const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]), vtrn2q_u32(row2.val[1], row3.val[1])};
-                const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]), vtrn2q_u32(row4.val[0], row5.val[0])};
-                const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]), vtrn2q_u32(row4.val[1], row5.val[1])};
-                const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]), vtrn2q_u32(row6.val[0], row7.val[0])};
-                const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]), vtrn2q_u32(row6.val[1], row7.val[1])};
+                    // Transpose 2x4
+                    const uint32x4x2_t k0_u32 = {vtrn1q_u32(row0.val[0], row1.val[0]),
+                                                 vtrn2q_u32(row0.val[0], row1.val[0])};
+                    const uint32x4x2_t k1_u32 = {vtrn1q_u32(row0.val[1], row1.val[1]),
+                                                 vtrn2q_u32(row0.val[1], row1.val[1])};
+                    const uint32x4x2_t k2_u32 = {vtrn1q_u32(row2.val[0], row3.val[0]),
+                                                 vtrn2q_u32(row2.val[0], row3.val[0])};
+                    const uint32x4x2_t k3_u32 = {vtrn1q_u32(row2.val[1], row3.val[1]),
+                                                 vtrn2q_u32(row2.val[1], row3.val[1])};
+                    const uint32x4x2_t k4_u32 = {vtrn1q_u32(row4.val[0], row5.val[0]),
+                                                 vtrn2q_u32(row4.val[0], row5.val[0])};
+                    const uint32x4x2_t k5_u32 = {vtrn1q_u32(row4.val[1], row5.val[1]),
+                                                 vtrn2q_u32(row4.val[1], row5.val[1])};
+                    const uint32x4x2_t k6_u32 = {vtrn1q_u32(row6.val[0], row7.val[0]),
+                                                 vtrn2q_u32(row6.val[0], row7.val[0])};
+                    const uint32x4x2_t k7_u32 = {vtrn1q_u32(row6.val[1], row7.val[1]),
+                                                 vtrn2q_u32(row6.val[1], row7.val[1])};
 
-                // Transpose 2x2
-                const uint64x2x2_t k0_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))};
-                const uint64x2x2_t k1_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))};
-                const uint64x2x2_t k2_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))};
-                const uint64x2x2_t k3_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))};
-                const uint64x2x2_t k4_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))};
-                const uint64x2x2_t k5_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))};
-                const uint64x2x2_t k6_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))};
-                const uint64x2x2_t k7_u64 = {vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])), vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))};
+                    // Transpose 2x2
+                    const uint64x2x2_t k0_u64 = {
+                        vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0])),
+                        vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[0]), vreinterpretq_u64_u32(k2_u32.val[0]))};
+                    const uint64x2x2_t k1_u64 = {
+                        vtrn1q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1])),
+                        vtrn2q_u64(vreinterpretq_u64_u32(k0_u32.val[1]), vreinterpretq_u64_u32(k2_u32.val[1]))};
+                    const uint64x2x2_t k2_u64 = {
+                        vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0])),
+                        vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[0]), vreinterpretq_u64_u32(k3_u32.val[0]))};
+                    const uint64x2x2_t k3_u64 = {
+                        vtrn1q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1])),
+                        vtrn2q_u64(vreinterpretq_u64_u32(k1_u32.val[1]), vreinterpretq_u64_u32(k3_u32.val[1]))};
+                    const uint64x2x2_t k4_u64 = {
+                        vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0])),
+                        vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[0]), vreinterpretq_u64_u32(k6_u32.val[0]))};
+                    const uint64x2x2_t k5_u64 = {
+                        vtrn1q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1])),
+                        vtrn2q_u64(vreinterpretq_u64_u32(k4_u32.val[1]), vreinterpretq_u64_u32(k6_u32.val[1]))};
+                    const uint64x2x2_t k6_u64 = {
+                        vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0])),
+                        vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[0]), vreinterpretq_u64_u32(k7_u32.val[0]))};
+                    const uint64x2x2_t k7_u64 = {
+                        vtrn1q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1])),
+                        vtrn2q_u64(vreinterpretq_u64_u32(k5_u32.val[1]), vreinterpretq_u64_u32(k7_u32.val[1]))};
 
-                // Swap blocks
-                const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]), vreinterpretq_u32_u64(k4_u64.val[0])};
-                const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]), vreinterpretq_u32_u64(k5_u64.val[0])};
-                const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]), vreinterpretq_u32_u64(k4_u64.val[1])};
-                const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]), vreinterpretq_u32_u64(k5_u64.val[1])};
-                const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]), vreinterpretq_u32_u64(k6_u64.val[0])};
-                const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]), vreinterpretq_u32_u64(k7_u64.val[0])};
-                const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]), vreinterpretq_u32_u64(k6_u64.val[1])};
-                const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]), vreinterpretq_u32_u64(k7_u64.val[1])};
+                    // Swap blocks
+                    const uint32x4x2_t col0 = {vreinterpretq_u32_u64(k0_u64.val[0]),
+                                               vreinterpretq_u32_u64(k4_u64.val[0])};
+                    const uint32x4x2_t col1 = {vreinterpretq_u32_u64(k1_u64.val[0]),
+                                               vreinterpretq_u32_u64(k5_u64.val[0])};
+                    const uint32x4x2_t col2 = {vreinterpretq_u32_u64(k0_u64.val[1]),
+                                               vreinterpretq_u32_u64(k4_u64.val[1])};
+                    const uint32x4x2_t col3 = {vreinterpretq_u32_u64(k1_u64.val[1]),
+                                               vreinterpretq_u32_u64(k5_u64.val[1])};
+                    const uint32x4x2_t col4 = {vreinterpretq_u32_u64(k2_u64.val[0]),
+                                               vreinterpretq_u32_u64(k6_u64.val[0])};
+                    const uint32x4x2_t col5 = {vreinterpretq_u32_u64(k3_u64.val[0]),
+                                               vreinterpretq_u32_u64(k7_u64.val[0])};
+                    const uint32x4x2_t col6 = {vreinterpretq_u32_u64(k2_u64.val[1]),
+                                               vreinterpretq_u32_u64(k6_u64.val[1])};
+                    const uint32x4x2_t col7 = {vreinterpretq_u32_u64(k3_u64.val[1]),
+                                               vreinterpretq_u32_u64(k7_u64.val[1])};
 
-                // Compute destination address
-                const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+                    // Compute destination address
+                    const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
 
-                // Store
-                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), col0);
-                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), col1);
-                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), col2);
-                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), col3);
-                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), col4);
-                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), col5);
-                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), col6);
-                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), col7);
-            }
+                    // Store
+                    vst1q_u32_x2_(
+                        reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+                        col0);
+                    vst1q_u32_x2_(
+                        reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+                        col1);
+                    vst1q_u32_x2_(
+                        reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+                        col2);
+                    vst1q_u32_x2_(
+                        reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+                        col3);
+                    vst1q_u32_x2_(
+                        reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes),
+                        col4);
+                    vst1q_u32_x2_(
+                        reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes),
+                        col5);
+                    vst1q_u32_x2_(
+                        reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes),
+                        col6);
+                    vst1q_u32_x2_(
+                        reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes),
+                        col7);
+                }
 
-            // Compute left-over elements (8x1)
-            for(; x < window_end_x; ++x)
-            {
-                const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
-                const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
-                const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
-                const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
-                const uint32_t val4 = *(reinterpret_cast<uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
-                const uint32_t val5 = *(reinterpret_cast<uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
-                const uint32_t val6 = *(reinterpret_cast<uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
-                const uint32_t val7 = *(reinterpret_cast<uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
+                // Compute left-over elements (8x1)
+                for (; x < window_end_x; ++x)
+                {
+                    const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+                    const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+                    const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+                    const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+                    const uint32_t val4 = *(reinterpret_cast<uint32_t *>(input.ptr() + 4 * input_stride_in_bytes) + x);
+                    const uint32_t val5 = *(reinterpret_cast<uint32_t *>(input.ptr() + 5 * input_stride_in_bytes) + x);
+                    const uint32_t val6 = *(reinterpret_cast<uint32_t *>(input.ptr() + 6 * input_stride_in_bytes) + x);
+                    const uint32_t val7 = *(reinterpret_cast<uint32_t *>(input.ptr() + 7 * input_stride_in_bytes) + x);
 
-                uint32x4_t result0 = vdupq_n_u32(0);
-                uint32x4_t result1 = vdupq_n_u32(0);
-                result0            = vsetq_lane_u32(val0, result0, 0);
-                result0            = vsetq_lane_u32(val1, result0, 1);
-                result0            = vsetq_lane_u32(val2, result0, 2);
-                result0            = vsetq_lane_u32(val3, result0, 3);
-                result1            = vsetq_lane_u32(val4, result1, 0);
-                result1            = vsetq_lane_u32(val5, result1, 1);
-                result1            = vsetq_lane_u32(val6, result1, 2);
-                result1            = vsetq_lane_u32(val7, result1, 3);
+                    uint32x4_t result0 = vdupq_n_u32(0);
+                    uint32x4_t result1 = vdupq_n_u32(0);
+                    result0            = vsetq_lane_u32(val0, result0, 0);
+                    result0            = vsetq_lane_u32(val1, result0, 1);
+                    result0            = vsetq_lane_u32(val2, result0, 2);
+                    result0            = vsetq_lane_u32(val3, result0, 3);
+                    result1            = vsetq_lane_u32(val4, result1, 0);
+                    result1            = vsetq_lane_u32(val5, result1, 1);
+                    result1            = vsetq_lane_u32(val6, result1, 2);
+                    result1            = vsetq_lane_u32(val7, result1, 3);
 
-                // Compute destination address
-                const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+                    // Compute destination address
+                    const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
 
-                vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), {result0, result1});
-            }
-        },
-        input, output);
+                    vst1q_u32_x2_(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), {result0, result1});
+                }
+            },
+            input, output);
     }
 
-    if(left_over_loop_y)
+    if (left_over_loop_y)
     {
         window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
         window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
@@ -471,40 +584,42 @@
         Iterator output(out, window_out);
 
         // Compute left-over elements along the y dimension (1x1)
-        execute_window_loop(window_in, [&](const Coordinates & id)
-        {
-            const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
+        execute_window_loop(
+            window_in,
+            [&](const Coordinates &id)
+            {
+                const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
 
-            // Compute destination address
-            const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
+                // Compute destination address
+                const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
 
-            *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
-        },
-        input, output);
+                *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+            },
+            input, output);
     }
 }
 #else  // __aarch64__
 void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)
 {
-    const int    window_step_x            = 4;
-    const int    window_step_y            = 4;
-    const int    window_start_x           = window.x().start();
-    const int    window_end_x             = window.x().end();
-    const int    window_start_y           = window.y().start();
-    const int    window_end_y             = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
-    const int    window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
-    const size_t input_stride_in_bytes    = in->info()->strides_in_bytes()[1];
-    const size_t output_stride_in_bytes   = out->info()->strides_in_bytes()[1];
+    const int    window_step_x            = 4;
+    const int    window_step_y            = 4;
+    const int    window_start_x           = window.x().start();
+    const int    window_end_x             = window.x().end();
+    const int    window_start_y           = window.y().start();
+    const int    window_end_y             = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
+    const int    window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
+    const size_t input_stride_in_bytes    = in->info()->strides_in_bytes()[1];
+    const size_t output_stride_in_bytes   = out->info()->strides_in_bytes()[1];
 
     // Check if we need a left-over loop for the y dimension
     bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
 
     Window window_in(window);
     window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
-    if(left_over_loop_y)
+    if (left_over_loop_y)
     {
         // Check if window_end_y_multiple_of is greater than window_start_y
-        if(window_end_y_multiple_of > window_start_y)
+        if (window_end_y_multiple_of > window_start_y)
         {
             window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
         }
@@ -521,60 +636,74 @@
     Iterator output(out, window_out);
 
     // Run the SIMD path if and only if the input is not a row-vector
-    if(in->info()->dimension(1) != 1)
+    if (in->info()->dimension(1) != 1)
     {
         Iterator input(in, window_in);
-        execute_window_loop(window_in, [&](const Coordinates & id)
-        {
-            // Compute 4x4 elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            window_in,
+            [&](const Coordinates &id)
             {
-                const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
-                const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
-                const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
-                const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+                // Compute 4x4 elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const uint32x4_t row0 =
+                        vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+                    const uint32x4_t row1 =
+                        vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+                    const uint32x4_t row2 =
+                        vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+                    const uint32x4_t row3 =
+                        vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
 
-                // Transpose 2x2
-                const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
-                const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
-                const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
-                const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
+                    // Transpose 2x2
+                    const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
+                    const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
+                    const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
+                    const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
 
-                // Compute destination address
-                const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+                    // Compute destination address
+                    const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
 
-                // Swap block 01 with block 10 and store
-                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
-                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
-                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
-                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
-            }
+                    // Swap block 01 with block 10 and store
+                    vst1q_u32(
+                        reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes),
+                        vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
+                    vst1q_u32(
+                        reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes),
+                        vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
+                    vst1q_u32(
+                        reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes),
+                        vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
+                    vst1q_u32(
+                        reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes),
+                        vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
+                }
 
-            // Compute left-over elements (1x4)
-            for(; x < window_end_x; ++x)
-            {
-                const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
-                const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
-                const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
-                const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
+                // Compute left-over elements (1x4)
+                for (; x < window_end_x; ++x)
+                {
+                    const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
+                    const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
+                    const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
+                    const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
 
-                uint32x4_t result = vdupq_n_u32(0);
-                result            = vsetq_lane_u32(val0, result, 0);
-                result            = vsetq_lane_u32(val1, result, 1);
-                result            = vsetq_lane_u32(val2, result, 2);
-                result            = vsetq_lane_u32(val3, result, 3);
+                    uint32x4_t result = vdupq_n_u32(0);
+                    result            = vsetq_lane_u32(val0, result, 0);
+                    result            = vsetq_lane_u32(val1, result, 1);
+                    result            = vsetq_lane_u32(val2, result, 2);
+                    result            = vsetq_lane_u32(val3, result, 3);
 
-                // Compute destination address
-                const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
+                    // Compute destination address
+                    const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
 
-                vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
-            }
-        },
-        input, output);
+                    vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
+                }
+            },
+            input, output);
     }
 
-    if(left_over_loop_y)
+    if (left_over_loop_y)
     {
         window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
         window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
@@ -583,16 +712,18 @@
         Iterator output(out, window_out);
 
         // Compute left-over elements along the y dimension (1x1)
-        execute_window_loop(window_in, [&](const Coordinates & id)
-        {
-            const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
+        execute_window_loop(
+            window_in,
+            [&](const Coordinates &id)
+            {
+                const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
 
-            // Compute destination address
-            const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
+                // Compute destination address
+                const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
 
-            *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
-        },
-        input, output);
+                *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
+            },
+            input, output);
     }
 }
 #endif // __aarch64__
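
The two 32-bit paths above differ because A64 adds vtrn1q/vtrn2q, which return the even- and odd-lane interleavings as two separate 128-bit vectors; the AArch64 branch uses them (plus the u64 reinterpret trick) to transpose 8x8 tiles, while the ARMv7 fallback under #else sticks to the paired vtrn_u32/vcombine_u32 form on 4x4 tiles. A small AArch64-only demonstration of the intrinsic pair (hypothetical example, not library code):

    #include <arm_neon.h>
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const uint32_t a_data[4] = {0, 1, 2, 3};
        const uint32_t b_data[4] = {4, 5, 6, 7};
        const uint32x4_t a = vld1q_u32(a_data);
        const uint32x4_t b = vld1q_u32(b_data);

        // A64: even- and odd-lane interleavings come back separately...
        const uint32x4_t even = vtrn1q_u32(a, b); // {0, 4, 2, 6}
        const uint32x4_t odd  = vtrn2q_u32(a, b); // {1, 5, 3, 7}

        // ...where ARMv7's vtrn_u32 packs the 64-bit analogue into one struct.
        uint32_t out[8];
        vst1q_u32(out + 0, even);
        vst1q_u32(out + 4, odd);
        for (int i = 0; i < 8; ++i)
            std::printf("%u ", out[i]); // 0 4 2 6 1 5 3 7
        std::printf("\n");
        return 0;
    }
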
@@ -616,7 +747,8 @@
     const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->element_size());
 
     // Configure kernel window
-    Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    Window win =
+        calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     // The CpuTranspose doesn't need padding, so update_window_and_padding() can be skipped
     Coordinates coord;
@@ -637,7 +769,7 @@
                                     "Element size not supported");
 
     // Validate configured destination
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src);
 
@@ -658,7 +790,7 @@
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
     auto       dst = tensors.get_tensor(TensorType::ACL_DST);
 
-    switch(src->info()->element_size())
+    switch (src->info()->element_size())
     {
         case 1:
             transpose_8bit_elements(src, dst, window);
diff --git a/src/cpu/kernels/CpuTransposeKernel.h b/src/cpu/kernels/CpuTransposeKernel.h
index cb85dae..e79a405 100644
--- a/src/cpu/kernels/CpuTransposeKernel.h
+++ b/src/cpu/kernels/CpuTransposeKernel.h
@@ -54,7 +54,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 };
 } // namespace kernels
diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
index 2ccc977..297ba63 100644
--- a/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
+++ b/src/cpu/kernels/CpuWeightsReshapeKernel.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
@@ -38,7 +39,7 @@
 {
 TensorShape get_output_shape(const ITensorInfo *src, bool has_bias)
 {
-    TensorShape output_shape{ src->tensor_shape() };
+    TensorShape output_shape{src->tensor_shape()};
 
     output_shape.collapse(3);
     const size_t tmp_dim = output_shape[0];
@@ -54,20 +55,22 @@
     // Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
     ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(src->data_type()));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
         ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->num_dimensions() != 1));
         ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->num_dimensions() != 2));
         ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->dimension(0) != src->tensor_shape()[3]));
-        ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || biases->dimension(1) != src->tensor_shape()[4]));
+        ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] ||
+                                                                     biases->dimension(1) != src->tensor_shape()[4]));
     }
 
     // Checks performed when output is configured
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), get_output_shape(src, biases != nullptr));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
+                                                           get_output_shape(src, biases != nullptr));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
     }
@@ -84,9 +87,7 @@
     auto_init_if_empty(*dst, src->clone()->set_tensor_shape(get_output_shape(src, (biases != nullptr))));
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
-                                                  biases,
-                                                  dst));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst));
 
     // Configure kernel
     Window window = calculate_max_window(*src, Steps());
@@ -122,44 +123,47 @@
 
     // Create iterators
     Iterator in(src, window);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        // Get column index
-        const int kernel_idx = id[3];
-        const int kernel_idz = id[4];
-
-        // Setup pointers
-        const uint8_t *tmp_input_ptr        = in.ptr();
-        uint8_t       *tmp_output_ptr       = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
-        const uint8_t *curr_input_row_ptr   = tmp_input_ptr;
-        const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
-
-        // Linearize volume
-        for(unsigned int d = 0; d < kernel_depth; ++d)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            for(unsigned int j = 0; j < kernel_size_y; ++j)
+            // Get column index
+            const int kernel_idx = id[3];
+            const int kernel_idz = id[4];
+
+            // Setup pointers
+            const uint8_t *tmp_input_ptr        = in.ptr();
+            uint8_t       *tmp_output_ptr       = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+            const uint8_t *curr_input_row_ptr   = tmp_input_ptr;
+            const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+            // Linearize volume
+            for (unsigned int d = 0; d < kernel_depth; ++d)
             {
-                for(unsigned int i = 0; i < kernel_size_x; ++i)
+                for (unsigned int j = 0; j < kernel_size_y; ++j)
                 {
-                    std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size());
-                    tmp_input_ptr += input_stride_x;
-                    tmp_output_ptr += output_stride_y;
+                    for (unsigned int i = 0; i < kernel_size_x; ++i)
+                    {
+                        std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size());
+                        tmp_input_ptr += input_stride_x;
+                        tmp_output_ptr += output_stride_y;
+                    }
+                    curr_input_row_ptr += input_stride_y;
+                    tmp_input_ptr = curr_input_row_ptr;
                 }
-                curr_input_row_ptr += input_stride_y;
-                tmp_input_ptr = curr_input_row_ptr;
+                curr_input_depth_ptr += input_stride_z;
+                curr_input_row_ptr = curr_input_depth_ptr;
+                tmp_input_ptr      = curr_input_depth_ptr;
             }
-            curr_input_depth_ptr += input_stride_z;
-            curr_input_row_ptr = curr_input_depth_ptr;
-            tmp_input_ptr      = curr_input_depth_ptr;
-        }
 
-        // Add bias
-        if(biases != nullptr)
-        {
-            std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), src->info()->element_size());
-        }
-    },
-    in);
+            // Add bias
+            if (biases != nullptr)
+            {
+                std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)),
+                            src->info()->element_size());
+            }
+        },
+        in);
 }
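
CpuWeightsReshapeKernel::run_op, reformatted above, linearizes each kernel volume into one column of the destination: the x/y/depth loops copy kernel_size_x * kernel_size_y * kernel_depth elements down the column selected by kernel_idx, and the optional bias lands in a final extra row. A scalar sketch of the same mapping (the names and the [num_kernels][depth][ky][kx] ordering are illustrative assumptions, not the library's layout):

    #include <cstddef>
    #include <vector>

    // Produces one column per kernel; when bias is given, an extra last row
    // holds the per-kernel bias value.
    std::vector<float> reshape_weights(const std::vector<float> &w,
                                       const std::vector<float> *bias,
                                       std::size_t num_kernels,
                                       std::size_t depth,
                                       std::size_t ky,
                                       std::size_t kx)
    {
        const std::size_t rows = depth * ky * kx + (bias != nullptr ? 1 : 0);
        std::vector<float> out(rows * num_kernels, 0.0f);
        for (std::size_t n = 0; n < num_kernels; ++n)
        {
            std::size_t row = 0;
            for (std::size_t d = 0; d < depth; ++d)
                for (std::size_t j = 0; j < ky; ++j)
                    for (std::size_t i = 0; i < kx; ++i, ++row)
                        out[row * num_kernels + n] = w[((n * depth + d) * ky + j) * kx + i];
            if (bias != nullptr)
                out[row * num_kernels + n] = (*bias)[n]; // bias as the last row
        }
        return out;
    }
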
 const char *CpuWeightsReshapeKernel::name() const
 {
@@ -167,4 +171,4 @@
 }
 } // namespace kernels
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/CpuWeightsReshapeKernel.h b/src/cpu/kernels/CpuWeightsReshapeKernel.h
index 1a260ed..9310b3c 100644
--- a/src/cpu/kernels/CpuWeightsReshapeKernel.h
+++ b/src/cpu/kernels/CpuWeightsReshapeKernel.h
@@ -82,7 +82,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 };
 } // namespace kernels
diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp
index 818d878..52e3f25 100644
--- a/src/cpu/kernels/CpuWinogradConv2dKernel.cpp
+++ b/src/cpu/kernels/CpuWinogradConv2dKernel.cpp
@@ -28,8 +28,10 @@
 {
 namespace cpu
 {
-CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads)
-    : _winograd_impl{ w_impl }, _conv_args{ _c_args }, _nthreads{ nthreads }
+CpuWinogradConv2dTransformInputKernel::CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+                                                                             arm_conv::ConvolutionArgs        &_c_args,
+                                                                             uint32_t                          nthreads)
+    : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads}
 {
 }
 
@@ -49,24 +51,20 @@
     const size_t input_row_stride   = src_strides[height_idx] / element_size_in_bytes;
     const size_t input_col_stride   = src_strides[width_idx] / element_size_in_bytes;
     const size_t input_batch_stride = src_strides[batch_idx] / element_size_in_bytes;
-    const auto   input_nhwc_ptr     = reinterpret_cast<const void *>(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes());
-    auto         win_transf_ptr     = reinterpret_cast<void *>(winograd_input_transform->buffer() + winograd_input_transform->info()->offset_first_element_in_bytes());
+    const auto   input_nhwc_ptr =
+        reinterpret_cast<const void *>(input_nhwc->buffer() + input_nhwc->info()->offset_first_element_in_bytes());
+    auto win_transf_ptr = reinterpret_cast<void *>(winograd_input_transform->buffer() +
+                                                   winograd_input_transform->info()->offset_first_element_in_bytes());
 
-    _winograd_impl.input_transform->execute(
-        _conv_args,
-        input_nhwc_ptr,
-        input_batch_stride,
-        input_row_stride,
-        input_col_stride,
-        win_transf_ptr,
-        _winograd_impl.winograd_spec,
-        workspace->buffer(),
-        info.thread_id,
-        _nthreads);
+    _winograd_impl.input_transform->execute(_conv_args, input_nhwc_ptr, input_batch_stride, input_row_stride,
+                                            input_col_stride, win_transf_ptr, _winograd_impl.winograd_spec,
+                                            workspace->buffer(), info.thread_id, _nthreads);
 }
 
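Both Winograd transform kernels divide the tensor byte strides by element_size_in_bytes, as above, because ITensorInfo stores strides in bytes while the arm_conv transforms consume strides in elements. A trivial sketch of the conversion (the values are made up):

    #include <cstddef>
    #include <cstdio>

    int main()
    {
        const std::size_t element_size_in_bytes = 4;   // e.g. FP32
        const std::size_t row_stride_in_bytes   = 256; // a padded row of 64 floats
        const std::size_t row_stride_in_elems   = row_stride_in_bytes / element_size_in_bytes;
        std::printf("row stride: %zu elements\n", row_stride_in_elems); // 64
        return 0;
    }
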
-CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads)
-    : _winograd_impl{ w_impl }, _conv_args{ _c_args }, _nthreads{ nthreads }
+CpuWinogradConv2dTransformOutputKernel::CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+                                                                               arm_conv::ConvolutionArgs &_c_args,
+                                                                               uint32_t                   nthreads)
+    : _winograd_impl{w_impl}, _conv_args{_c_args}, _nthreads{nthreads}
 {
 }
 
@@ -88,28 +86,21 @@
     const size_t out_row_stride   = dst_strides[height_idx] / element_size_in_bytes;
     const size_t out_col_stride   = dst_strides[width_idx] / element_size_in_bytes;
     const size_t out_batch_stride = dst_strides[batch_idx] / element_size_in_bytes;
-    const auto   wout_transf_ptr  = reinterpret_cast<const void *>(winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes());
-    auto         dst_nhwc_ptr     = reinterpret_cast<void *>(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes());
-    void        *biases_data_ptr  = nullptr;
-    if(biases != nullptr)
+    const auto   wout_transf_ptr  = reinterpret_cast<const void *>(
+        winograd_output_transform->buffer() + winograd_output_transform->info()->offset_first_element_in_bytes());
+    auto dst_nhwc_ptr =
+        reinterpret_cast<void *>(dst_nhwc->buffer() + dst_nhwc->info()->offset_first_element_in_bytes());
+    void *biases_data_ptr = nullptr;
+    if (biases != nullptr)
     {
         biases_data_ptr = reinterpret_cast<void *>(biases->buffer() + biases->info()->offset_first_element_in_bytes());
     }
 
     // Output transform
-    _winograd_impl.output_transform->execute(
-        _conv_args,
-        wout_transf_ptr,
-        _winograd_impl.winograd_spec,
-        biases_data_ptr,
-        dst_nhwc_ptr,
-        out_batch_stride,
-        out_row_stride,
-        out_col_stride,
-        workspace->buffer(),
-        info.thread_id,
-        _nthreads);
+    _winograd_impl.output_transform->execute(_conv_args, wout_transf_ptr, _winograd_impl.winograd_spec, biases_data_ptr,
+                                             dst_nhwc_ptr, out_batch_stride, out_row_stride, out_col_stride,
+                                             workspace->buffer(), info.thread_id, _nthreads);
 }
 
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
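The hunks above show the main traits of the new style: a 120-column limit, a space after control-flow keywords (if, switch, for), C++11 brace-init lists without inner padding ({w_impl} rather than { w_impl }), consecutive declarations and assignments kept aligned, and one parameter per line once a signature no longer fits. A .clang-format fragment consistent with that output is sketched below; the option values are inferred from the diff alone and are not the project's actual configuration:

    ColumnLimit:                  120
    PointerAlignment:             Right
    AlignConsecutiveAssignments:  Consecutive
    AlignConsecutiveDeclarations: Consecutive
    SpaceBeforeParens:            ControlStatements
    Cpp11BracedListStyle:         true
    BinPackParameters:            false
    AlignAfterOpenBracket:        Align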
diff --git a/src/cpu/kernels/CpuWinogradConv2dKernel.h b/src/cpu/kernels/CpuWinogradConv2dKernel.h
index 0170dca..8a3b745 100644
--- a/src/cpu/kernels/CpuWinogradConv2dKernel.h
+++ b/src/cpu/kernels/CpuWinogradConv2dKernel.h
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Steps.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/core/NEON/kernels/assembly/winograd.hpp"
 #include "src/core/NEON/kernels/convolution/common/tensor.hpp"
 #include "src/cpu/ICpuKernel.h"
@@ -53,7 +54,9 @@
     /**  Prevent instances of this class from being moved as it contains references. */
     CpuWinogradConv2dTransformInputKernel &operator=(CpuWinogradConv2dTransformInputKernel &&) = delete;
 
-    CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads);
+    CpuWinogradConv2dTransformInputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+                                          arm_conv::ConvolutionArgs        &_c_args,
+                                          uint32_t                          nthreads);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -83,7 +86,9 @@
     /**  Prevent instances of this class from being moved as it contains references. */
     CpuWinogradConv2dTransformOutputKernel &operator=(CpuWinogradConv2dTransformOutputKernel &&) = delete;
 
-    CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl, arm_conv::ConvolutionArgs &_c_args, uint32_t nthreads);
+    CpuWinogradConv2dTransformOutputKernel(arm_conv::winograd::WinogradImpl &w_impl,
+                                           arm_conv::ConvolutionArgs        &_c_args,
+                                           uint32_t                          nthreads);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
@@ -95,7 +100,7 @@
 
 private:
     arm_conv::winograd::WinogradImpl &_winograd_impl;
-    const arm_conv::ConvolutionArgs &_conv_args;
+    const arm_conv::ConvolutionArgs  &_conv_args;
     uint32_t                          _nthreads;
 };
 
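The deleted move assignment documented above follows directly from the reference members at the bottom of the class: a C++ reference cannot be re-seated after construction, so a moved-to kernel could never take over the source's _winograd_impl or _conv_args. A minimal illustration with a hypothetical type, not taken from the library:

    struct HoldsRef
    {
        explicit HoldsRef(int &r) : _r{r}
        {
        }
        HoldsRef &operator=(HoldsRef &&) = delete; // _r cannot be rebound to another int

    private:
        int &_r;
    };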
diff --git a/src/cpu/kernels/activation/generic/neon/fp16.cpp b/src/cpu/kernels/activation/generic/neon/fp16.cpp
index e51b5b3..ddc6dc2 100644
--- a/src/cpu/kernels/activation/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/activation/generic/neon/fp16.cpp
@@ -31,7 +31,7 @@
 {
 namespace
 {
-constexpr ActFpImplParams Fp16Params = { static_cast<float16_t>(1e-7), 8 };
+constexpr ActFpImplParams Fp16Params = {static_cast<float16_t>(1e-7), 8};
 } // namespace
 
 void neon_fp16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
@@ -40,4 +40,4 @@
 }
 } // namespace cpu
 } // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/activation/generic/neon/fp32.cpp b/src/cpu/kernels/activation/generic/neon/fp32.cpp
index 2a3b8a0..e558f8c 100644
--- a/src/cpu/kernels/activation/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/activation/generic/neon/fp32.cpp
@@ -29,7 +29,7 @@
 {
 namespace
 {
-constexpr ActFpImplParams Fp32Params = { static_cast<float>(1e-24), 4 };
+constexpr ActFpImplParams Fp32Params = {static_cast<float>(1e-24), 4};
 } // namespace
 void neon_fp32_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
diff --git a/src/cpu/kernels/activation/generic/neon/impl.h b/src/cpu/kernels/activation/generic/neon/impl.h
index 05885d8..afeb6f7 100644
--- a/src/cpu/kernels/activation/generic/neon/impl.h
+++ b/src/cpu/kernels/activation/generic/neon/impl.h
@@ -24,6 +24,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 namespace arm_compute
 {
@@ -56,10 +57,14 @@
 #endif /* __aarch64__ */
 
 template <typename T, const ActFpImplParams &P>
-void fp_neon_activation_impl(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void fp_neon_activation_impl(const ITensor             *src,
+                             ITensor                   *dst,
+                             const ActivationLayerInfo &act_info,
+                             const Window              &window)
 {
     /** SIMD vector tag type. */
-    using ExactTagType                                           = typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
+    using ExactTagType =
+        typename arm_compute::wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
     constexpr int                                 window_step_x  = P.step_x;
     const auto                                    window_start_x = static_cast<int>(window.x().start());
     const auto                                    window_end_x   = static_cast<int>(window.x().end());
@@ -72,12 +77,12 @@
     // to prevent NAN values caused by zeros in inputs to SQRT.
     // In case of aarch64, we call vsqrt directly, so we don't use delta.
 #ifndef __aarch64__
-    const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType {});
+    const auto delta = wrapper::vdup_n(static_cast<T>(P.delta), ExactTagType{});
 #else  /* #ifndef __aarch64__ */
-    const auto const_inv_2      = wrapper::vdup_n(static_cast<T>(0.5f), ExactTagType {});
+    const auto const_inv_2      = wrapper::vdup_n(static_cast<T>(0.5f), ExactTagType{});
     const auto const_inv_sqrt_2 = wrapper::vdup_n(static_cast<T>(0.70710678118f), ExactTagType{});
 #endif /* __aarch64__ */
-    const auto      const_1           = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType {});
+    const auto      const_1           = wrapper::vdup_n(static_cast<T>(1.f), ExactTagType{});
     const auto      const_0           = wrapper::vdup_n(static_cast<T>(0.f), ExactTagType{});
     const auto      const_6           = wrapper::vdup_n(static_cast<T>(6.f), ExactTagType{});
     const auto      const_3           = wrapper::vdup_n(static_cast<T>(3.f), ExactTagType{});
@@ -88,143 +93,154 @@
     const auto      vb                = wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{});
     const auto      a                 = static_cast<T>(act_info.a());
     const auto      b                 = static_cast<T>(act_info.b());
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-        wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
-        // Compute S elements per iteration
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            const auto vin = wrapper::vloadq(input_ptr + x);
-            switch(act)
+            const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<T *>(output.ptr());
+            wrapper::traits::neon_bitvector_t<T, wrapper::traits::BitWidth::W128> tmp;
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                case ActivationLayerInfo::ActivationFunction::ABS:
-                    tmp = wrapper::vabs(vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LINEAR:
-                    tmp = wrapper::vmla(vb, va, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LOGISTIC:
-                    tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::RELU:
-                    tmp = wrapper::vmax(const_0, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
-                    tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
-                    tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
-                    tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
-                    tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin, wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::ELU:
-                    tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin, wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SQRT:
+                const auto vin = wrapper::vloadq(input_ptr + x);
+                switch (act)
+                {
+                    case ActivationLayerInfo::ActivationFunction::ABS:
+                        tmp = wrapper::vabs(vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LINEAR:
+                        tmp = wrapper::vmla(vb, va, vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+                        tmp = wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(vin))));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::RELU:
+                        tmp = wrapper::vmax(const_0, vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+                        tmp = wrapper::vmin(va, wrapper::vmax(const_0, vin));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+                        tmp = wrapper::vmin(va, wrapper::vmax(vb, vin));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+                        tmp = wrapper::vbsl(wrapper::vcgt(vin, const_0), vin, wrapper::vmul(va, vin));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+                        tmp = wrapper::vbsl(wrapper::vcgt(vin, vsoft_relu_thresh), vin,
+                                            wrapper::vlog(wrapper::vadd(const_1, wrapper::vexpq(vin))));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::ELU:
+                        tmp = wrapper::vbsl(wrapper::vcge(vin, const_0), vin,
+                                            wrapper::vmul(va, wrapper::vsub(wrapper::vexpq(vin), const_1)));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SQRT:
 #ifdef __aarch64__
-                    tmp = wrapper::vsqrt(vin);
+                        tmp = wrapper::vsqrt(vin);
 #else  /* __aarch64__ */
                     {
                         const auto bitmask = wrapper::vceq(vin, wrapper::vdup_n(0.f, ExactTagType{}));
-                        tmp                 = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
-                        tmp                 = mask_float_vector(tmp, wrapper::vnot(bitmask));
+                        tmp = wrapper::vinv(wrapper::vinvsqrt(wrapper::vadd(vin, mask_float_vector(delta, bitmask))));
+                        tmp = mask_float_vector(tmp, wrapper::vnot(bitmask));
                     }
 #endif /* __aarch64__ */
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SQUARE:
-                    tmp = wrapper::vmul(vin, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::TANH:
-                    tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::IDENTITY:
-                    tmp = vin;
-                    break;
-                case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
-                    tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_6, wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SWISH:
-                    tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd(const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin))))));
-                    break;
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SQUARE:
+                        tmp = wrapper::vmul(vin, vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::TANH:
+                        tmp = wrapper::vmul(va, wrapper::vtanh(wrapper::vmul(vb, vin)));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::IDENTITY:
+                        tmp = vin;
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+                        tmp = wrapper::vmul(
+                            vin,
+                            wrapper::vmul(const_inv_6,
+                                          wrapper::vmin(const_6, wrapper::vmax(const_0, wrapper::vadd(vin, const_3)))));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SWISH:
+                        tmp = wrapper::vmul(vin, wrapper::vinv(wrapper::vadd(
+                                                     const_1, wrapper::vexpq(wrapper::vneg(wrapper::vmul(va, vin))))));
+                        break;
 #ifdef __aarch64__
-                case ActivationLayerInfo::ActivationFunction::GELU:
-                    tmp = wrapper::vmul(vin, wrapper::vmul(const_inv_2, wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2)))));
-                    break;
+                    case ActivationLayerInfo::ActivationFunction::GELU:
+                        tmp = wrapper::vmul(
+                            vin,
+                            wrapper::vmul(const_inv_2,
+                                          wrapper::vadd(const_1, wrapper::verf(wrapper::vmul(vin, const_inv_sqrt_2)))));
+                        break;
 #endif /* __aarch64__ */
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported activation function");
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported activation function");
+                }
+                wrapper::vstore(output_ptr + x, tmp);
             }
-            wrapper::vstore(output_ptr + x, tmp);
-        }
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            const T in = *(reinterpret_cast<const T *>(input_ptr + x));
-            T       tmp;
-            switch(act)
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
             {
-                case ActivationLayerInfo::ActivationFunction::ABS:
-                    tmp = std::abs(in);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LINEAR:
-                    tmp = a * in + b;
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LOGISTIC:
-                    tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::RELU:
-                    tmp = std::max<T>(static_cast<T>(0), in);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
-                    tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
-                    tmp = std::min<T>(a, std::max<T>(b, in));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
-                    tmp = (in > 0) ? in : a * in;
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
-                    tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::ELU:
-                    tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SQRT:
-                    tmp = std::sqrt(in);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SQUARE:
-                    tmp = in * in;
-                    break;
-                case ActivationLayerInfo::ActivationFunction::TANH:
-                    tmp = a * std::tanh(b * in);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::IDENTITY:
-                    tmp = in;
-                    break;
-                case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
-                    tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SWISH:
-                    tmp = in / (static_cast<T>(1) + std::exp(-a * in));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::GELU:
-                    tmp = in * static_cast<T>(0.5f * (1.0f + erff(static_cast<float>(in) / 1.41421356237f)));
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported activation function");
+                const T in = *(reinterpret_cast<const T *>(input_ptr + x));
+                T       tmp;
+                switch (act)
+                {
+                    case ActivationLayerInfo::ActivationFunction::ABS:
+                        tmp = std::abs(in);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LINEAR:
+                        tmp = a * in + b;
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+                        tmp = static_cast<T>(1) / (static_cast<T>(1) + std::exp(-in));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::RELU:
+                        tmp = std::max<T>(static_cast<T>(0), in);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+                        tmp = std::min<T>(a, std::max(static_cast<T>(0), in));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+                        tmp = std::min<T>(a, std::max<T>(b, in));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+                        tmp = (in > 0) ? in : a * in;
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+                        tmp = (in > soft_relu_thresh) ? in : std::log(static_cast<T>(1) + std::exp(in));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::ELU:
+                        tmp = (in >= 0) ? in : a * (std::exp(in) - 1);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SQRT:
+                        tmp = std::sqrt(in);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SQUARE:
+                        tmp = in * in;
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::TANH:
+                        tmp = a * std::tanh(b * in);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::IDENTITY:
+                        tmp = in;
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+                        tmp = in * ((std::min(std::max((in + 3), 0.0f), 6.0f)) * 0.166666667f);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SWISH:
+                        tmp = in / (static_cast<T>(1) + std::exp(-a * in));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::GELU:
+                        tmp = in * static_cast<T>(0.5f * (1.0f + erff(static_cast<float>(in) / 1.41421356237f)));
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported activation function");
+                }
+                *(output_ptr + x) = tmp;
             }
-            *(output_ptr + x) = tmp;
-        }
-    },
-    input, output);
+        },
+        input, output);
 }
 } // namespace cpu
 } // namespace arm_compute
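fp_neon_activation_impl, reflowed above, follows the usual NEON loop shape: a main loop consuming P.step_x lanes per iteration (one 128-bit register), then a scalar loop for the leftover elements past the last full vector. A stripped-down sketch of the same structure over plain arrays, RELU only, with direct intrinsics standing in for the wrapper:: layer (assumed helper, aarch64 target):

    #include <algorithm>
    #include <arm_neon.h>

    void relu_f32(const float *src, float *dst, int n)
    {
        constexpr int     step = 4; // 128-bit register / 32-bit lanes
        const float32x4_t zero = vdupq_n_f32(0.f);
        int               x    = 0;
        for (; x <= n - step; x += step) // vectorized body
        {
            vst1q_f32(dst + x, vmaxq_f32(zero, vld1q_f32(src + x)));
        }
        for (; x < n; ++x) // scalar leftovers
        {
            dst[x] = std::max(0.f, src[x]);
        }
    }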
diff --git a/src/cpu/kernels/activation/generic/neon/lut.cpp b/src/cpu/kernels/activation/generic/neon/lut.cpp
index c973e96..f289c80 100644
--- a/src/cpu/kernels/activation/generic/neon/lut.cpp
+++ b/src/cpu/kernels/activation/generic/neon/lut.cpp
@@ -24,6 +24,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/cpu/kernels/lut/list.h"
 
 namespace arm_compute
@@ -33,19 +34,22 @@
 #ifdef __aarch64__
 void neon_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
-    ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 &&
+                         src->info()->data_type() != DataType::QASYMM8_SIGNED);
     const auto window_end_x  = window.x().end();
     Window     win_collapsed = window.collapse_if_possible(window, Window::DimZ);
     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
     Iterator input(src, win_collapsed);
     Iterator output(dst, win_collapsed);
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
-        auto       output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-        lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
-    },
-    input, output);
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
+        {
+            const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
+            auto       output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+            lut_u8_neon(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
+        },
+        input, output);
 }
 #endif // __aarch64__
 } // namespace cpu
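neon_q8_activation_lut works because a QASYMM8 or QASYMM8_SIGNED tensor has only 256 representable values, so any activation reduces to a precomputed 256-entry table (act_info.lut()) applied per byte; lut_u8_neon vectorizes that lookup. A scalar model of the same idea, as a sketch:

    #include <cstddef>
    #include <cstdint>

    void lut_u8_scalar(const uint8_t (&lut)[256], const uint8_t *src, uint8_t *dst, size_t n)
    {
        for (size_t i = 0; i < n; ++i)
        {
            dst[i] = lut[src[i]]; // one table lookup per element
        }
    }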
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
index e7c146e..1451301 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8.cpp
@@ -25,6 +25,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
@@ -38,7 +39,10 @@
 {
 namespace cpu
 {
-void neon_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qasymm8_activation(const ITensor             *src,
+                             ITensor                   *dst,
+                             const ActivationLayerInfo &act_info,
+                             const Window              &window)
 {
     constexpr int                                 window_step_x  = 16;
     const auto                                    window_start_x = static_cast<int>(window.x().start());
@@ -85,206 +89,222 @@
     float32x4_t vs = vdupq_n_f32(s);
     float32x4_t vo = vdupq_n_f32(o);
 
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const qasymm8_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
-
-        wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
-
-        // Compute S elements per iteration
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            const auto vin = wrapper::vloadq(input_ptr + x);
-            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            const auto input_ptr  = reinterpret_cast<const qasymm8_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<qasymm8_t *>(output.ptr());
+
+            wrapper::traits::neon_bitvector_t<qasymm8_t, wrapper::traits::BitWidth::W128> tmp;
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                // Perform activation
-                tmp = vmaxq_u8(vconst_0, vin);
-                // Re-quantize to new output space
-                tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
-            {
-                // Perform activation
-                tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
-                // Re-quantize to new output space
-                tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-            {
-                // Perform activation
-                tmp = vminq_u8(va, vmaxq_u8(vb, vin));
-                // Re-quantize to new output space
-                tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
-            }
-#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
-            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
-            {
-                // De-quantize
-                const auto vin_deq = vdequantize(vin, qi_in);
-                // Perform activation
-                const float32x4x4_t tmp_dep =
+                const auto vin = wrapper::vloadq(input_ptr + x);
+                if (act == ActivationLayerInfo::ActivationFunction::RELU)
                 {
-                    {
+                    // Perform activation
+                    tmp = vmaxq_u8(vconst_0, vin);
+                    // Re-quantize to new output space
+                    tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                {
+                    // Perform activation
+                    tmp = vminq_u8(va, vmaxq_u8(vconst_0, vin));
+                    // Re-quantize to new output space
+                    tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                {
+                    // Perform activation
+                    tmp = vminq_u8(va, vmaxq_u8(vb, vin));
+                    // Re-quantize to new output space
+                    tmp = vmlaq_qasymm8<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+                }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+                else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+                {
+                    // De-quantize
+                    const auto vin_deq = vdequantize(vin, qi_in);
+                    // Perform activation
+                    const float32x4x4_t tmp_dep = {{
                         wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
                         wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
                         wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
                         wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
-                    }
-                };
-                // Re-quantize to new output space
-                tmp = vquantize(tmp_dep, qi_out);
-            }
+                    }};
+                    // Re-quantize to new output space
+                    tmp = vquantize(tmp_dep, qi_out);
+                }
 #endif // __aarch64__
-            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
-            {
-                // De-quantize
-                const auto vin_deq = vdequantize(vin, qi_in);
-                // Perform activation
-                const float32x4x4_t tmp_dep =
+                else if (act == ActivationLayerInfo::ActivationFunction::TANH)
                 {
-                    {
+                    // De-quantize
+                    const auto vin_deq = vdequantize(vin, qi_in);
+                    // Perform activation
+                    const float32x4x4_t tmp_dep = {{
                         wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
                         wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
                         wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
                         wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
-                    }
-                };
-                // Re-quantize to new output space
-                tmp = vquantize(tmp_dep, qi_out);
-            }
+                    }};
+                    // Re-quantize to new output space
+                    tmp = vquantize(tmp_dep, qi_out);
+                }
 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
-            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
-            {
-                // De-quantize
-                const auto vin_deq = vdequantize(vin, qi_in);
-                // Perform activation
-                const float32x4x4_t tmp_dep =
+                else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
                 {
-                    {
-                        wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
-                        wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
-                        wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
-                        wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
-                    }
-                };
-                // Re-quantize to new output space
-                tmp = vquantize(tmp_dep, qi_out);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
-            {
-                const auto vin_deq = vdequantize(vin, qi_in);
+                    // De-quantize
+                    const auto vin_deq = vdequantize(vin, qi_in);
+                    // Perform activation
+                    const float32x4x4_t tmp_dep = {{
+                        wrapper::vmul(
+                            vin_deq.val[0],
+                            wrapper::vmul(
+                                const_inv_6_f32,
+                                wrapper::vmin(const_6_f32,
+                                              wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+                        wrapper::vmul(
+                            vin_deq.val[1],
+                            wrapper::vmul(
+                                const_inv_6_f32,
+                                wrapper::vmin(const_6_f32,
+                                              wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+                        wrapper::vmul(
+                            vin_deq.val[2],
+                            wrapper::vmul(
+                                const_inv_6_f32,
+                                wrapper::vmin(const_6_f32,
+                                              wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+                        wrapper::vmul(
+                            vin_deq.val[3],
+                            wrapper::vmul(
+                                const_inv_6_f32,
+                                wrapper::vmin(const_6_f32,
+                                              wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+                    }};
+                    // Re-quantize to new output space
+                    tmp = vquantize(tmp_dep, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+                {
+                    const auto vin_deq = vdequantize(vin, qi_in);
 
-                const uint32x4x4_t pos_mask =
-                {
-                    {
+                    const uint32x4x4_t pos_mask = {{
                         wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
                         wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
                         wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
                         wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
-                    }
-                };
+                    }};
 
-                const float32x4x4_t tmp_dep =
-                {
-                    {
+                    const float32x4x4_t tmp_dep = {{
                         wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
                         wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
                         wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
                         wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
-                    }
-                };
+                    }};
 
-                tmp = vquantize(tmp_dep, qi_out);
-            }
+                    tmp = vquantize(tmp_dep, qi_out);
+                }
 #else  // #ifndef __aarch64__
-            else if (act == ActivationLayerInfo::ActivationFunction::GELU)
-            {
-                const auto vin_deq = vdequantize(vin, qi_in);
-                // Perform activation
-                const float32x4x4_t tmp_dep =
+                else if (act == ActivationLayerInfo::ActivationFunction::GELU)
                 {
-                    {
-                        wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[0], const_inv_sqrt_2))))),
-                        wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[1], const_inv_sqrt_2))))),
-                        wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[2], const_inv_sqrt_2))))),
-                        wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_2, wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(vin_deq.val[3], const_inv_sqrt_2))))),
-                    }
-                };
-                // Re-quantize to new output space
-                tmp = vquantize(tmp_dep, qi_out);
-            }
+                    const auto vin_deq = vdequantize(vin, qi_in);
+                    // Perform activation
+                    const float32x4x4_t tmp_dep = {{
+                        wrapper::vmul(vin_deq.val[0],
+                                      wrapper::vmul(const_inv_2,
+                                                    wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+                                                                                vin_deq.val[0], const_inv_sqrt_2))))),
+                        wrapper::vmul(vin_deq.val[1],
+                                      wrapper::vmul(const_inv_2,
+                                                    wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+                                                                                vin_deq.val[1], const_inv_sqrt_2))))),
+                        wrapper::vmul(vin_deq.val[2],
+                                      wrapper::vmul(const_inv_2,
+                                                    wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+                                                                                vin_deq.val[2], const_inv_sqrt_2))))),
+                        wrapper::vmul(vin_deq.val[3],
+                                      wrapper::vmul(const_inv_2,
+                                                    wrapper::vadd(vconst_1, wrapper::verf(wrapper::vmul(
+                                                                                vin_deq.val[3], const_inv_sqrt_2))))),
+                    }};
+                    // Re-quantize to new output space
+                    tmp = vquantize(tmp_dep, qi_out);
+                }
 #endif // __aarch64__
-            else
-            {
-                ARM_COMPUTE_ERROR("Unsupported activation function");
+                else
+                {
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+                }
+                wrapper::vstore(output_ptr + x, tmp);
             }
-            wrapper::vstore(output_ptr + x, tmp);
-        }
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            qasymm8_t in  = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
-            qasymm8_t tmp = 0;
-            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
             {
-                tmp = std::max(const_0, in);
-                tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
-            {
-                tmp = std::min(a, std::max(const_0, in));
-                tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-            {
-                tmp = std::min(a, std::max(b, in));
-                tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
-            }
+                qasymm8_t in  = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+                qasymm8_t tmp = 0;
+                if (act == ActivationLayerInfo::ActivationFunction::RELU)
+                {
+                    tmp = std::max(const_0, in);
+                    tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                {
+                    tmp = std::min(a, std::max(const_0, in));
+                    tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                {
+                    tmp = std::min(a, std::max(b, in));
+                    tmp = utility::clamp<int32_t, qasymm8_t>(support::cpp11::lround(tmp * s + o));
+                }
 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
-            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
-            {
-                float tmp_f = dequantize_qasymm8(in, qi_in);
-                tmp_f       = 1.f / (1.f + std::exp(-tmp_f));
-                tmp         = quantize_qasymm8(tmp_f, qi_out);
-            }
+                else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+                {
+                    float tmp_f = dequantize_qasymm8(in, qi_in);
+                    tmp_f       = 1.f / (1.f + std::exp(-tmp_f));
+                    tmp         = quantize_qasymm8(tmp_f, qi_out);
+                }
 #endif // __aarch64__
-            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
-            {
-                float tmp_f = dequantize_qasymm8(in, qi_in);
-                tmp_f       = a_f32 * std::tanh(b_f32 * tmp_f);
-                tmp         = quantize_qasymm8(tmp_f, qi_out);
-            }
+                else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+                {
+                    float tmp_f = dequantize_qasymm8(in, qi_in);
+                    tmp_f       = a_f32 * std::tanh(b_f32 * tmp_f);
+                    tmp         = quantize_qasymm8(tmp_f, qi_out);
+                }
 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
-            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
-            {
-                float tmp_f = dequantize_qasymm8(in, qi_in);
-                tmp_f       = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
-                tmp         = quantize_qasymm8(tmp_f, qi_out);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
-            {
-                float tmp_f = dequantize_qasymm8(in, qi_in);
-                tmp_f       = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
-                tmp         = quantize_qasymm8(tmp_f, qi_out);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::GELU)
-            {
-                float tmp_f = dequantize_qasymm8(in, qi_in);
-                tmp         = tmp_f * 0.5f * (1.0f + std::erff(in / 1.41421356237f));
-                tmp         = quantize_qasymm8(tmp_f, qi_out);
-            }
+                else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+                {
+                    float tmp_f = dequantize_qasymm8(in, qi_in);
+                    tmp_f       = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+                    tmp         = quantize_qasymm8(tmp_f, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+                {
+                    float tmp_f = dequantize_qasymm8(in, qi_in);
+                    tmp_f       = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+                    tmp         = quantize_qasymm8(tmp_f, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::GELU)
+                {
+                    float tmp_f = dequantize_qasymm8(in, qi_in);
+                    tmp_f       = tmp_f * 0.5f * (1.0f + std::erff(tmp_f / 1.41421356237f));
+                    tmp         = quantize_qasymm8(tmp_f, qi_out);
+                }
 #endif // __aarch64__
-            else
-            {
-                ARM_COMPUTE_ERROR("Unsupported activation function");
+                else
+                {
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+                }
+                *(output_ptr + x) = tmp;
             }
-            *(output_ptr + x) = tmp;
-        }
-    },
-    input, output);
+        },
+        input, output);
 }
 } // namespace cpu
 } // namespace arm_compute
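The non-ReLU branches above all follow the same round trip: de-quantize with the input's (scale, offset), evaluate the activation in fp32, then re-quantize with the output's parameters; the ReLU family instead stays in the 8-bit domain and re-quantizes with the fused multiply-add vmlaq_qasymm8(tmp, vs, vo). A scalar sketch of that round trip under the standard asymmetric model f = scale * (q - offset); QParams is a stand-in for the library's qi_in/qi_out type, not its actual definition:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    struct QParams
    {
        float   scale;
        int32_t offset;
    };

    uint8_t tanh_qasymm8_scalar(uint8_t q, QParams in, QParams out, float a, float b)
    {
        const float   f   = in.scale * (static_cast<int32_t>(q) - in.offset); // de-quantize
        const float   act = a * std::tanh(b * f);                             // activation in fp32
        const int32_t r   = static_cast<int32_t>(std::lround(act / out.scale)) + out.offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, r)));           // re-quantize and clamp
    }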
diff --git a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
index 52c3964..a2f5882 100644
--- a/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qasymm8_signed.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
@@ -36,7 +37,10 @@
 {
 namespace cpu
 {
-void neon_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qasymm8_signed_activation(const ITensor             *src,
+                                    ITensor                   *dst,
+                                    const ActivationLayerInfo &act_info,
+                                    const Window              &window)
 {
     constexpr int                                 window_step_x  = 16;
     const auto                                    window_start_x = static_cast<int>(window.x().start());
@@ -76,191 +80,195 @@
     float32x4_t vs = vdupq_n_f32(s);
     float32x4_t vo = vdupq_n_f32(o);
 
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr());
-
-        wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp;
-
-        // Compute S elements per iteration
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            const auto vin = wrapper::vloadq(input_ptr + x);
-            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            const auto input_ptr  = reinterpret_cast<const qasymm8_signed_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<qasymm8_signed_t *>(output.ptr());
+
+            wrapper::traits::neon_bitvector_t<qasymm8_signed_t, wrapper::traits::BitWidth::W128> tmp;
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                // Perform activation
-                tmp = vmaxq_s8(vconst_0, vin);
-                // Re-quantize to new output space
-                tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
-            {
-                // Perform activation
-                tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
-                // Re-quantize to new output space
-                tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-            {
-                // Perform activation
-                tmp = vminq_s8(va, vmaxq_s8(vb, vin));
-                // Re-quantize to new output space
-                tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
-            }
-#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
-            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
-            {
-                // De-quantize
-                const auto vin_deq = vdequantize(vin, qi_in);
-                // Perform activation
-                const float32x4x4_t tmp_dep =
+                const auto vin = wrapper::vloadq(input_ptr + x);
+                if (act == ActivationLayerInfo::ActivationFunction::RELU)
                 {
-                    {
+                    // Perform activation
+                    tmp = vmaxq_s8(vconst_0, vin);
+                    // Re-quantize to new output space
+                    tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                {
+                    // Perform activation
+                    tmp = vminq_s8(va, vmaxq_s8(vconst_0, vin));
+                    // Re-quantize to new output space
+                    tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                {
+                    // Perform activation
+                    tmp = vminq_s8(va, vmaxq_s8(vb, vin));
+                    // Re-quantize to new output space
+                    tmp = vmlaq_qasymm8_signed<RoundingPolicy::TO_NEAREST_UP>(tmp, vs, vo);
+                }
+#ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
+                else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+                {
+                    // De-quantize
+                    const auto vin_deq = vdequantize(vin, qi_in);
+                    // Perform activation
+                    const float32x4x4_t tmp_dep = {{
                         wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
                         wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
                         wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[2])))),
                         wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[3])))),
-                    }
-                };
-                // Re-quantize to new output space
-                tmp = vquantize_signed(tmp_dep, qi_out);
-            }
+                    }};
+                    // Re-quantize to new output space
+                    tmp = vquantize_signed(tmp_dep, qi_out);
+                }
 #endif // __aarch64__
-            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
-            {
-                // De-quantize
-                const auto vin_deq = vdequantize(vin, qi_in);
-                // Perform activation
-                const float32x4x4_t tmp_dep =
+                else if (act == ActivationLayerInfo::ActivationFunction::TANH)
                 {
-                    {
+                    // De-quantize
+                    const auto vin_deq = vdequantize(vin, qi_in);
+                    // Perform activation
+                    const float32x4x4_t tmp_dep = {{
                         wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
                         wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
                         wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[2], vb_f32))),
                         wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[3], vb_f32))),
-                    }
-                };
-                // Re-quantize to new output space
-                tmp = vquantize_signed(tmp_dep, qi_out);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
-            {
-                // De-quantize
-                const auto vin_deq = vdequantize(vin, qi_in);
-                // Perform activation
-                const float32x4x4_t tmp_dep =
+                    }};
+                    // Re-quantize to new output space
+                    tmp = vquantize_signed(tmp_dep, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
                 {
-                    {
-                        wrapper::vmul(vin_deq.val[0], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
-                        wrapper::vmul(vin_deq.val[1], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
-                        wrapper::vmul(vin_deq.val[2], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
-                        wrapper::vmul(vin_deq.val[3], wrapper::vmul(const_inv_6_f32, wrapper::vmin(const_6_f32, wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
-                    }
-                };
-                // Re-quantize to new output space
-                tmp = vquantize_signed(tmp_dep, qi_out);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
-            {
-                const auto vin_deq = vdequantize(vin, qi_in);
+                    // De-quantize
+                    const auto vin_deq = vdequantize(vin, qi_in);
+                    // Perform activation
+                    const float32x4x4_t tmp_dep = {{
+                        wrapper::vmul(
+                            vin_deq.val[0],
+                            wrapper::vmul(
+                                const_inv_6_f32,
+                                wrapper::vmin(const_6_f32,
+                                              wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[0], const_3_f32))))),
+                        wrapper::vmul(
+                            vin_deq.val[1],
+                            wrapper::vmul(
+                                const_inv_6_f32,
+                                wrapper::vmin(const_6_f32,
+                                              wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[1], const_3_f32))))),
+                        wrapper::vmul(
+                            vin_deq.val[2],
+                            wrapper::vmul(
+                                const_inv_6_f32,
+                                wrapper::vmin(const_6_f32,
+                                              wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[2], const_3_f32))))),
+                        wrapper::vmul(
+                            vin_deq.val[3],
+                            wrapper::vmul(
+                                const_inv_6_f32,
+                                wrapper::vmin(const_6_f32,
+                                              wrapper::vmax(const_0_f32, wrapper::vadd(vin_deq.val[3], const_3_f32))))),
+                    }};
+                    // Re-quantize to new output space
+                    tmp = vquantize_signed(tmp_dep, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+                {
+                    const auto vin_deq = vdequantize(vin, qi_in);
 
 #ifdef __aarch64__
-                const uint32x4x4_t pos_mask =
-                {
-                    {
+                    const uint32x4x4_t pos_mask = {{
                         wrapper::vcgtz(vin_deq.val[0]),
                         wrapper::vcgtz(vin_deq.val[1]),
                         wrapper::vcgtz(vin_deq.val[2]),
                         wrapper::vcgtz(vin_deq.val[3]),
-                    }
-                };
+                    }};
 #else  // __aarch64__
-                const uint32x4x4_t pos_mask =
-                {
-                    {
+                    const uint32x4x4_t pos_mask = {{
                         wrapper::vcgt(vin_deq.val[0], vconst_0_f32),
                         wrapper::vcgt(vin_deq.val[1], vconst_0_f32),
                         wrapper::vcgt(vin_deq.val[2], vconst_0_f32),
                         wrapper::vcgt(vin_deq.val[3], vconst_0_f32),
-                    }
-                };
+                    }};
 #endif // __aarch64__
 
-                const float32x4x4_t tmp_dep =
-                {
-                    {
+                    const float32x4x4_t tmp_dep = {{
                         wrapper::vbsl(pos_mask.val[0], vin_deq.val[0], wrapper::vmul(va_f32, vin_deq.val[0])),
                         wrapper::vbsl(pos_mask.val[1], vin_deq.val[1], wrapper::vmul(va_f32, vin_deq.val[1])),
                         wrapper::vbsl(pos_mask.val[2], vin_deq.val[2], wrapper::vmul(va_f32, vin_deq.val[2])),
                         wrapper::vbsl(pos_mask.val[3], vin_deq.val[3], wrapper::vmul(va_f32, vin_deq.val[3])),
-                    }
-                };
+                    }};
 
-                tmp = vquantize_signed(tmp_dep, qi_out);
+                    tmp = vquantize_signed(tmp_dep, qi_out);
+                }
+                else
+                {
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+                }
+                wrapper::vstore(output_ptr + x, tmp);
             }
-            else
-            {
-                ARM_COMPUTE_ERROR("Unsupported activation function");
-            }
-            wrapper::vstore(output_ptr + x, tmp);
-        }
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            qasymm8_signed_t in  = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
-            qasymm8_signed_t tmp = 0;
-            if(act == ActivationLayerInfo::ActivationFunction::RELU)
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
             {
-                tmp = std::max(const_0, in);
-                tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
-            {
-                tmp = std::min(a, std::max(const_0, in));
-                tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-            {
-                tmp = std::min(a, std::max(b, in));
-                tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
-            }
+                qasymm8_signed_t in  = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+                qasymm8_signed_t tmp = 0;
+                if (act == ActivationLayerInfo::ActivationFunction::RELU)
+                {
+                    tmp = std::max(const_0, in);
+                    tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                {
+                    tmp = std::min(a, std::max(const_0, in));
+                    tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                {
+                    tmp = std::min(a, std::max(b, in));
+                    tmp = utility::clamp<int32_t, qasymm8_signed_t>(support::cpp11::lround(tmp * s + o));
+                }
 #ifndef __aarch64__ // LUT-based implementation is used for aarch64 instead.
-            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
-            {
-                float tmp_f = dequantize_qasymm8_signed(in, qi_in);
-                tmp_f       = 1.f / (1.f + std::exp(-tmp_f));
-                tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
-            }
+                else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+                {
+                    float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+                    tmp_f       = 1.f / (1.f + std::exp(-tmp_f));
+                    tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
+                }
 #endif // __aarch64__
-            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
-            {
-                float tmp_f = dequantize_qasymm8_signed(in, qi_in);
-                tmp_f       = a_f32 * std::tanh(b_f32 * tmp_f);
-                tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
+                else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+                {
+                    float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+                    tmp_f       = a_f32 * std::tanh(b_f32 * tmp_f);
+                    tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+                {
+                    float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+                    tmp_f       = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
+                    tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+                {
+                    float tmp_f = dequantize_qasymm8_signed(in, qi_in);
+                    tmp_f       = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
+                    tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
+                }
+                else
+                {
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+                }
+                *(output_ptr + x) = tmp;
             }
-            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
-            {
-                float tmp_f = dequantize_qasymm8_signed(in, qi_in);
-                tmp_f       = tmp_f * ((std::min(std::max((tmp_f + 3), 0.0f), 6.0f)) * 0.166666667f);
-                tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
-            {
-                float tmp_f = dequantize_qasymm8_signed(in, qi_in);
-                tmp_f       = tmp_f > 0 ? tmp_f : tmp_f * a_f32;
-                tmp         = quantize_qasymm8_signed(tmp_f, qi_out);
-            }
-            else
-            {
-                ARM_COMPUTE_ERROR("Unsupported activation function");
-            }
-            *(output_ptr + x) = tmp;
-        }
-    },
-    input, output);
+        },
+        input, output);
 }
 } // namespace cpu
 } // namespace arm_compute
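
Note: the scalar HARD_SWISH leftover path above is hard swish computed on the de-quantized value, x * clamp(x + 3, 0, 6) / 6 (the kernel writes 1/6 as 0.166666667f). A minimal standalone sketch of that round-trip follows; the scale/offset values are hypothetical, chosen for illustration only.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Scalar model of the QASYMM8_SIGNED hard-swish path above.
    // All quantization parameters are hypothetical.
    int main()
    {
        const float in_scale   = 0.05f;
        const int   in_offset  = 10;
        const float out_scale  = 0.02f;
        const int   out_offset = -5;

        const int8_t q_in = 42;

        // De-quantize: x = (q - offset) * scale
        float x = (static_cast<int>(q_in) - in_offset) * in_scale;

        // Hard swish, matching the kernel's "* 0.166666667f" form.
        x = x * (std::min(std::max(x + 3.0f, 0.0f), 6.0f) * 0.166666667f);

        // Re-quantize and saturate to the int8 range.
        int q_out = static_cast<int>(std::lround(x / out_scale)) + out_offset;
        q_out     = std::min(127, std::max(-128, q_out));

        std::printf("q_out = %d\n", q_out);
        return 0;
    }
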
diff --git a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
index 2aea6cb..891646e 100644
--- a/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
+++ b/src/cpu/kernels/activation/generic/neon/qsymm16.cpp
@@ -21,11 +21,12 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/NESymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
@@ -38,7 +39,10 @@
 {
 namespace cpu
 {
-void neon_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void neon_qsymm16_activation(const ITensor             *src,
+                             ITensor                   *dst,
+                             const ActivationLayerInfo &act_info,
+                             const Window              &window)
 {
     constexpr int                                 window_step_x  = 8;
     const auto                                    window_start_x = static_cast<int>(window.x().start());
@@ -59,103 +63,94 @@
     const float                   a_f32    = act_info.a();
     const float                   b_f32    = act_info.b();
 
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const qsymm16_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());
-
-        wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp;
-        ARM_COMPUTE_UNUSED(tmp);
-
-        // Compute S elements per iteration
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            const auto vin = wrapper::vloadq(input_ptr + x);
-            if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            const auto input_ptr  = reinterpret_cast<const qsymm16_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<qsymm16_t *>(output.ptr());
+
+            wrapper::traits::neon_bitvector_t<qsymm16_t, wrapper::traits::BitWidth::W128> tmp;
+            ARM_COMPUTE_UNUSED(tmp);
+
+            // Compute S elements per iteration
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                // De-quantize
-                const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
-                // Perform activation
-                const float32x4x2_t tmp_dep =
+                const auto vin = wrapper::vloadq(input_ptr + x);
+                if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
                 {
-                    {
+                    // De-quantize
+                    const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+                    // Perform activation
+                    const float32x4x2_t tmp_dep = {{
                         wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[0])))),
                         wrapper::vdiv(vconst_1, wrapper::vadd(vconst_1, wrapper::vexpq(wrapper::vneg(vin_deq.val[1])))),
-                    }
-                };
-                // Re-quantize to new output space
-                tmp = vquantize_int16(tmp_dep, qi_out.scale);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
-            {
-                // De-quantize
-                const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
-                // Perform activation
-                const float32x4x2_t tmp_dep =
+                    }};
+                    // Re-quantize to new output space
+                    tmp = vquantize_int16(tmp_dep, qi_out.scale);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::TANH)
                 {
-                    {
+                    // De-quantize
+                    const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+                    // Perform activation
+                    const float32x4x2_t tmp_dep = {{
                         wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[0], vb_f32))),
                         wrapper::vmul(va_f32, wrapper::vtanh(wrapper::vmul(vin_deq.val[1], vb_f32))),
-                    }
-                };
-                // Re-quantize to new output space
-                tmp = vquantize_int16(tmp_dep, qi_out.scale);
-            }
+                    }};
+                    // Re-quantize to new output space
+                    tmp = vquantize_int16(tmp_dep, qi_out.scale);
+                }
 
-            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-            {
-                // De-quantize
-                const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
-                // Perform activation
-                const float32x4x2_t tmp_dep =
+                else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
                 {
-                    {
-                        wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])),
-                        wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1]))
-                    }
-                };
-                // Re-quantize to new output space
-                tmp = vquantize_int16(tmp_dep, qi_out.scale);
+                    // De-quantize
+                    const auto vin_deq = vdequantize_int16(vin, qi_in.scale);
+                    // Perform activation
+                    const float32x4x2_t tmp_dep = {{wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[0])),
+                                                    wrapper::vmin(va_f32, wrapper::vmax(vb_f32, vin_deq.val[1]))}};
+                    // Re-quantize to new output space
+                    tmp = vquantize_int16(tmp_dep, qi_out.scale);
+                }
+                else
+                {
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+                }
+                wrapper::vstore(output_ptr + x, tmp);
             }
-            else
-            {
-                ARM_COMPUTE_ERROR("Unsupported activation function");
-            }
-            wrapper::vstore(output_ptr + x, tmp);
-        }
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            qsymm16_t in  = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x));
-            qsymm16_t tmp = 0;
-            if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
             {
-                float tmp_f = dequantize_qsymm16(in, qi_in.scale);
-                tmp_f       = 1.f / (1.f + std::exp(-tmp_f));
-                tmp         = quantize_qsymm16(tmp_f, qi_out);
+                qsymm16_t in  = *(reinterpret_cast<const qsymm16_t *>(input_ptr + x));
+                qsymm16_t tmp = 0;
+                if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+                {
+                    float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+                    tmp_f       = 1.f / (1.f + std::exp(-tmp_f));
+                    tmp         = quantize_qsymm16(tmp_f, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+                {
+                    float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+                    tmp_f       = a_f32 * std::tanh(b_f32 * tmp_f);
+                    tmp         = quantize_qsymm16(tmp_f, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                {
+                    float tmp_f = dequantize_qsymm16(in, qi_in.scale);
+                    tmp_f       = std::min<float>(a_f32, std::max<float>(b_f32, tmp_f));
+                    tmp         = quantize_qsymm16(tmp_f, qi_out);
+                }
+                else
+                {
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+                }
+                *(output_ptr + x) = tmp;
             }
-            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
-            {
-                float tmp_f = dequantize_qsymm16(in, qi_in.scale);
-                tmp_f       = a_f32 * std::tanh(b_f32 * tmp_f);
-                tmp         = quantize_qsymm16(tmp_f, qi_out);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-            {
-                float tmp_f = dequantize_qsymm16(in, qi_in.scale);
-                tmp_f       = std::min<float>(a_f32, std::max<float>(b_f32, tmp_f));
-                tmp         = quantize_qsymm16(tmp_f, qi_out);
-            }
-            else
-            {
-                ARM_COMPUTE_ERROR("Unsupported activation function");
-            }
-            *(output_ptr + x) = tmp;
-        }
-    },
-    input, output);
+        },
+        input, output);
 }
 } // namespace cpu
 } // namespace arm_compute
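
Note: QSYMM16 is a symmetric format, which is why the paths above carry only qi_in.scale / qi_out.scale and no offset: de-quantization is q * scale and re-quantization is round(x / scale) saturated to int16. A hedged scalar sketch of the LOGISTIC round-trip; the scale values below are illustrative.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Scalar model of the QSYMM16 logistic path above; scales are hypothetical.
    int main()
    {
        const float in_scale  = 1.f / 4096.f;
        const float out_scale = 1.f / 32768.f;

        const int16_t q_in = 2048;

        const float x = q_in * in_scale;            // de-quantize (no offset)
        const float y = 1.f / (1.f + std::exp(-x)); // logistic

        int q_out = static_cast<int>(std::lround(y / out_scale)); // re-quantize
        q_out     = std::min(32767, std::max(-32768, q_out));     // saturate

        std::printf("q_out = %d\n", q_out);
        return 0;
    }
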
diff --git a/src/cpu/kernels/activation/generic/sve/fp16.cpp b/src/cpu/kernels/activation/generic/sve/fp16.cpp
index 4757c60..97399e0 100644
--- a/src/cpu/kernels/activation/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/activation/generic/sve/fp16.cpp
@@ -29,12 +29,12 @@
 #include "arm_compute/core/Window.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 
+#include "src/core/NEON/SVEMath.h"
+
+#include <arm_sve.h>
 #include <cmath>
 #include <cstddef>
 
-#include "src/core/NEON/SVEMath.h"
-#include <arm_sve.h>
-
 namespace arm_compute
 {
 namespace cpu
@@ -59,77 +59,87 @@
 
     const auto va = svdup_n_f16(act_info.a());
     const auto vb = svdup_n_f16(act_info.b());
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
-
-        svfloat16_t tmp;
-
-        int      x  = window_start_x;
-        svbool_t pg = svwhilelt_b16(x, window_end_x);
-        do
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            const auto vin = svld1_f16(pg, input_ptr + x);
-            switch(act)
+            const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
+
+            svfloat16_t tmp;
+
+            int      x  = window_start_x;
+            svbool_t pg = svwhilelt_b16(x, window_end_x);
+            do
             {
-                case ActivationLayerInfo::ActivationFunction::ABS:
-                    tmp = svabs_f16_z(pg, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LINEAR:
-                    tmp = svmla_f16_z(pg, vb, va, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LOGISTIC:
-                    tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin))));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::RELU:
-                    tmp = svmax_f16_z(pg, const_0, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
-                    tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
-                    tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
-                    tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va), svmax_f16_z(pg, vin, const_0));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
-                    tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin)));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::ELU:
-                    tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin, svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1)));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SQRT:
-                    tmp = svsqrt_f16_z(pg, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SQUARE:
-                    tmp = svmul_f16_z(pg, vin, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::TANH:
-                    tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin)));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::IDENTITY:
-                    tmp = vin;
-                    break;
-                case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
-                    tmp = svmul_f16_z(pg, vin, svmul_f16_z(pg, const_inv_6, svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3)))));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SWISH:
-                    tmp = svmul_f16_z(pg, vin, svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin))))));
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported activation function");
-            }
-            svst1_f16(pg, output_ptr + x, tmp);
+                const auto vin = svld1_f16(pg, input_ptr + x);
+                switch (act)
+                {
+                    case ActivationLayerInfo::ActivationFunction::ABS:
+                        tmp = svabs_f16_z(pg, vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LINEAR:
+                        tmp = svmla_f16_z(pg, vb, va, vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+                        tmp = svinv_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, svneg_f16_z(pg, vin))));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::RELU:
+                        tmp = svmax_f16_z(pg, const_0, vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+                        tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, const_0, vin));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+                        tmp = svmin_f16_z(pg, va, svmax_f16_z(pg, vb, vin));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+                        tmp = svadd_f16_z(pg, svmul_f16_z(pg, svmin_f16_z(pg, vin, const_0), va),
+                                          svmax_f16_z(pg, vin, const_0));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+                        tmp = svlog_f16_z(pg, svadd_f16_z(pg, const_1, svexp_f16_z(pg, vin)));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::ELU:
+                        tmp = svsel_f16(svcmpgt_f16(pg, vin, const_0), vin,
+                                        svmul_f16_z(pg, va, svsub_f16_z(pg, svexp_f16_z(pg, vin), const_1)));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SQRT:
+                        tmp = svsqrt_f16_z(pg, vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SQUARE:
+                        tmp = svmul_f16_z(pg, vin, vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::TANH:
+                        tmp = svmul_f16_z(pg, va, svtanh_f16_z(pg, svmul_f16_z(pg, vb, vin)));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::IDENTITY:
+                        tmp = vin;
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+                        tmp = svmul_f16_z(
+                            pg, vin,
+                            svmul_f16_z(
+                                pg, const_inv_6,
+                                svmin_f16_z(pg, const_6, svmax_f16_z(pg, const_0, svadd_f16_z(pg, vin, const_3)))));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SWISH:
+                        tmp = svmul_f16_z(
+                            pg, vin,
+                            svinv_f16_z(pg, svadd_f16_z(pg, const_1,
+                                                        svexp_f16_z(pg, svneg_f16_z(pg, svmul_f16_z(pg, va, vin))))));
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported activation function");
+                }
+                svst1_f16(pg, output_ptr + x, tmp);
 
-            x += svcnth();
-            pg = svwhilelt_b16(x, window_end_x);
+                x += svcnth();
+                pg = svwhilelt_b16(x, window_end_x);
 
-        }
-        while(svptest_any(svptrue_b16(), pg));
-    },
-    input, output);
+            } while (svptest_any(svptrue_b16(), pg));
+        },
+        input, output);
 }
 } // namespace cpu
 } // namespace arm_compute
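
Note: the do/while shape above (svwhilelt before the loop and again at the bottom, svptest_any as the exit test) is the standard SVE predicated-tail idiom: the predicate is all-true for full vectors and masks the out-of-range lanes on the final pass, so no scalar leftover loop is needed, unlike the NEON kernels. A minimal f32 sketch of the same structure; illustrative only, and it needs an SVE-enabled toolchain.

    #include <arm_sve.h>

    // Same predicated-loop structure as the kernels above, reduced to a
    // plain ReLU over a float buffer. Illustrative sketch only.
    void relu_f32(const float *src, float *dst, int n)
    {
        const svfloat32_t zero = svdup_n_f32(0.f);

        int      x  = 0;
        svbool_t pg = svwhilelt_b32(x, n); // all-true until the tail
        do
        {
            const svfloat32_t vin = svld1_f32(pg, src + x);
            svst1_f32(pg, dst + x, svmax_f32_z(pg, zero, vin));

            x += svcntw();            // advance by the hardware vector length
            pg = svwhilelt_b32(x, n); // partial predicate on the last pass
        } while (svptest_any(svptrue_b32(), pg));
    }
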
diff --git a/src/cpu/kernels/activation/generic/sve/fp32.cpp b/src/cpu/kernels/activation/generic/sve/fp32.cpp
index 87f04c2..d1b075d 100644
--- a/src/cpu/kernels/activation/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/activation/generic/sve/fp32.cpp
@@ -26,13 +26,13 @@
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/NEON/SVEMath.h"
 
+#include <arm_sve.h>
 #include <cmath>
 #include <cstddef>
 
-#include <arm_sve.h>
-
 namespace arm_compute
 {
 namespace cpu
@@ -58,78 +58,89 @@
 
     const auto va = svdup_n_f32(act_info.a());
     const auto vb = svdup_n_f32(act_info.b());
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<float *>(output.ptr());
-
-        svfloat32_t tmp;
-
-        // Compute S elements per iteration
-        int      x  = window_start_x;
-        svbool_t pg = svwhilelt_b32(x, window_end_x);
-        do
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            const auto vin = svld1_f32(pg, input_ptr + x);
-            switch(act)
+            const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<float *>(output.ptr());
+
+            svfloat32_t tmp;
+
+            // Compute S elements per iteration
+            int      x  = window_start_x;
+            svbool_t pg = svwhilelt_b32(x, window_end_x);
+            do
             {
-                case ActivationLayerInfo::ActivationFunction::ABS:
-                    tmp = svabs_f32_z(pg, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LINEAR:
-                    tmp = svmla_f32_z(pg, vb, va, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LOGISTIC:
-                    tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin))));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::RELU:
-                    tmp = svmax_f32_z(pg, const_0, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
-                    tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
-                    tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
-                    tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va), svmax_f32_z(pg, vin, const_0));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
-                    tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin, svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::ELU:
-                    tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin, svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1)));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SQRT:
-                    tmp = svsqrt_f32_z(pg, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SQUARE:
-                    tmp = svmul_f32_z(pg, vin, vin);
-                    break;
-                case ActivationLayerInfo::ActivationFunction::TANH:
-                    tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin)));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::IDENTITY:
-                    tmp = vin;
-                    break;
-                case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
-                    tmp = svmul_f32_z(pg, vin, svmul_f32_z(pg, const_inv_6, svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3)))));
-                    break;
-                case ActivationLayerInfo::ActivationFunction::SWISH:
-                    tmp = svmul_f32_z(pg, vin, svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin))))));
-                    break;
-                default:
-                    ARM_COMPUTE_ERROR("Unsupported activation function");
-            }
-            svst1_f32(pg, output_ptr + x, tmp);
+                const auto vin = svld1_f32(pg, input_ptr + x);
+                switch (act)
+                {
+                    case ActivationLayerInfo::ActivationFunction::ABS:
+                        tmp = svabs_f32_z(pg, vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LINEAR:
+                        tmp = svmla_f32_z(pg, vb, va, vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LOGISTIC:
+                        tmp = svinv_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, svneg_f32_z(pg, vin))));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::RELU:
+                        tmp = svmax_f32_z(pg, const_0, vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
+                        tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, const_0, vin));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU:
+                        tmp = svmin_f32_z(pg, va, svmax_f32_z(pg, vb, vin));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::LEAKY_RELU:
+                        tmp = svadd_f32_z(pg, svmul_f32_z(pg, svmin_f32_z(pg, vin, const_0), va),
+                                          svmax_f32_z(pg, vin, const_0));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SOFT_RELU:
+                        tmp = svsel_f32(svcmpgt_f32(pg, vin, soft_relu_thresh), vin,
+                                        svlog_f32_z(pg, svadd_f32_z(pg, const_1, svexp_f32_z(pg, vin))));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::ELU:
+                        tmp = svsel_f32(svcmpgt_f32(pg, vin, const_0), vin,
+                                        svmul_f32_z(pg, va, svsub_f32_z(pg, svexp_f32_z(pg, vin), const_1)));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SQRT:
+                        tmp = svsqrt_f32_z(pg, vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SQUARE:
+                        tmp = svmul_f32_z(pg, vin, vin);
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::TANH:
+                        tmp = svmul_f32_z(pg, va, svtanh_f32_z(pg, svmul_f32_z(pg, vb, vin)));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::IDENTITY:
+                        tmp = vin;
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::HARD_SWISH:
+                        tmp = svmul_f32_z(
+                            pg, vin,
+                            svmul_f32_z(
+                                pg, const_inv_6,
+                                svmin_f32_z(pg, const_6, svmax_f32_z(pg, const_0, svadd_f32_z(pg, vin, const_3)))));
+                        break;
+                    case ActivationLayerInfo::ActivationFunction::SWISH:
+                        tmp = svmul_f32_z(
+                            pg, vin,
+                            svinv_f32_z(pg, svadd_f32_z(pg, const_1,
+                                                        svexp_f32_z(pg, svneg_f32_z(pg, svmul_f32_z(pg, va, vin))))));
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Unsupported activation function");
+                }
+                svst1_f32(pg, output_ptr + x, tmp);
 
-            x += svcntw();
-            pg = svwhilelt_b32(x, window_end_x);
+                x += svcntw();
+                pg = svwhilelt_b32(x, window_end_x);
 
-        }
-        while(svptest_any(svptrue_b32(), pg));
-    },
-    input, output);
+            } while (svptest_any(svptrue_b32(), pg));
+        },
+        input, output);
 }
 } // namespace cpu
 } // namespace arm_compute
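
Note: the fp32 SOFT_RELU case selects the raw input once it exceeds soft_relu_thresh because log(1 + exp(x)) converges to x for large x, while exp(x) itself would overflow; the fp16 kernel above carries no such guard. A scalar illustration follows; the threshold value here is hypothetical, not the library's constant.

    #include <cmath>
    #include <cstdio>

    // Why SOFT_RELU passes large inputs through: for x above a modest
    // threshold, log1p(exp(x)) ~= x to within float precision, and exp(x)
    // overflows float long before x does. Threshold below is illustrative.
    float soft_relu(float x)
    {
        const float thresh = 16.f; // log1p(exp(16)) - 16 < 1e-6
        return x > thresh ? x : std::log1p(std::exp(x));
    }

    int main()
    {
        std::printf("%f %f %f\n", soft_relu(-2.f), soft_relu(2.f), soft_relu(100.f));
        return 0;
    }
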
diff --git a/src/cpu/kernels/activation/generic/sve2/lut.cpp b/src/cpu/kernels/activation/generic/sve2/lut.cpp
index d65de8d..2ed667d 100644
--- a/src/cpu/kernels/activation/generic/sve2/lut.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/lut.cpp
@@ -24,6 +24,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/cpu/kernels/lut/list.h"
 
 namespace arm_compute
@@ -33,19 +34,22 @@
 #ifdef __aarch64__
 void sve2_q8_activation_lut(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
 {
-    ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 && src->info()->data_type() != DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_ERROR_ON(src->info()->data_type() != DataType::QASYMM8 &&
+                         src->info()->data_type() != DataType::QASYMM8_SIGNED);
     const auto window_end_x  = window.x().end();
     Window     win_collapsed = window.collapse_if_possible(window, Window::DimZ);
     win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1));
     Iterator input(src, win_collapsed);
     Iterator output(dst, win_collapsed);
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = input.ptr();
-        auto       output_ptr = output.ptr();
-        lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
-    },
-    input, output);
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
+        {
+            const auto input_ptr  = input.ptr();
+            auto       output_ptr = output.ptr();
+            lut_u8_sve2(act_info.lut().data(), 1u, window_end_x, &input_ptr, &output_ptr);
+        },
+        input, output);
 }
 #endif // __aarch64__
 } // namespace cpu
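
Note: the LUT kernel works because a quantized 8-bit activation is a pure function of a single input byte: all 256 possible outputs can be precomputed once into act_info.lut(), and the per-element work collapses to a table lookup, which SVE2 vectorizes inside lut_u8_sve2. A scalar sketch of the idea follows, building a hypothetical logistic table; the quantization parameters are illustrative.

    #include <algorithm>
    #include <array>
    #include <cmath>
    #include <cstdint>

    // Scalar model of the LUT approach: precompute all 256 results of a
    // quantized activation, then apply it as a lookup. Parameters hypothetical.
    std::array<uint8_t, 256> build_logistic_lut(float in_scale, int in_offset,
                                                float out_scale, int out_offset)
    {
        std::array<uint8_t, 256> lut{};
        for (int q = 0; q < 256; ++q)
        {
            const float x = (q - in_offset) * in_scale; // de-quantize
            const float y = 1.f / (1.f + std::exp(-x)); // logistic
            const int   r = static_cast<int>(std::lround(y / out_scale)) + out_offset;
            lut[q]        = static_cast<uint8_t>(std::min(255, std::max(0, r)));
        }
        return lut;
    }

    void apply_lut(const std::array<uint8_t, 256> &lut, const uint8_t *src,
                   uint8_t *dst, int n)
    {
        for (int i = 0; i < n; ++i)
        {
            dst[i] = lut[src[i]]; // the whole activation is one load
        }
    }
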
diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
index bc9bc7a..7efa9e4 100644
--- a/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8.cpp
@@ -26,18 +26,21 @@
 #include "arm_compute/core/Window.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 
-#include <cmath>
-#include <cstddef>
-
 #include "src/core/NEON/SVEAsymm.h"
 #include "src/core/NEON/SVEMath.h"
+
 #include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
 
 namespace arm_compute
 {
 namespace cpu
 {
-void sve2_qasymm8_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_activation(const ITensor             *src,
+                             ITensor                   *dst,
+                             const ActivationLayerInfo &act_info,
+                             const Window              &window)
 {
     const auto                                    window_start_x = static_cast<int>(window.x().start());
     const auto                                    window_end_x   = static_cast<int>(window.x().end());
@@ -61,7 +64,7 @@
 
     // Initialise scale/offset for re-quantization
     bool requant = true;
-    if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+    if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
     {
         requant = false;
     }
@@ -78,139 +81,160 @@
     const auto vo_s32     = svdup_n_s32(o_s32);
 
     // Initialise scale/offset for re-quantization for leaky relu
-    int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
-    int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
-                                arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t    s_leaky_s32  = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t    o_leaky_s32  = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+                                    arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
     const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
     const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
 
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
-        svuint8_t tmp;
-
-        int      x  = window_start_x;
-        svbool_t pg = svwhilelt_b8(x, window_end_x);
-        do
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            const auto vin = svld1_u8(pg, input_ptr + x);
-            if(act == ActivationLayerInfo::ActivationFunction::RELU)
-            {
-                // Perform activation
-                tmp = svmax_u8_z(pg, vconst_0, vin);
-                // Re-quantize to new output space
-                tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
-            {
-                // Perform activation
-                tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
-                // Re-quantize to new output space
-                tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-            {
-                // Perform activation
-                tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
-                // Re-quantize to new output space
-                tmp = svmla_qasymm8_z(pg, tmp, vs, vo);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
-            {
-                // De-quantize
-                const auto vin_deq = svdequantize_z(pg, vin, qi_in);
-                // Perform activation
-                const svfloat32x4_t tmp_dep = svcreate4_f32(svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
-                                                            svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
-                                                            svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
-                                                            svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+            const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
 
-                // Re-quantize to new output space
-                tmp = svquantize_z(pg, tmp_dep, qi_out);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
+            svuint8_t tmp;
+
+            int      x  = window_start_x;
+            svbool_t pg = svwhilelt_b8(x, window_end_x);
+            do
             {
-                // De-quantize
-                const auto vin_deq = svdequantize_z(pg, vin, qi_in);
-                // Perform activation
-                const svfloat32x4_t tmp_dep = svcreate4_f32(svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
-                                                            svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
-                                                            svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
-                                                            svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
-
-                // Re-quantize to new output space
-                tmp = svquantize_z(pg, tmp_dep, qi_out);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
-            {
-                svbool_t    p0, p1, p2, p3;
-                svint32x4_t tmp_dep;
-
-                // Expand to int32
-                const svint32x4_t vin_s32 = svcreate4_s32(
-                                                svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
-                                                svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
-                                                svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
-                                                svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))));
-
-                // Compare elements to input offset
-                if(qi_in.scale >= 0)
+                const auto vin = svld1_u8(pg, input_ptr + x);
+                if (act == ActivationLayerInfo::ActivationFunction::RELU)
                 {
-                    p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
-                    p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
-                    p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
-                    p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                    // Perform activation
+                    tmp = svmax_u8_z(pg, vconst_0, vin);
+                    // Re-quantize to new output space
+                    tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                {
+                    // Perform activation
+                    tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vconst_0, vin));
+                    // Re-quantize to new output space
+                    tmp = requant ? svmla_qasymm8_z(pg, tmp, vs, vo) : tmp;
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                {
+                    // Perform activation
+                    tmp = svmin_u8_z(pg, va, svmax_u8_z(pg, vb, vin));
+                    // Re-quantize to new output space
+                    tmp = svmla_qasymm8_z(pg, tmp, vs, vo);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+                {
+                    // De-quantize
+                    const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+                    // Perform activation
+                    const svfloat32x4_t tmp_dep = svcreate4_f32(
+                        svdiv_f32_z(
+                            pg, vconst_1,
+                            svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+                        svdiv_f32_z(
+                            pg, vconst_1,
+                            svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+                        svdiv_f32_z(
+                            pg, vconst_1,
+                            svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+                        svdiv_f32_z(
+                            pg, vconst_1,
+                            svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+
+                    // Re-quantize to new output space
+                    tmp = svquantize_z(pg, tmp_dep, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+                {
+                    // De-quantize
+                    const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+                    // Perform activation
+                    const svfloat32x4_t tmp_dep = svcreate4_f32(
+                        svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+                        svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+                        svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+                        svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
+
+                    // Re-quantize to new output space
+                    tmp = svquantize_z(pg, tmp_dep, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+                {
+                    svbool_t    p0, p1, p2, p3;
+                    svint32x4_t tmp_dep;
+
+                    // Expand to int32
+                    const svint32x4_t vin_s32 = svcreate4_s32(svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(vin))),
+                                                              svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(vin))),
+                                                              svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(vin))),
+                                                              svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(vin))));
+
+                    // Compare elements to input offset
+                    if (qi_in.scale >= 0)
+                    {
+                        p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                        p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                        p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                        p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                    }
+                    else
+                    {
+                        p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                        p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                        p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                        p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                    }
+
+                    // Multiply negative elements and requantize if necessary
+                    if (requant)
+                    {
+                        tmp_dep = svcreate4_s32(
+                            svasr_n_s32_m(pg,
+                                          svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
+                                                      svsel(p0, vs_leaky_s32, vs_s32)),
+                                          8),
+                            svasr_n_s32_m(pg,
+                                          svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
+                                                      svsel(p1, vs_leaky_s32, vs_s32)),
+                                          8),
+                            svasr_n_s32_m(pg,
+                                          svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
+                                                      svsel(p2, vs_leaky_s32, vs_s32)),
+                                          8),
+                            svasr_n_s32_m(pg,
+                                          svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
+                                                      svsel(p3, vs_leaky_s32, vs_s32)),
+                                          8));
+                    }
+                    else
+                    {
+                        tmp_dep = svcreate4_s32(
+                            svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+                            svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+                            svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+                            svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+                    }
+
+                    // Convert uint32 vectors to uint16 vectors (with saturation)
+                    const auto v_low_u16  = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+                    const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+                    // convert uint16 vectors to uint8 vectors (with saturation)
+                    tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
                 }
                 else
                 {
-                    p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
-                    p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
-                    p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
-                    p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
                 }
 
-                // Multiply negative elements and requantize if necessary
-                if(requant)
-                {
-                    tmp_dep = svcreate4_s32(
-                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
-                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
-                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
-                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
-                }
-                else
-                {
-                    tmp_dep = svcreate4_s32(
-                                  svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
-                                  svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
-                                  svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
-                                  svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
-                }
+                svst1_u8(pg, output_ptr + x, tmp);
 
-                // Convert uint32 vectors to uint16 vectors (with saturation)
-                const auto v_low_u16  = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
-                const auto v_high_u16 = svqxtunt_s32(svqxtunb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+                x += svcntb();
+                pg = svwhilelt_b8(x, window_end_x);
 
-                // convert uint16 vectors to uint8 vectors (with saturation)
-                tmp = svqxtnt_u16(svqxtnb_u16(v_low_u16), v_high_u16);
-            }
-            else
-            {
-                ARM_COMPUTE_ERROR("Unsupported activation function");
-            }
-
-            svst1_u8(pg, output_ptr + x, tmp);
-
-            x += svcntb();
-            pg = svwhilelt_b8(x, window_end_x);
-
-        }
-        while(svptest_any(svptrue_b8(), pg));
-
-    },
-    input, output);
+            } while (svptest_any(svptrue_b8(), pg));
+        },
+        input, output);
 }
 } // namespace cpu
 } // namespace arm_compute
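
Note: the LEAKY_RELU path above folds de-quantize, multiply-by-a and re-quantize into a single integer multiply-add in Q.8 fixed point: s_leaky_s32 and o_leaky_s32 are the combined scale and offset pre-multiplied by 1 << 8, so each lane computes (in * s + o) >> 8. A scalar sketch against the float reference follows; all parameter values are hypothetical, and the arithmetic shift floors, so the fixed-point result can sit one LSB below the rounded float value.

    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    // Scalar model of the Q.8 fixed-point leaky-relu requantization above.
    // All quantization parameters are hypothetical.
    int main()
    {
        const float in_scale = 0.1f, out_scale = 0.2f;
        const int   in_offset = 3, out_offset = 10;
        const float alpha = 0.25f; // leaky slope, act_info.a()

        const float s = in_scale / out_scale;

        // Combined scale/offset for the negative branch, pre-scaled by 1 << 8,
        // mirroring s_leaky_s32 / o_leaky_s32 in the kernel.
        const int32_t s_leaky = static_cast<int32_t>(std::lround(s * alpha * (1 << 8)));
        const int32_t o_leaky =
            static_cast<int32_t>(std::lround((-in_offset * s * alpha + out_offset) * (1 << 8)));

        const uint8_t q_in = 1; // below the zero point, so the leaky branch applies

        // Fixed-point path: one multiply-add, then drop the 8 fractional bits
        // (arithmetic shift, i.e. floor, as svasr_n_s32_m does).
        const int32_t fixed = (static_cast<int32_t>(q_in) * s_leaky + o_leaky) >> 8;

        // Float reference: de-quantize, scale by alpha, re-quantize.
        const float x   = (q_in - in_offset) * in_scale;
        const int   ref = static_cast<int>(std::lround(alpha * x / out_scale)) + out_offset;

        std::printf("fixed=%d ref=%d\n", fixed, ref); // may differ by one LSB
        return 0;
    }
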
diff --git a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
index d20684f..e466752 100644
--- a/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qasymm8_signed.cpp
@@ -24,20 +24,23 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
-#include <cmath>
-#include <cstddef>
 
 #include "src/core/NEON/SVEAsymm.h"
 #include "src/core/NEON/SVEMath.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+
 #include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
 
 namespace arm_compute
 {
 namespace cpu
 {
-void sve2_qasymm8_signed_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qasymm8_signed_activation(const ITensor             *src,
+                                    ITensor                   *dst,
+                                    const ActivationLayerInfo &act_info,
+                                    const Window              &window)
 {
     const auto                                    window_start_x = static_cast<int>(window.x().start());
     const auto                                    window_end_x   = static_cast<int>(window.x().end());
@@ -65,7 +68,7 @@
 
     // Initialise scale/offset for re-quantization
     bool requant = true;
-    if(qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
+    if (qi_in.scale == qi_out.scale && qi_in.offset == qi_out.offset)
     {
         requant = false;
     }
@@ -82,151 +85,190 @@
     const auto vo_s32     = svdup_n_s32(o_s32);
 
     // Initialise scale/offset for re-quantization for leaky relu
-    int32_t s_leaky_s32 = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
-    int32_t o_leaky_s32 = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
-                                arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t    s_leaky_s32  = round(s * act_info.a() * (1 << 8), arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
+    int32_t    o_leaky_s32  = round((-qi_in.offset * s * act_info.a() + qi_out.offset) * (1 << 8),
+                                    arm_compute::RoundingPolicy::TO_NEAREST_EVEN);
     const auto vs_leaky_s32 = svdup_n_s32(s_leaky_s32);
     const auto vo_leaky_s32 = svdup_n_s32(o_leaky_s32);
 
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
-        svint8_t tmp;
-
-        int      x  = window_start_x;
-        svbool_t pg = svwhilelt_b8(x, window_end_x);
-        do
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            const auto vin = svld1_s8(pg, input_ptr + x);
-            if(act == ActivationLayerInfo::ActivationFunction::RELU)
-            {
-                // Perform activation
-                tmp = svmax_s8_z(pg, vconst_0, vin);
-                // Re-quantize to new output space
-                tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
-            {
-                // Perform activation
-                tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
-                // Re-quantize to new output space
-                tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-            {
-                // Perform activation
-                tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
-                // Re-quantize to new output space
-                tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
-            {
-                // De-quantize
-                const auto vin_deq = svdequantize_z(pg, vin, qi_in);
-                // Perform activation
-                const svfloat32x4_t tmp_dep = svcreate4_f32(
-                                                  svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
-                                                  svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
-                                                  svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
-                                                  svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
-                // Re-quantize to new output space
-                tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
-            {
-                // De-quantize
-                const auto vin_deq = svdequantize_z(pg, vin, qi_in);
-                // Perform activation
-                const svfloat32x4_t tmp_dep = svcreate4_f32(
-                                                  svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
-                                                  svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
-                                                  svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
-                                                  svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
-                // Re-quantize to new output space
-                tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
-            {
-                // De-quantize
-                const auto vin_deq = svdequantize_z(pg, vin, qi_in);
-                // Perform activation
-                const svfloat32x4_t tmp_dep = svcreate4_f32(
-                                                  svmul_f32_z(pg, svget4_f32(vin_deq, 0), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 0), const_3_f32))))),
-                                                  svmul_f32_z(pg, svget4_f32(vin_deq, 1), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 1), const_3_f32))))),
-                                                  svmul_f32_z(pg, svget4_f32(vin_deq, 2), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 2), const_3_f32))))),
-                                                  svmul_f32_z(pg, svget4_f32(vin_deq, 3), svmul_f32_z(pg, const_inv_6_f32, svmin_f32_z(pg, const_6_f32, svmax_f32_z(pg, const_0_f32, svadd_f32_z(pg, svget4_f32(vin_deq, 3), const_3_f32))))));
-                // Re-quantize to new output space
-                tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
-            {
-                svbool_t    p0, p1, p2, p3;
-                svint32x4_t tmp_dep;
+            const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
 
-                // Expand to int32
-                const svint32x4_t vin_s32 = svcreate4_s32(
-                                                svmovlb_s32(svmovlb_s16(vin)),
-                                                svmovlt_s32(svmovlb_s16(vin)),
-                                                svmovlb_s32(svmovlt_s16(vin)),
-                                                svmovlt_s32(svmovlt_s16(vin)));
+            svint8_t tmp;
 
-                // Compare elements to input offset
-                if(qi_in.scale >= 0)
+            int      x  = window_start_x;
+            svbool_t pg = svwhilelt_b8(x, window_end_x);
+            do
+            {
+                const auto vin = svld1_s8(pg, input_ptr + x);
+                if (act == ActivationLayerInfo::ActivationFunction::RELU)
                 {
-                    p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
-                    p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
-                    p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
-                    p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                    // Perform activation
+                    tmp = svmax_s8_z(pg, vconst_0, vin);
+                    // Re-quantize to new output space
+                    tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+                {
+                    // Perform activation
+                    tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vconst_0, vin));
+                    // Re-quantize to new output space
+                    tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                {
+                    // Perform activation
+                    tmp = svmin_s8_z(pg, va, svmax_s8_z(pg, vb, vin));
+                    // Re-quantize to new output space
+                    tmp = requant ? svmla_qasymm8_signed_z(pg, tmp, vs, vo) : tmp;
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+                {
+                    // De-quantize
+                    const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+                    // Perform activation
+                    const svfloat32x4_t tmp_dep = svcreate4_f32(
+                        svdiv_f32_z(
+                            pg, vconst_1,
+                            svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 0))))),
+                        svdiv_f32_z(
+                            pg, vconst_1,
+                            svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 1))))),
+                        svdiv_f32_z(
+                            pg, vconst_1,
+                            svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 2))))),
+                        svdiv_f32_z(
+                            pg, vconst_1,
+                            svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget4_f32(vin_deq, 3))))));
+                    // Re-quantize to new output space
+                    tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+                {
+                    // De-quantize
+                    const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+                    // Perform activation
+                    const svfloat32x4_t tmp_dep = svcreate4_f32(
+                        svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 0), vb_f32))),
+                        svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 1), vb_f32))),
+                        svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 2), vb_f32))),
+                        svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget4_f32(vin_deq, 3), vb_f32))));
+                    // Re-quantize to new output space
+                    tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
+                {
+                    // De-quantize
+                    const auto vin_deq = svdequantize_z(pg, vin, qi_in);
+                    // Perform activation
+                    const svfloat32x4_t tmp_dep = svcreate4_f32(
+                        svmul_f32_z(pg, svget4_f32(vin_deq, 0),
+                                    svmul_f32_z(pg, const_inv_6_f32,
+                                                svmin_f32_z(pg, const_6_f32,
+                                                            svmax_f32_z(pg, const_0_f32,
+                                                                        svadd_f32_z(pg, svget4_f32(vin_deq, 0),
+                                                                                    const_3_f32))))),
+                        svmul_f32_z(pg, svget4_f32(vin_deq, 1),
+                                    svmul_f32_z(pg, const_inv_6_f32,
+                                                svmin_f32_z(pg, const_6_f32,
+                                                            svmax_f32_z(pg, const_0_f32,
+                                                                        svadd_f32_z(pg, svget4_f32(vin_deq, 1),
+                                                                                    const_3_f32))))),
+                        svmul_f32_z(pg, svget4_f32(vin_deq, 2),
+                                    svmul_f32_z(pg, const_inv_6_f32,
+                                                svmin_f32_z(pg, const_6_f32,
+                                                            svmax_f32_z(pg, const_0_f32,
+                                                                        svadd_f32_z(pg, svget4_f32(vin_deq, 2),
+                                                                                    const_3_f32))))),
+                        svmul_f32_z(pg, svget4_f32(vin_deq, 3),
+                                    svmul_f32_z(pg, const_inv_6_f32,
+                                                svmin_f32_z(pg, const_6_f32,
+                                                            svmax_f32_z(pg, const_0_f32,
+                                                                        svadd_f32_z(pg, svget4_f32(vin_deq, 3),
+                                                                                    const_3_f32))))));
+                    // Re-quantize to new output space
+                    tmp = svquantize_signed_z(pg, tmp_dep, qi_out);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU)
+                {
+                    svbool_t    p0, p1, p2, p3;
+                    svint32x4_t tmp_dep;
+
+                    // Expand to int32
+                    const svint32x4_t vin_s32 =
+                        svcreate4_s32(svmovlb_s32(svmovlb_s16(vin)), svmovlt_s32(svmovlb_s16(vin)),
+                                      svmovlb_s32(svmovlt_s16(vin)), svmovlt_s32(svmovlt_s16(vin)));
+
+                    // Compare elements to input offset
+                    if (qi_in.scale >= 0)
+                    {
+                        p0 = svcmplt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                        p1 = svcmplt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                        p2 = svcmplt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                        p3 = svcmplt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                    }
+                    else
+                    {
+                        p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
+                        p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
+                        p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
+                        p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                    }
+
+                    // Multiply negative elements and requantize if necessary
+                    if (requant)
+                    {
+                        tmp_dep = svcreate4_s32(
+                            svasr_n_s32_m(pg,
+                                          svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0),
+                                                      svsel(p0, vs_leaky_s32, vs_s32)),
+                                          8),
+                            svasr_n_s32_m(pg,
+                                          svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1),
+                                                      svsel(p1, vs_leaky_s32, vs_s32)),
+                                          8),
+                            svasr_n_s32_m(pg,
+                                          svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2),
+                                                      svsel(p2, vs_leaky_s32, vs_s32)),
+                                          8),
+                            svasr_n_s32_m(pg,
+                                          svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3),
+                                                      svsel(p3, vs_leaky_s32, vs_s32)),
+                                          8));
+                    }
+                    else
+                    {
+                        tmp_dep = svcreate4_s32(
+                            svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
+                            svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
+                            svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
+                            svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
+                    }
+
+                    // Convert int32 vectors to int16 vectors (with saturation)
+                    const auto v_low_s16  = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
+                    const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+
+                    // Convert int16 vectors to int8 vectors (with saturation)
+                    tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
                 }
                 else
                 {
-                    p0 = svcmpgt_s32(pg, svget4_s32(vin_s32, 0), voffset_in);
-                    p1 = svcmpgt_s32(pg, svget4_s32(vin_s32, 1), voffset_in);
-                    p2 = svcmpgt_s32(pg, svget4_s32(vin_s32, 2), voffset_in);
-                    p3 = svcmpgt_s32(pg, svget4_s32(vin_s32, 3), voffset_in);
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
                 }
 
-                // Multiply negative elements and requantize if necessary
-                if(requant)
-                {
-                    tmp_dep = svcreate4_s32(
-                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p0, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 0), svsel(p0, vs_leaky_s32, vs_s32)), 8),
-                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p1, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 1), svsel(p1, vs_leaky_s32, vs_s32)), 8),
-                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p2, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 2), svsel(p2, vs_leaky_s32, vs_s32)), 8),
-                                  svasr_n_s32_m(pg, svmla_s32_m(pg, svsel(p3, vo_leaky_s32, vo_s32), svget4_s32(vin_s32, 3), svsel(p3, vs_leaky_s32, vs_s32)), 8));
-                }
-                else
-                {
-                    tmp_dep = svcreate4_s32(
-                                  svasr_n_s32_m(p0, svmad_s32_m(p0, svget4_s32(vin_s32, 0), vs_leaky_s32, vo_leaky_s32), 8),
-                                  svasr_n_s32_m(p1, svmad_s32_m(p1, svget4_s32(vin_s32, 1), vs_leaky_s32, vo_leaky_s32), 8),
-                                  svasr_n_s32_m(p2, svmad_s32_m(p2, svget4_s32(vin_s32, 2), vs_leaky_s32, vo_leaky_s32), 8),
-                                  svasr_n_s32_m(p3, svmad_s32_m(p3, svget4_s32(vin_s32, 3), vs_leaky_s32, vo_leaky_s32), 8));
-                }
+                svst1_s8(pg, output_ptr + x, tmp);
 
-                // Convert uint32 vectors to uint16 vectors (with saturation)
-                const auto v_low_s16  = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 0)), svget4_s32(tmp_dep, 1));
-                const auto v_high_s16 = svqxtnt_s32(svqxtnb_s32(svget4_s32(tmp_dep, 2)), svget4_s32(tmp_dep, 3));
+                x += svcntb();
+                pg = svwhilelt_b8(x, window_end_x);
 
-                // convert uint16 vectors to uint8 vectors (with saturation)
-                tmp = svqxtnt_s16(svqxtnb_s16(v_low_s16), v_high_s16);
-            }
-            else
-            {
-                ARM_COMPUTE_ERROR("Unsupported activation function");
-            }
-
-            svst1_s8(pg, output_ptr + x, tmp);
-
-            x += svcntb();
-            pg = svwhilelt_b8(x, window_end_x);
-
-        }
-        while(svptest_any(svptrue_b8(), pg));
-    },
-    input, output);
+            } while (svptest_any(svptrue_b8(), pg));
+        },
+        input, output);
 }
 } // namespace cpu
 } // namespace arm_compute
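
The LEAKY_RELU branch above avoids a float round-trip by folding the requantization scale into a Q8 fixed-point integer (s * alpha * 2^8), applying it with a widening multiply-accumulate, and shifting the eight fraction bits back out. A scalar model of one lane on the "leaky" side of the comparison; the helper name is illustrative, and std::lround stands in for the library's TO_NEAREST_EVEN rounding:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar model of s_leaky_s32 / o_leaky_s32 above, for a lane that
    // compared below the input offset. Illustrative only.
    int8_t leaky_lane(int8_t q_in, float in_scale, int32_t in_offset,
                      float out_scale, int32_t out_offset, float alpha)
    {
        const float   s       = in_scale / out_scale;
        const int32_t s_leaky = std::lround(s * alpha * (1 << 8)); // Q8 scale
        const int32_t o_leaky =
            std::lround((-in_offset * s * alpha + out_offset) * (1 << 8)); // Q8 offset
        const int32_t acc = (int32_t(q_in) * s_leaky + o_leaky) >> 8;      // MLA, then drop the fraction
        return int8_t(std::clamp(acc, -128, 127));                        // saturating narrow to int8
    }
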
diff --git a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
index 5154fac..f955893 100644
--- a/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
+++ b/src/cpu/kernels/activation/generic/sve2/qsymm16.cpp
@@ -21,24 +21,27 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 
-#include <cmath>
-#include <cstddef>
-
 #include "src/core/NEON/SVEMath.h"
 #include "src/core/NEON/SVESymm.h"
+
 #include <arm_sve.h>
+#include <cmath>
+#include <cstddef>
 
 namespace arm_compute
 {
 namespace cpu
 {
-void sve2_qsymm16_activation(const ITensor *src, ITensor *dst, const ActivationLayerInfo &act_info, const Window &window)
+void sve2_qsymm16_activation(const ITensor             *src,
+                             ITensor                   *dst,
+                             const ActivationLayerInfo &act_info,
+                             const Window              &window)
 {
     const auto                                    window_start_x = static_cast<int>(window.x().start());
     const auto                                    window_end_x   = static_cast<int>(window.x().end());
@@ -56,62 +59,70 @@
     const auto                    va_f32   = svdup_n_f32(act_info.a());
     const auto                    vb_f32   = svdup_n_f32(act_info.b());
 
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
-        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-        svint16_t tmp;
-
-        int      x  = window_start_x;
-        svbool_t pg = svwhilelt_b16(x, window_end_x);
-        do
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            const auto vin = svld1_s16(pg, input_ptr + x);
-            if(act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
-            {
-                // De-quantize
-                auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
-                // Perform activation
-                const svfloat32x2_t tmp_dep = svcreate2_f32(svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))),
-                                                            svdiv_f32_z(pg, vconst_1, svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))));
-                // Re-quantize to new output space
-                tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::TANH)
-            {
-                // De-quantize
-                auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
-                // Perform activation
-                const svfloat32x2_t tmp_dep = svcreate2_f32(svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))),
-                                                            svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))));
-                // Re-quantize to new output space
-                tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
-            }
-            else if(act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-            {
-                // De-quantize
-                auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
-                // Perform activation
-                const svfloat32x2_t tmp_dep = svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))),
-                                                            svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1))));
-                // Re-quantize to new output space
-                tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
-            }
-            else
-            {
-                ARM_COMPUTE_ERROR("Unsupported activation function");
-            }
+            const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
+            const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
 
-            svst1_s16(pg, output_ptr + x, tmp);
+            svint16_t tmp;
 
-            x += svcnth();
-            pg = svwhilelt_b16(x, window_end_x);
+            int      x  = window_start_x;
+            svbool_t pg = svwhilelt_b16(x, window_end_x);
+            do
+            {
+                const auto vin = svld1_s16(pg, input_ptr + x);
+                if (act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+                {
+                    // De-quantize
+                    auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+                    // Perform activation
+                    const svfloat32x2_t tmp_dep = svcreate2_f32(
+                        svdiv_f32_z(
+                            pg, vconst_1,
+                            svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 0))))),
+                        svdiv_f32_z(
+                            pg, vconst_1,
+                            svadd_f32_z(pg, vconst_1, svexp_f32_z(pg, svneg_f32_z(pg, svget2_f32(vin_deq, 1))))));
+                    // Re-quantize to new output space
+                    tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::TANH)
+                {
+                    // De-quantize
+                    auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+                    // Perform activation
+                    const svfloat32x2_t tmp_dep = svcreate2_f32(
+                        svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 0), vb_f32))),
+                        svmul_f32_z(pg, va_f32, svtanh_f32_z(pg, svmul_f32_z(pg, svget2_f32(vin_deq, 1), vb_f32))));
+                    // Re-quantize to new output space
+                    tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+                }
+                else if (act == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+                {
+                    // De-quantize
+                    auto vin_deq = svdequantize_qsymm16_z(pg, vin, qi_in.scale);
+                    // Perform activation
+                    const svfloat32x2_t tmp_dep =
+                        svcreate2_f32(svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 0))),
+                                      svmin_f32_z(pg, va_f32, svmax_f32_z(pg, vb_f32, svget2_f32(vin_deq, 1))));
+                    // Re-quantize to new output space
+                    tmp = svquantize_qsymm16_z(pg, tmp_dep, qi_out.scale);
+                }
+                else
+                {
+                    ARM_COMPUTE_ERROR("Unsupported activation function");
+                }
 
-        }
-        while(svptest_any(svptrue_b16(), pg));
-    },
-    input, output);
+                svst1_s16(pg, output_ptr + x, tmp);
+
+                x += svcnth();
+                pg = svwhilelt_b16(x, window_end_x);
+
+            } while (svptest_any(svptrue_b16(), pg));
+        },
+        input, output);
 }
 } // namespace cpu
 } // namespace arm_compute
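
The QSYMM16 LOGISTIC path is the usual dequantize -> sigmoid -> requantize chain, done two float vectors at a time through SVEMath. QSYMM16 carries no offset, so (de)quantization is a pure scale. A scalar model of one lane (hypothetical helper name):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Scalar model of the LOGISTIC branch above.
    int16_t logistic_qsymm16(int16_t q_in, float in_scale, float out_scale)
    {
        const float x = float(q_in) * in_scale;         // svdequantize_qsymm16_z
        const float y = 1.f / (1.f + std::exp(-x));     // the svdiv/svadd/svexp chain: 1 / (1 + e^-x)
        const long  q = std::lround(y / out_scale);     // svquantize_qsymm16_z
        return int16_t(std::clamp(q, -32768L, 32767L)); // saturate to int16
    }
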
diff --git a/src/cpu/kernels/add/generic/neon/fp16.cpp b/src/cpu/kernels/add/generic/neon/fp16.cpp
index fca7b2c..e7679c1 100644
--- a/src/cpu/kernels/add/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/add/generic/neon/fp16.cpp
@@ -30,10 +30,11 @@
 {
 namespace cpu
 {
-void add_fp16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_fp16_neon(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     return add_same_neon<float16_t>(src0, src1, dst, policy, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
 #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/add/generic/neon/fp32.cpp b/src/cpu/kernels/add/generic/neon/fp32.cpp
index 1f599b19..11a970b 100644
--- a/src/cpu/kernels/add/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/add/generic/neon/fp32.cpp
@@ -28,9 +28,10 @@
 {
 namespace cpu
 {
-void add_fp32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_fp32_neon(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     return add_same_neon<float>(src0, src1, dst, policy, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/impl.cpp b/src/cpu/kernels/add/generic/neon/impl.cpp
index 2dde135..34938cc 100644
--- a/src/cpu/kernels/add/generic/neon/impl.cpp
+++ b/src/cpu/kernels/add/generic/neon/impl.cpp
@@ -23,8 +23,10 @@
  */
 
 #include "src/cpu/kernels/add/generic/neon/impl.h"
+
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 namespace arm_compute
 {
@@ -40,7 +42,10 @@
     return add_sub_q8_neon_fixedpoint_possible(src0, src1, dst, true);
 }
 
-bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, bool is_addition)
+bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
+                                         const ITensorInfo *src1,
+                                         const ITensorInfo *dst,
+                                         bool               is_addition)
 {
     const auto iq0 = src0->quantization_info().uniform();
     const auto iq1 = src1->quantization_info().uniform();
@@ -49,7 +54,7 @@
     const auto scale0 = iq0.scale / oq.scale;
     const auto scale1 = iq1.scale / oq.scale;
 
-    if(scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f)
+    if (scale0 < -15.f || scale0 > 15.f || scale1 < -15.f || scale1 > 15.f)
     {
         // The scale factor cannot be stored as 5.11 signed fixed-point number.
         return false;
@@ -57,9 +62,10 @@
 
     const auto offset = float(oq.offset) - scale0 * float(iq0.offset) - scale1 * float(iq1.offset);
 
-    const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset)) : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset));
+    const auto max_acc = is_addition ? ((std::abs(scale0) + std::abs(scale1)) * 256.f + std::abs(offset))
+                                     : ((std::abs(scale0) - std::abs(scale1)) * 256.f + std::abs(offset));
 
-    if(max_acc > 1048575.f) // 2^20 - 1
+    if (max_acc > 1048575.f) // 2^20 - 1
     {
         // It might not be possible to store the result as 21.11 signed fixed-point number.
         return false;
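
For reference while reading this hunk: "5.11" is a 16-bit signed fixed-point layout (1 sign, 4 integer, 11 fraction bits), hence the |scale| <= 15 guard, and "21.11" keeps the same 11 fraction bits in a 32-bit accumulator with 2^20 - 1 of integer headroom. A sketch of the encodings; the helper names are illustrative, not from the library:

    #include <cmath>
    #include <cstdint>

    // 5.11 signed fixed point: step 2^-11 = 1/2048, range roughly (-16, 16).
    inline int16_t to_5p11(float v) { return int16_t(std::lround(v * (1 << 11))); }

    // The feasibility checks above, restated: each input scale must fit 5.11,
    // and the worst-case accumulator must fit 21.11 before the saturating narrow.
    inline bool fits_5p11(float v)  { return std::fabs(v) <= 15.f; }
    inline bool fits_21p11(float v) { return std::fabs(v) <= 1048575.f; } // 2^20 - 1
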
@@ -69,13 +75,19 @@
 }
 
 template <typename ScalarType>
-void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_q8_neon_fixedpoint(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     add_sub_q8_neon_fixedpoint<ScalarType>(src0, src1, dst, policy, window, true /*is_addition*/);
 }
 
 template <typename ScalarType>
-void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition)
+void add_sub_q8_neon_fixedpoint(const ITensor       *src0,
+                                const ITensor       *src1,
+                                ITensor             *dst,
+                                const ConvertPolicy &policy,
+                                const Window        &window,
+                                bool                 is_addition)
 {
     ARM_COMPUTE_UNUSED(policy);
 
@@ -103,7 +115,7 @@
     const auto oq_info   = dst->info()->quantization_info().uniform();
     const auto in0_scale = iq0_info.scale / oq_info.scale;
     const auto in1_scale = is_addition ? (iq1_info.scale / oq_info.scale) : (-(iq1_info.scale / oq_info.scale));
-    const auto offset    = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset);
+    const auto offset = float(oq_info.offset) - in0_scale * float(iq0_info.offset) - in1_scale * float(iq1_info.offset);
 
     constexpr float _2pow11        = 2048;
     const auto      in0_scale_5p11 = static_cast<int16_t>(support::cpp11::lround(in0_scale * _2pow11));
@@ -112,7 +124,7 @@
 
     constexpr uint8_t shift_amount_remainder = 3;
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         // Prefix: a = non-broadcast, b = broadcast.
 
@@ -138,68 +150,75 @@
         Iterator out_it(dst, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto a_ptr   = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
-            const auto b_ptr   = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
-            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
+            win,
+            [&](const Coordinates &)
+            {
+                const auto a_ptr   = reinterpret_cast<const ScalarType *>(a_input_it.ptr());
+                const auto b_ptr   = reinterpret_cast<const ScalarType *>(b_input_it.ptr());
+                const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
 
-            const auto b_val                    = *b_ptr;
-            const auto b_scaled                 = b_scale * b_val;
-            const auto b_scaled_21p11           = static_cast<int32_t>(support::cpp11::lround(b_scaled * _2pow11));
-            const auto b_scaled_offseted_21p11  = b_scaled_21p11 + offset_21p11;
-            const auto b_vscaled_offseted_21p11 = wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag());
+                const auto b_val                   = *b_ptr;
+                const auto b_scaled                = b_scale * b_val;
+                const auto b_scaled_21p11          = static_cast<int32_t>(support::cpp11::lround(b_scaled * _2pow11));
+                const auto b_scaled_offseted_21p11 = b_scaled_21p11 + offset_21p11;
+                const auto b_vscaled_offseted_21p11 =
+                    wrapper::vdup_n(b_scaled_offseted_21p11, wrapper::traits::vector_128_tag());
 
 #ifndef __aarch64__
-            const auto b_scaled_offseted = b_scaled + offset;
+                const auto b_scaled_offseted = b_scaled + offset;
 #endif // __aarch64__
 
-            int x = window_start_x;
+                int x = window_start_x;
 
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                // Load the input.
-                const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    // Load the input.
+                    const auto a_vin_8p0 = wrapper::vloadq(a_ptr + x);
 
-                // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
-                const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
-                const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
+                    // Widen the non-broadcast elements to signed 16-bit regardless of the input signedness.
+                    const auto a_vin_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(a_vin_8p0)));
+                    const auto a_vin_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(a_vin_8p0)));
 
-                // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset.
-                // Widen and store the result in 32-bit integer.
-                const auto vout_21p11_00 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11);
-                const auto vout_21p11_01 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11);
-                const auto vout_21p11_10 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11);
-                const auto vout_21p11_11 = wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11);
+                    // Multiply the non-broadcast elements by the scale factor, add the scaled broadcast elements and the offset.
+                    // Widen and store the result in 32-bit integer.
+                    const auto vout_21p11_00 =
+                        wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_0), a_vscale_5p11);
+                    const auto vout_21p11_01 =
+                        wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_0), a_vscale_5p11);
+                    const auto vout_21p11_10 =
+                        wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgetlow(a_vin_16p0_1), a_vscale_5p11);
+                    const auto vout_21p11_11 =
+                        wrapper::vmlal(b_vscaled_offseted_21p11, wrapper::vgethigh(a_vin_16p0_1), a_vscale_5p11);
 
-                // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
-                const auto vout_8p8_0 = wrapper::vcombine(
-                                            wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
-                                            wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
-                const auto vout_8p8_1 = wrapper::vcombine(
-                                            wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
-                                            wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
+                    // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
+                    const auto vout_8p8_0 =
+                        wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
+                                          wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
+                    const auto vout_8p8_1 =
+                        wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
+                                          wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
 
-                // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
-                const auto vout_8p0 = wrapper::vcombine(
-                                          wrapper::vqrshrn<8>(vout_8p8_0),
-                                          wrapper::vqrshrn<8>(vout_8p8_1));
+                    // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
+                    const auto vout_8p0 =
+                        wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1));
 
-                // Store the result.
-                wrapper::vstore(out_ptr + x, vout_8p0);
-            }
+                    // Store the result.
+                    wrapper::vstore(out_ptr + x, vout_8p0);
+                }
 
-            // Process the left-over elements.
-            for(; x < window_end_x; ++x)
-            {
+                // Process the left-over elements.
+                for (; x < window_end_x; ++x)
+                {
 #ifdef __aarch64__
-                out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11));
+                    out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(
+                        int32_t(a_ptr[x]) * a_scale_5p11 + b_scaled_offseted_21p11));
 #else  // __aarch64__
-                out_ptr[x] = utility::clamp<int, ScalarType>(support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted));
+                    out_ptr[x] = utility::clamp<int, ScalarType>(
+                        support::cpp11::lround(float(a_ptr[x]) * a_scale + b_scaled_offseted));
 #endif // __aarch64__
-            }
-        },
-        b_input_it, a_input_it, out_it);
+                }
+            },
+            b_input_it, a_input_it, out_it);
     }
     else
     {
@@ -216,70 +235,85 @@
         Iterator out_it(dst, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
-            const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
-            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
-
-            int x = window_start_x;
-
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                // Load the inputs.
-                const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
-                const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
+                const auto in0_ptr = reinterpret_cast<const ScalarType *>(in0_it.ptr());
+                const auto in1_ptr = reinterpret_cast<const ScalarType *>(in1_it.ptr());
+                const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr());
 
-                // Widen the input elements to signed 16-bit regardless of the input signedness.
-                const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
-                const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
-                const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
-                const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
+                int x = window_start_x;
 
-                // Multiply the input elements by the scale factor and add the offset.
-                // Widen and store the result in 32-bit integer.
-                const auto vscaled0_offseted_21p11_00 = wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11);
-                const auto vscaled0_offseted_21p11_01 = wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11);
-                const auto vscaled0_offseted_21p11_10 = wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11);
-                const auto vscaled0_offseted_21p11_11 = wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11);
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    // Load the inputs.
+                    const auto vin0_8p0 = wrapper::vloadq(in0_ptr + x);
+                    const auto vin1_8p0 = wrapper::vloadq(in1_ptr + x);
 
-                const auto vout_21p11_00 = wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11);
-                const auto vout_21p11_01 = wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11);
-                const auto vout_21p11_10 = wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11);
-                const auto vout_21p11_11 = wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11);
+                    // Widen the input elements to signed 16-bit regardless of the input signedness.
+                    const auto vin0_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin0_8p0)));
+                    const auto vin0_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin0_8p0)));
+                    const auto vin1_16p0_0 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(vin1_8p0)));
+                    const auto vin1_16p0_1 = wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(vin1_8p0)));
 
-                // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
-                const auto vout_8p8_0 = wrapper::vcombine(
-                                            wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
-                                            wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
-                const auto vout_8p8_1 = wrapper::vcombine(
-                                            wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
-                                            wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
+                    // Multiply the input elements by the scale factor and add the offset.
+                    // Widen and store the result in 32-bit integer.
+                    const auto vscaled0_offseted_21p11_00 =
+                        wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_0), vscale0_5p11);
+                    const auto vscaled0_offseted_21p11_01 =
+                        wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_0), vscale0_5p11);
+                    const auto vscaled0_offseted_21p11_10 =
+                        wrapper::vmlal(voffset_21p11, wrapper::vgetlow(vin0_16p0_1), vscale0_5p11);
+                    const auto vscaled0_offseted_21p11_11 =
+                        wrapper::vmlal(voffset_21p11, wrapper::vgethigh(vin0_16p0_1), vscale0_5p11);
 
-                // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
-                const auto vout_8p0 = wrapper::vcombine(
-                                          wrapper::vqrshrn<8>(vout_8p8_0),
-                                          wrapper::vqrshrn<8>(vout_8p8_1));
+                    const auto vout_21p11_00 =
+                        wrapper::vmlal(vscaled0_offseted_21p11_00, wrapper::vgetlow(vin1_16p0_0), vscale1_5p11);
+                    const auto vout_21p11_01 =
+                        wrapper::vmlal(vscaled0_offseted_21p11_01, wrapper::vgethigh(vin1_16p0_0), vscale1_5p11);
+                    const auto vout_21p11_10 =
+                        wrapper::vmlal(vscaled0_offseted_21p11_10, wrapper::vgetlow(vin1_16p0_1), vscale1_5p11);
+                    const auto vout_21p11_11 =
+                        wrapper::vmlal(vscaled0_offseted_21p11_11, wrapper::vgethigh(vin1_16p0_1), vscale1_5p11);
 
-                // Store the result.
-                wrapper::vstore(out_ptr + x, vout_8p0);
-            }
+                    // Remove 3 bits of the fractional part, round, narrow to 16-bit and saturate the result.
+                    const auto vout_8p8_0 =
+                        wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_00),
+                                          wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_01));
+                    const auto vout_8p8_1 =
+                        wrapper::vcombine(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_10),
+                                          wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(vout_21p11_11));
 
-            // Process the left-over elements.
-            for(; x < window_end_x; ++x)
-            {
+                    // Remove 8 bits of the fractional part, round, narrow to 8-bit and saturate the result.
+                    const auto vout_8p0 =
+                        wrapper::vcombine(wrapper::vqrshrn<8>(vout_8p8_0), wrapper::vqrshrn<8>(vout_8p8_1));
+
+                    // Store the result.
+                    wrapper::vstore(out_ptr + x, vout_8p0);
+                }
+
+                // Process the left-over elements.
+                for (; x < window_end_x; ++x)
+                {
 #ifdef __aarch64__
-                out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11));
+                    out_ptr[x] = wrapper::vqrshrn<8>(wrapper::vqrshrn_ex<shift_amount_remainder, ScalarType>(
+                        int32_t(in0_ptr[x]) * in0_scale_5p11 + int32_t(in1_ptr[x]) * in1_scale_5p11 + offset_21p11));
 #else  // __aarch64__
-                out_ptr[x] = utility::clamp<int, ScalarType>(support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset));
+                    out_ptr[x] = utility::clamp<int, ScalarType>(
+                        support::cpp11::lround(float(in0_ptr[x]) * in0_scale + float(in1_ptr[x]) * in1_scale + offset));
 #endif // __aarch64__
-            }
-        },
-        in0_it, in1_it, out_it);
+                }
+            },
+            in0_it, in1_it, out_it);
     }
 }
 
-void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition)
+void add_sub_qasymm8_neon(const ITensor       *src0,
+                          const ITensor       *src1,
+                          ITensor             *dst,
+                          const ConvertPolicy &policy,
+                          const Window        &window,
+                          bool                 is_addition)
 {
     ARM_COMPUTE_UNUSED(policy);
 
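
The fixed-point path in the hunk above narrows in two rounding stages: vqrshrn_ex drops the 3 spare fraction bits (21.11 -> 8.8, saturating to 16 bits), then vqrshrn<8> drops the remaining 8 (8.8 -> 8.0, saturating to 8 bits). A scalar model of the two stages, assuming the rounding shifts add half of the discarded range before shifting (hypothetical helper):

    #include <algorithm>
    #include <cstdint>

    // Scalar model: 21.11 --(>>3, round, sat16)--> 8.8 --(>>8, round, sat8)--> 8.0
    int8_t narrow_21p11(int32_t acc)
    {
        const int32_t v_8p8 = std::clamp((acc + (1 << 2)) >> 3, -32768, 32767); // drop 3 fraction bits
        return int8_t(std::clamp((v_8p8 + (1 << 7)) >> 8, -128, 127));          // drop the last 8
    }
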
@@ -304,7 +338,7 @@
     const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale));
     const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset);
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
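
Where scale1, scale2 and offset in the hunk above come from: writing each tensor's real value as s * (q - o) and equating the real output to the real sum or difference of the inputs,

    s_o * (q_o - o_o) = s_1 * (q_1 - o_1) +/- s_2 * (q_2 - o_2)
    =>  q_o = q_1 * scale1 + q_2 * scale2 + offset, with
        scale1 = s_1 / s_o,   scale2 = +/- s_2 / s_o,
        offset = o_o - scale1 * o_1 - scale2 * o_2

so the sign folded into scale2 is what selects addition versus subtraction (is_addition), and offset absorbs both zero points in a single term.
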
@@ -324,63 +358,64 @@
         Iterator output(dst, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = non_broadcast_input.ptr();
-            const auto output_ptr              = output.ptr();
-
-            const auto broadcast_value = *broadcast_input.ptr();
-            const auto bf              = vdupq_n_f32(float(broadcast_value) * scale2 + offset);
-            const auto bfs             = float(broadcast_value) * bf_scale + offset;
-
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
+                const auto non_broadcast_input_ptr = non_broadcast_input.ptr();
+                const auto output_ptr              = output.ptr();
 
-                const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
-                const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
+                const auto broadcast_value = *broadcast_input.ptr();
+                const auto bf              = vdupq_n_f32(float(broadcast_value) * scale2 + offset);
+                const auto bfs             = float(broadcast_value) * bf_scale + offset;
 
-                const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
-                const auto af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
-                const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
-                const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
+                // Compute S elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const uint8x16_t a = vld1q_u8(non_broadcast_input_ptr + x);
 
-                int32x4_t rf_0{};
-                int32x4_t rf_1{};
-                int32x4_t rf_2{};
-                int32x4_t rf_3{};
+                    const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
+                    const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
+
+                    const auto af_0 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
+                    const auto af_1 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
+                    const auto af_2 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
+                    const auto af_3 = vmlaq_f32(bf, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
+
+                    int32x4_t rf_0{};
+                    int32x4_t rf_1{};
+                    int32x4_t rf_2{};
+                    int32x4_t rf_3{};
 
 #ifdef __aarch64__
-                rf_0 = vcvtnq_s32_f32(af_0);
-                rf_1 = vcvtnq_s32_f32(af_1);
-                rf_2 = vcvtnq_s32_f32(af_2);
-                rf_3 = vcvtnq_s32_f32(af_3);
+                    rf_0 = vcvtnq_s32_f32(af_0);
+                    rf_1 = vcvtnq_s32_f32(af_1);
+                    rf_2 = vcvtnq_s32_f32(af_2);
+                    rf_3 = vcvtnq_s32_f32(af_3);
 #else  //__aarch64__
-                rf_0 = vcvtq_s32_f32(af_0);
-                rf_1 = vcvtq_s32_f32(af_1);
-                rf_2 = vcvtq_s32_f32(af_2);
-                rf_3 = vcvtq_s32_f32(af_3);
+                    rf_0          = vcvtq_s32_f32(af_0);
+                    rf_1          = vcvtq_s32_f32(af_1);
+                    rf_2          = vcvtq_s32_f32(af_2);
+                    rf_3          = vcvtq_s32_f32(af_3);
 #endif //__aarch64__
 
-                const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
-                const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
-                vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
-            }
+                    const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+                    const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+                    vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
 #ifdef __aarch64__
-                output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
+                    output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
 #else  // __aarch64__
-                output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
+                    output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
 #endif // __aarch64__
-            }
-        },
-        broadcast_input, non_broadcast_input, output);
+                }
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -397,72 +432,78 @@
         const auto voffset = vdupq_n_f32(offset);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = input1.ptr();
-            const auto input2_ptr = input2.ptr();
-            const auto output_ptr = output.ptr();
-
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const uint8x16_t a = vld1q_u8(input1_ptr + x);
-                const uint8x16_t b = vld1q_u8(input2_ptr + x);
+                const auto input1_ptr = input1.ptr();
+                const auto input2_ptr = input2.ptr();
+                const auto output_ptr = output.ptr();
 
-                const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
-                const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
-                const auto b_u16_0 = vmovl_u8(vget_low_u8(b));
-                const auto b_u16_1 = vmovl_u8(vget_high_u8(b));
+                // Compute S elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const uint8x16_t a = vld1q_u8(input1_ptr + x);
+                    const uint8x16_t b = vld1q_u8(input2_ptr + x);
 
-                const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
-                const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
-                const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
-                const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
+                    const auto a_u16_0 = vmovl_u8(vget_low_u8(a));
+                    const auto a_u16_1 = vmovl_u8(vget_high_u8(a));
+                    const auto b_u16_0 = vmovl_u8(vget_low_u8(b));
+                    const auto b_u16_1 = vmovl_u8(vget_high_u8(b));
 
-                const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2);
-                const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2);
-                const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2);
-                const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2);
+                    const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_0))), vscale1);
+                    const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_0))), vscale1);
+                    const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_low_u16(a_u16_1))), vscale1);
+                    const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_u32(vmovl_u16(vget_high_u16(a_u16_1))), vscale1);
 
-                int32x4_t rf_0{};
-                int32x4_t rf_1{};
-                int32x4_t rf_2{};
-                int32x4_t rf_3{};
+                    const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_0))), vscale2);
+                    const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_0))), vscale2);
+                    const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_u32(vmovl_u16(vget_low_u16(b_u16_1))), vscale2);
+                    const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_u32(vmovl_u16(vget_high_u16(b_u16_1))), vscale2);
+
+                    int32x4_t rf_0{};
+                    int32x4_t rf_1{};
+                    int32x4_t rf_2{};
+                    int32x4_t rf_3{};
 
 #ifdef __aarch64__
-                rf_0 = vcvtnq_s32_f32(bf_0);
-                rf_1 = vcvtnq_s32_f32(bf_1);
-                rf_2 = vcvtnq_s32_f32(bf_2);
-                rf_3 = vcvtnq_s32_f32(bf_3);
+                    rf_0 = vcvtnq_s32_f32(bf_0);
+                    rf_1 = vcvtnq_s32_f32(bf_1);
+                    rf_2 = vcvtnq_s32_f32(bf_2);
+                    rf_3 = vcvtnq_s32_f32(bf_3);
 #else  //__aarch64__
-                rf_0 = vcvtq_s32_f32(bf_0);
-                rf_1 = vcvtq_s32_f32(bf_1);
-                rf_2 = vcvtq_s32_f32(bf_2);
-                rf_3 = vcvtq_s32_f32(bf_3);
+                    rf_0          = vcvtq_s32_f32(bf_0);
+                    rf_1          = vcvtq_s32_f32(bf_1);
+                    rf_2          = vcvtq_s32_f32(bf_2);
+                    rf_3          = vcvtq_s32_f32(bf_3);
 #endif //__aarch64__
 
-                const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
-                const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
-                vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
-            }
+                    const uint8x8_t pa = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+                    const uint8x8_t pb = vqmovun_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+                    vst1q_u8(output_ptr + x, vcombine_u8(pa, pb));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
 #ifdef __aarch64__
-                output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
+                    output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::lround(result));
 #else  // __aarch64__
-                output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
+                    output_ptr[x] = utility::clamp<int, uint8_t>(support::cpp11::trunc(result));
 #endif // __aarch64__
-            }
-        },
-        input1, input2, output);
+                }
+            },
+            input1, input2, output);
     }
 }
 
-void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition)
+void add_sub_qasymm8_signed_neon(const ITensor       *src0,
+                                 const ITensor       *src1,
+                                 ITensor             *dst,
+                                 const ConvertPolicy &policy,
+                                 const Window        &window,
+                                 bool                 is_addition)
 {
     ARM_COMPUTE_UNUSED(policy);
 
@@ -487,7 +528,7 @@
     const auto scale2 = is_addition ? (iq2_info.scale / oq_info.scale) : (-(iq2_info.scale / oq_info.scale));
     const auto offset = float(oq_info.offset) - scale1 * float(iq1_info.offset) - scale2 * float(iq2_info.offset);
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -507,63 +548,64 @@
         Iterator output(dst, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<int8_t *>(output.ptr());
-
-            const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
-            const auto bf              = vdupq_n_f32(float(broadcast_value) * scale2 + offset);
-            const auto bfs             = float(broadcast_value) * bf_scale + offset;
-
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
+                const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<int8_t *>(output.ptr());
 
-                const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
-                const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
+                const auto broadcast_value = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+                const auto bf              = vdupq_n_f32(float(broadcast_value) * scale2 + offset);
+                const auto bfs             = float(broadcast_value) * bf_scale + offset;
 
-                const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
-                const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
-                const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
-                const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
+                // Compute S elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const int8x16_t a = vld1q_s8(non_broadcast_input_ptr + x);
 
-                int32x4_t rf_0{};
-                int32x4_t rf_1{};
-                int32x4_t rf_2{};
-                int32x4_t rf_3{};
+                    const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
+                    const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
+
+                    const auto af_0 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
+                    const auto af_1 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
+                    const auto af_2 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
+                    const auto af_3 = vmlaq_f32(bf, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
+
+                    int32x4_t rf_0{};
+                    int32x4_t rf_1{};
+                    int32x4_t rf_2{};
+                    int32x4_t rf_3{};
 
 #ifdef __aarch64__
-                rf_0 = vcvtnq_s32_f32(af_0);
-                rf_1 = vcvtnq_s32_f32(af_1);
-                rf_2 = vcvtnq_s32_f32(af_2);
-                rf_3 = vcvtnq_s32_f32(af_3);
+                    rf_0 = vcvtnq_s32_f32(af_0);
+                    rf_1 = vcvtnq_s32_f32(af_1);
+                    rf_2 = vcvtnq_s32_f32(af_2);
+                    rf_3 = vcvtnq_s32_f32(af_3);
 #else  //__aarch64__
-                rf_0 = vcvtq_s32_f32(af_0);
-                rf_1 = vcvtq_s32_f32(af_1);
-                rf_2 = vcvtq_s32_f32(af_2);
-                rf_3 = vcvtq_s32_f32(af_3);
+                    rf_0          = vcvtq_s32_f32(af_0);
+                    rf_1          = vcvtq_s32_f32(af_1);
+                    rf_2          = vcvtq_s32_f32(af_2);
+                    rf_3          = vcvtq_s32_f32(af_3);
 #endif //__aarch64__
 
-                const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
-                const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
-                vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
-            }
+                    const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+                    const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+                    vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const auto result = float(non_broadcast_input_ptr[x]) * af_scale + bfs;
 #ifdef __aarch64__
-                output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
+                    output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
 #else  // __aarch64__
-                output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
+                    output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
 #endif // __aarch64__
-            }
-        },
-        broadcast_input, non_broadcast_input, output);
+                }
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -580,79 +622,102 @@
         const auto voffset = vdupq_n_f32(offset);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const int8x16_t a = vld1q_s8(input1_ptr + x);
-                const int8x16_t b = vld1q_s8(input2_ptr + x);
+                const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
 
-                const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
-                const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
-                const auto b_s16_0 = vmovl_s8(vget_low_s8(b));
-                const auto b_s16_1 = vmovl_s8(vget_high_s8(b));
+                // Compute S elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const int8x16_t a = vld1q_s8(input1_ptr + x);
+                    const int8x16_t b = vld1q_s8(input2_ptr + x);
 
-                const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
-                const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
-                const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
-                const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
+                    const auto a_s16_0 = vmovl_s8(vget_low_s8(a));
+                    const auto a_s16_1 = vmovl_s8(vget_high_s8(a));
+                    const auto b_s16_0 = vmovl_s8(vget_low_s8(b));
+                    const auto b_s16_1 = vmovl_s8(vget_high_s8(b));
 
-                const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2);
-                const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2);
-                const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2);
-                const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2);
+                    const auto af_0 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_0))), vscale1);
+                    const auto af_1 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_0))), vscale1);
+                    const auto af_2 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_low_s16(a_s16_1))), vscale1);
+                    const auto af_3 = vmlaq_f32(voffset, vcvtq_f32_s32(vmovl_s16(vget_high_s16(a_s16_1))), vscale1);
 
-                int32x4_t rf_0{};
-                int32x4_t rf_1{};
-                int32x4_t rf_2{};
-                int32x4_t rf_3{};
+                    const auto bf_0 = vmlaq_f32(af_0, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_0))), vscale2);
+                    const auto bf_1 = vmlaq_f32(af_1, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_0))), vscale2);
+                    const auto bf_2 = vmlaq_f32(af_2, vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_s16_1))), vscale2);
+                    const auto bf_3 = vmlaq_f32(af_3, vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_s16_1))), vscale2);
+
+                    int32x4_t rf_0{};
+                    int32x4_t rf_1{};
+                    int32x4_t rf_2{};
+                    int32x4_t rf_3{};
 
 #ifdef __aarch64__
-                rf_0 = vcvtnq_s32_f32(bf_0);
-                rf_1 = vcvtnq_s32_f32(bf_1);
-                rf_2 = vcvtnq_s32_f32(bf_2);
-                rf_3 = vcvtnq_s32_f32(bf_3);
+                    rf_0 = vcvtnq_s32_f32(bf_0);
+                    rf_1 = vcvtnq_s32_f32(bf_1);
+                    rf_2 = vcvtnq_s32_f32(bf_2);
+                    rf_3 = vcvtnq_s32_f32(bf_3);
 #else  //__aarch64__
-                rf_0 = vcvtq_s32_f32(bf_0);
-                rf_1 = vcvtq_s32_f32(bf_1);
-                rf_2 = vcvtq_s32_f32(bf_2);
-                rf_3 = vcvtq_s32_f32(bf_3);
+                    rf_0          = vcvtq_s32_f32(bf_0);
+                    rf_1          = vcvtq_s32_f32(bf_1);
+                    rf_2          = vcvtq_s32_f32(bf_2);
+                    rf_3          = vcvtq_s32_f32(bf_3);
 #endif //__aarch64__
 
-                const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
-                const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
-                vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
-            }
+                    const int8x8_t pa = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1)));
+                    const int8x8_t pb = vqmovn_s16(vcombine_s16(vqmovn_s32(rf_2), vqmovn_s32(rf_3)));
+                    vst1q_s8(output_ptr + x, vcombine_s8(pa, pb));
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const auto result = float(input1_ptr[x]) * scale1 + float(input2_ptr[x]) * scale2 + offset;
 #ifdef __aarch64__
-                output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
+                    output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::lround(result));
 #else  // __aarch64__
-                output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
+                    output_ptr[x] = utility::clamp<int, int8_t>(support::cpp11::trunc(result));
 #endif // __aarch64__
-            }
-        },
-        input1, input2, output);
+                }
+            },
+            input1, input2, output);
     }
 }
 
-template void add_q8_neon_fixedpoint<int8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_q8_neon_fixedpoint<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_q8_neon_fixedpoint<int8_t>(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_q8_neon_fixedpoint<uint8_t>(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 
-template void add_sub_q8_neon_fixedpoint<int8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
-template void add_sub_q8_neon_fixedpoint<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
+template void add_sub_q8_neon_fixedpoint<int8_t>(const ITensor       *src0,
+                                                 const ITensor       *src1,
+                                                 ITensor             *dst,
+                                                 const ConvertPolicy &policy,
+                                                 const Window        &window,
+                                                 bool                 is_addition);
+template void add_sub_q8_neon_fixedpoint<uint8_t>(const ITensor       *src0,
+                                                  const ITensor       *src1,
+                                                  ITensor             *dst,
+                                                  const ConvertPolicy &policy,
+                                                  const Window        &window,
+                                                  bool                 is_addition);
 
-void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
-void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
+void add_sub_qasymm8_neon(const ITensor       *src0,
+                          const ITensor       *src1,
+                          ITensor             *dst,
+                          const ConvertPolicy &policy,
+                          const Window        &window,
+                          bool                 is_addition);
+void add_sub_qasymm8_signed_neon(const ITensor       *src0,
+                                 const ITensor       *src1,
+                                 ITensor             *dst,
+                                 const ConvertPolicy &policy,
+                                 const Window        &window,
+                                 bool                 is_addition);
 
 } // namespace cpu
 } // namespace arm_compute
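
Aside on the #ifdef __aarch64__ blocks that recur in the hunks above: the AArch64 build converts with vcvtnq_s32_f32 (round to nearest) and lround for the scalar leftovers, while the AArch32 fallback uses vcvtq_s32_f32 and trunc (round toward zero), so the two targets can round the same float differently. A minimal sketch of that split, assuming an arm_neon.h toolchain; the helper names below are illustrative, not library API:

    #include <arm_neon.h>
    #include <cmath>
    #include <cstdint>

    // Vector conversion: round-to-nearest on AArch64, truncation on the
    // AArch32 fallback, matching the #ifdef split the kernels keep.
    inline int32x4_t to_s32x4(float32x4_t v)
    {
    #ifdef __aarch64__
        return vcvtnq_s32_f32(v); // round to nearest, ties to even
    #else
        return vcvtq_s32_f32(v); // round toward zero
    #endif
    }

    // Scalar tail elements follow the same convention; std:: equivalents
    // of the library's support::cpp11::lround/trunc shown here.
    inline int32_t to_s32(float v)
    {
    #ifdef __aarch64__
        return static_cast<int32_t>(std::lround(v));
    #else
        return static_cast<int32_t>(std::trunc(v));
    #endif
    }
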
diff --git a/src/cpu/kernels/add/generic/neon/impl.h b/src/cpu/kernels/add/generic/neon/impl.h
index fb786c5..faa99ba 100644
--- a/src/cpu/kernels/add/generic/neon/impl.h
+++ b/src/cpu/kernels/add/generic/neon/impl.h
@@ -26,8 +26,9 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+#include "arm_compute/core/Window.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
@@ -35,7 +36,8 @@
 namespace cpu
 {
 template <typename ScalarType>
-void add_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_same_neon(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     /** SIMD vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<ScalarType, wrapper::traits::BitWidth::W128>;
@@ -53,7 +55,7 @@
     const auto    window_end_x          = static_cast<int>(window.x().end());
     const bool    is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -69,31 +71,36 @@
         Iterator output(dst, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<ScalarType *>(output.ptr());
-
-            const ScalarType broadcast_value     = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
-            const auto       broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
-                const auto res             = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v) : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
-                wrapper::vstore(output_ptr + x, res);
-            }
+                const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<ScalarType *>(output.ptr());
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
-                *(output_ptr + x)          = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(broadcast_value, non_broadcast_v) : broadcast_value + non_broadcast_v;
-            }
-        },
-        broadcast_input, non_broadcast_input, output);
+                const ScalarType broadcast_value     = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+                const auto       broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+
+                // Compute S elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+                    const auto res             = (policy == ConvertPolicy::SATURATE)
+                                                     ? wrapper::vqadd(broadcast_value_vec, non_broadcast_v)
+                                                     : wrapper::vadd(broadcast_value_vec, non_broadcast_v);
+                    wrapper::vstore(output_ptr + x, res);
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+                    *(output_ptr + x)          = (policy == ConvertPolicy::SATURATE)
+                                                     ? wrapper::add_sat(broadcast_value, non_broadcast_v)
+                                                     : broadcast_value + non_broadcast_v;
+                }
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -106,31 +113,34 @@
         Iterator output(dst, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const auto val1 = wrapper::vloadq(input1_ptr + x);
-                const auto val2 = wrapper::vloadq(input2_ptr + x);
-                const auto res  = (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
-                wrapper::vstore(output_ptr + x, res);
-            }
+                const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto val1   = *(input1_ptr + x);
-                const auto val2   = *(input2_ptr + x);
-                *(output_ptr + x) = (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
-            }
-        },
-        input1, input2, output);
+                // Compute S elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const auto val1 = wrapper::vloadq(input1_ptr + x);
+                    const auto val2 = wrapper::vloadq(input2_ptr + x);
+                    const auto res =
+                        (policy == ConvertPolicy::SATURATE) ? wrapper::vqadd(val1, val2) : wrapper::vadd(val1, val2);
+                    wrapper::vstore(output_ptr + x, res);
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const auto val1 = *(input1_ptr + x);
+                    const auto val2 = *(input2_ptr + x);
+                    *(output_ptr + x) =
+                        (policy == ConvertPolicy::SATURATE) ? wrapper::add_sat(val1, val2) : val1 + val2;
+                }
+            },
+            input1, input2, output);
     }
 }
 
@@ -138,17 +148,36 @@
 
 bool sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst);
 
-bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, bool is_addition);
+bool add_sub_q8_neon_fixedpoint_possible(const ITensorInfo *src0,
+                                         const ITensorInfo *src1,
+                                         const ITensorInfo *dst,
+                                         bool               is_addition);
 
-void add_sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
+void add_sub_qasymm8_neon(const ITensor       *src0,
+                          const ITensor       *src1,
+                          ITensor             *dst,
+                          const ConvertPolicy &policy,
+                          const Window        &window,
+                          bool                 is_addition);
 
-void add_sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
+void add_sub_qasymm8_signed_neon(const ITensor       *src0,
+                                 const ITensor       *src1,
+                                 ITensor             *dst,
+                                 const ConvertPolicy &policy,
+                                 const Window        &window,
+                                 bool                 is_addition);
 
 template <typename ScalarType>
-void add_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+void add_q8_neon_fixedpoint(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 
 template <typename ScalarType>
-void add_sub_q8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window, bool is_addition);
+void add_sub_q8_neon_fixedpoint(const ITensor       *src0,
+                                const ITensor       *src1,
+                                ITensor             *dst,
+                                const ConvertPolicy &policy,
+                                const Window        &window,
+                                bool                 is_addition);
 } // namespace cpu
 } // namespace arm_compute
 #endif // SRC_CORE_NEON_KERNELS_ADD_IMPL_H
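
On the ConvertPolicy branches reflowed in add_same_neon above: SATURATE selects the saturating NEON add, which clamps at the type bounds, while the other policy takes the plain modular add. A reduced illustration with raw intrinsics, assuming arm_neon.h; the wrapper::vqadd/vadd helpers in the diff dispatch to intrinsics like these per element type, and this standalone function is illustrative only:

    #include <arm_neon.h>
    #include <cstdint>

    // One 16-lane uint8 step of the policy branch shown in add_same_neon.
    void add_u8_policy(const uint8_t *a, const uint8_t *b, uint8_t *out, bool saturate)
    {
        const uint8x16_t va = vld1q_u8(a);
        const uint8x16_t vb = vld1q_u8(b);
        // vqaddq_u8 clamps at 255; vaddq_u8 wraps modulo 256.
        const uint8x16_t vr = saturate ? vqaddq_u8(va, vb) : vaddq_u8(va, vb);
        vst1q_u8(out, vr);
    }
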
diff --git a/src/cpu/kernels/add/generic/neon/integer.cpp b/src/cpu/kernels/add/generic/neon/integer.cpp
index 5698d6d..f0bcebc 100644
--- a/src/cpu/kernels/add/generic/neon/integer.cpp
+++ b/src/cpu/kernels/add/generic/neon/integer.cpp
@@ -28,19 +28,22 @@
 {
 namespace cpu
 {
-void add_u8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_u8_neon(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     return add_same_neon<uint8_t>(src0, src1, dst, policy, window);
 }
 
-void add_s16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_s16_neon(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     return add_same_neon<int16_t>(src0, src1, dst, policy, window);
 }
 
-void add_s32_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_s32_neon(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     return add_same_neon<int32_t>(src0, src1, dst, policy, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/qasymm8.cpp b/src/cpu/kernels/add/generic/neon/qasymm8.cpp
index 69cca95..8195d22 100644
--- a/src/cpu/kernels/add/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/add/generic/neon/qasymm8.cpp
@@ -23,15 +23,17 @@
  */
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/cpu/kernels/add/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void add_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_neon(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     add_sub_qasymm8_neon(src0, src1, dst, policy, window, true /*is_addition*/);
 }
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
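
qasymm8.cpp here (and qasymm8_signed.cpp below) are thin wrappers that call the shared add_sub_* kernels with is_addition = true; subtraction reuses the same code path by negating the second input's folded scale, as the scale2/offset lines earlier in this diff show. A sketch of that folding, with illustrative names:

    #include <cstdint>

    struct QParams
    {
        float   scale;
        int32_t offset;
    };

    // Fold the two input quantizations and the output quantization into
    // scale1, scale2 and a single offset; the kernel then computes
    // float(a) * scale1 + float(b) * scale2 + offset and rounds.
    void fold_qparams(QParams in1, QParams in2, QParams out, bool is_addition,
                      float &scale1, float &scale2, float &offset)
    {
        scale1 = in1.scale / out.scale;
        scale2 = is_addition ? (in2.scale / out.scale) : -(in2.scale / out.scale);
        offset = float(out.offset) - scale1 * float(in1.offset) - scale2 * float(in2.offset);
    }
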
diff --git a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp
index dfdf8fe..7e23096 100644
--- a/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/add/generic/neon/qasymm8_signed.cpp
@@ -23,15 +23,17 @@
  */
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/cpu/kernels/add/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void add_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_signed_neon(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, true /*is_addition*/);
 }
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/neon/qsymm16.cpp b/src/cpu/kernels/add/generic/neon/qsymm16.cpp
index e76e408..ac2de05 100644
--- a/src/cpu/kernels/add/generic/neon/qsymm16.cpp
+++ b/src/cpu/kernels/add/generic/neon/qsymm16.cpp
@@ -25,14 +25,16 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void add_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qsymm16_neon(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     ARM_COMPUTE_UNUSED(policy);
 
@@ -57,7 +59,7 @@
     const float32x4_t vscale2    = vdupq_n_f32(iq2_info.scale);
     const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
         Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -65,7 +67,7 @@
         const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? src1 : src0;
         const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
         const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
-        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
+        const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
 
         // Clear X Dimension on execution window as we handle manually
         non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -74,48 +76,50 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(dst, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<int16_t *>(output.ptr());
-
-            const int16_t   broadcast_value     = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
-            const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
-
-            const auto  bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2);
-            const auto  bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2);
-            const float bfs  = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
-
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const int16x8_t a    = vld1q_s16(non_broadcast_input_ptr + x);
-                const auto      af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
-                const auto      af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
+                const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<int16_t *>(output.ptr());
 
-                int32x4_t rf_0{};
-                int32x4_t rf_1{};
+                const int16_t   broadcast_value     = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
+                const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
+
+                const auto  bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2);
+                const auto  bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2);
+                const float bfs  = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
+
+                // Compute S elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const int16x8_t a    = vld1q_s16(non_broadcast_input_ptr + x);
+                    const auto      af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
+                    const auto      af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
+
+                    int32x4_t rf_0{};
+                    int32x4_t rf_1{};
 #ifdef __aarch64__
-                rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
-                rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+                    rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+                    rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
 #else  //__aarch64__
-                rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
-                rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+                    rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+                    rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
 #endif //__aarch64__
 
-                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
-                vst1q_s16(output_ptr + x, pa);
-            }
+                    const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
+                    vst1q_s16(output_ptr + x, pa);
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
-                *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
-            }
-        },
-        broadcast_input, non_broadcast_input, output);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
+                    *(output_ptr + x) = quantize_qsymm16((afs + bfs), oq_info);
+                }
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -127,48 +131,50 @@
         Iterator input2(src1, input2_win);
         Iterator output(dst, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const int16x8_t a = vld1q_s16(input1_ptr + x);
-                const int16x8_t b = vld1q_s16(input2_ptr + x);
+                const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
 
-                const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
-                const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
-                const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2);
-                const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2);
+                // Compute S elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const int16x8_t a = vld1q_s16(input1_ptr + x);
+                    const int16x8_t b = vld1q_s16(input2_ptr + x);
 
-                int32x4_t rf_0{};
-                int32x4_t rf_1{};
+                    const auto af_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1);
+                    const auto af_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1);
+                    const auto bf_0 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2);
+                    const auto bf_1 = vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2);
+
+                    int32x4_t rf_0{};
+                    int32x4_t rf_1{};
 #ifdef __aarch64__
-                rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
-                rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+                    rf_0 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+                    rf_1 = vcvtnq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
 #else  //__aarch64__
-                rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
-                rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
+                    rf_0 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_0, bf_0), invvscaleo));
+                    rf_1 = vcvtq_s32_f32(vmulq_f32(vaddq_f32(af_1, bf_1), invvscaleo));
 #endif //__aarch64__
 
-                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
-                vst1q_s16(output_ptr + x, pa);
-            }
+                    const int16x8_t pa = vcombine_s16(vqmovn_s32(rf_0), vqmovn_s32(rf_1));
+                    vst1q_s16(output_ptr + x, pa);
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const float afs   = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
-                const float bfs   = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
-                *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info());
-            }
-        },
-        input1, input2, output);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const float afs   = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
+                    const float bfs   = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
+                    *(output_ptr + x) = quantize_qsymm16((afs + bfs), dst->info()->quantization_info());
+                }
+            },
+            input1, input2, output);
     }
 }
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
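
The QSYMM16 kernel above is dequantize-add-requantize in float: each input is multiplied by its own scale, the sum is multiplied by 1/output_scale (the invvscaleo vector), rounded, and saturated back to 16 bits by vqmovn_s32. A scalar sketch under the same assumptions; names and the std::lround rounding choice are illustrative, not the library's quantize_qsymm16:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    int16_t add_qsymm16_scalar(int16_t a, int16_t b,
                               float scale1, float scale2, float inv_oscale)
    {
        const float sum = float(a) * scale1 + float(b) * scale2; // dequantize + add
        const long  q   = std::lround(sum * inv_oscale);         // requantize
        // Saturate to the int16 range, as vqmovn_s32 does per lane.
        return static_cast<int16_t>(std::clamp<long>(q, INT16_MIN, INT16_MAX));
    }
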
diff --git a/src/cpu/kernels/add/generic/sve/fp16.cpp b/src/cpu/kernels/add/generic/sve/fp16.cpp
index 581f3ab..01dfe6c 100644
--- a/src/cpu/kernels/add/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/add/generic/sve/fp16.cpp
@@ -31,10 +31,11 @@
 {
 namespace cpu
 {
-void add_fp16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_fp16_sve(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     return add_same_sve<float16_t>(src0, src1, dst, policy, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
 #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/add/generic/sve/fp32.cpp b/src/cpu/kernels/add/generic/sve/fp32.cpp
index b377991..56771a5 100644
--- a/src/cpu/kernels/add/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/add/generic/sve/fp32.cpp
@@ -24,15 +24,17 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+
 #include "src/cpu/kernels/add/generic/sve/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void add_fp32_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_fp32_sve(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     return add_same_sve<float>(src0, src1, dst, policy, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/add/generic/sve/impl.cpp b/src/cpu/kernels/add/generic/sve/impl.cpp
index e860643..ca850fc 100644
--- a/src/cpu/kernels/add/generic/sve/impl.cpp
+++ b/src/cpu/kernels/add/generic/sve/impl.cpp
@@ -23,17 +23,21 @@
  */
 
 #include "src/cpu/kernels/add/generic/sve/impl.h"
+
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+
 #include "src/core/NEON/SVEMath.h"
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include <arm_sve.h>
 namespace arm_compute
 {
 namespace cpu
 {
 template <typename ScalarType>
-void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_same_sve(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     const auto all_true_pg           = wrapper::svptrue<ScalarType>();
     const auto window_start_x        = static_cast<int>(window.x().start());
@@ -53,7 +57,7 @@
     Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
     Iterator output(dst, window);
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -68,28 +72,30 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(dst, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<ScalarType *>(output.ptr());
-
-            const ScalarType broadcast_value     = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
-            const auto       broadcast_value_vec = wrapper::svdup_n(broadcast_value);
-
-            int      x  = window_start_x;
-            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
-                auto       res             = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v) : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
-                svst1(pg, output_ptr + x, res);
+                const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<ScalarType *>(output.ptr());
 
-                x += wrapper::svcnt<ScalarType>();
-                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        broadcast_input, non_broadcast_input, output);
+                const ScalarType broadcast_value     = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+                const auto       broadcast_value_vec = wrapper::svdup_n(broadcast_value);
+
+                int      x  = window_start_x;
+                svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+                do
+                {
+                    const auto non_broadcast_v = svld1(pg, non_broadcast_input_ptr + x);
+                    auto       res             = is_sat ? wrapper::svqadd(broadcast_value_vec, non_broadcast_v)
+                                                        : svadd_z(pg, broadcast_value_vec, non_broadcast_v);
+                    svst1(pg, output_ptr + x, res);
+
+                    x += wrapper::svcnt<ScalarType>();
+                    pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -101,35 +107,41 @@
         Iterator input2(src1, input2_win);
         Iterator output(dst, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
-            int      x  = window_start_x;
-            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto val1 = svld1(pg, input1_ptr + x);
-                const auto val2 = svld1(pg, input2_ptr + x);
-                const auto res  = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
-                svst1(pg, output_ptr + x, res);
+                const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
 
-                x += wrapper::svcnt<ScalarType>();
-                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        input1, input2, output);
+                int      x  = window_start_x;
+                svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+                do
+                {
+                    const auto val1 = svld1(pg, input1_ptr + x);
+                    const auto val2 = svld1(pg, input2_ptr + x);
+                    const auto res  = is_sat ? wrapper::svqadd(val1, val2) : svadd_z(pg, val1, val2);
+                    svst1(pg, output_ptr + x, res);
+
+                    x += wrapper::svcnt<ScalarType>();
+                    pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            input1, input2, output);
     }
 }
-template void add_same_sve<float>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<uint8_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<int16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
-template void add_same_sve<int32_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<float>(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<uint8_t>(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<int16_t>(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<int32_t>(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-template void add_same_sve<float16_t>(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+template void add_same_sve<float16_t>(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 #endif /* (__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
 } // namespace cpu
 } // namespace arm_compute
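
Unlike the NEON kernels, the SVE loops reformatted above need no separate leftover pass: svwhilelt builds a predicate covering only the lanes still in range, and the loop runs while svptest_any reports an active lane. A minimal standalone sketch of the pattern, assuming an arm_sve.h toolchain; the function name is illustrative:

    #include <arm_sve.h>
    #include <cstdint>

    void add_f32_sve_sketch(const float *a, const float *b, float *out, int64_t n)
    {
        int64_t  x  = 0;
        svbool_t pg = svwhilelt_b32(x, n); // active where x + lane < n
        do
        {
            const svfloat32_t va = svld1(pg, a + x);
            const svfloat32_t vb = svld1(pg, b + x);
            svst1(pg, out + x, svadd_z(pg, va, vb)); // inactive lanes not stored
            x += svcntw();                           // 32-bit lanes per vector
            pg = svwhilelt_b32(x, n);
        } while (svptest_any(svptrue_b32(), pg));
    }
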
diff --git a/src/cpu/kernels/add/generic/sve/impl.h b/src/cpu/kernels/add/generic/sve/impl.h
index 0136f14..6a95d66 100644
--- a/src/cpu/kernels/add/generic/sve/impl.h
+++ b/src/cpu/kernels/add/generic/sve/impl.h
@@ -33,7 +33,8 @@
 namespace cpu
 {
 template <typename ScalarType>
-void add_same_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
+void add_same_sve(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window);
 } // namespace cpu
 } // namespace arm_compute
 #endif // SRC_CORE_SVE_KERNELS_ADD_IMPL_H
diff --git a/src/cpu/kernels/add/generic/sve/integer.cpp b/src/cpu/kernels/add/generic/sve/integer.cpp
index 3642dcc..4d17f2a 100644
--- a/src/cpu/kernels/add/generic/sve/integer.cpp
+++ b/src/cpu/kernels/add/generic/sve/integer.cpp
@@ -24,25 +24,29 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
+
 #include "src/cpu/kernels/add/generic/sve/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void add_u8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_u8_sve(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     return add_same_sve<uint8_t>(src0, src1, dst, policy, window);
 }
 
-void add_s16_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_s16_sve(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     return add_same_sve<int16_t>(src0, src1, dst, policy, window);
 }
 
-void add_s32_sve(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_s32_sve(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     return add_same_sve<int32_t>(src0, src1, dst, policy, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
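
The SVE2 QASYMM8 kernel that follows this header widens each u8 vector in two stages before dequantizing: svmovlb extracts and zero-extends the even (bottom) lanes, svmovlt the odd (top) lanes, so one u16 pair followed by two u32 pairs fans a single u8 register out into the four 32-bit groups the hunk names af_0..af_3 and bf_0..bf_3. The widening stage in isolation, assuming arm_sve.h with SVE2 enabled; the function name is illustrative:

    #include <arm_sve.h>
    #include <cstdint>

    // One u8 vector -> four u32 vectors via bottom/top move-long pairs.
    void widen_u8_to_u32(svuint8_t a,
                         svuint32_t &w0, svuint32_t &w1,
                         svuint32_t &w2, svuint32_t &w3)
    {
        const svuint16_t lo = svmovlb_u16(a); // even u8 lanes, zero-extended
        const svuint16_t hi = svmovlt_u16(a); // odd  u8 lanes, zero-extended
        w0 = svmovlb_u32(lo);
        w1 = svmovlt_u32(lo);
        w2 = svmovlb_u32(hi);
        w3 = svmovlt_u32(hi);
    }
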
diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp
index 1dec214..40add9d 100644
--- a/src/cpu/kernels/add/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/add/generic/sve2/qasymm8.cpp
@@ -26,15 +26,18 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+
 #include "src/core/NEON/SVEMath.h"
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include <arm_sve.h>
 
 namespace arm_compute
 {
 namespace cpu
 {
-void add_qasymm8_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_sve2(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     ARM_COMPUTE_UNUSED(policy);
 
@@ -58,7 +61,7 @@
     const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
     const auto voffseto   = svdup_n_f32(oq_info.offset);
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -78,48 +81,89 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(dst, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());
-
-            const uint8_t   broadcast_value     = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
-            const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value);
-
-            int      x  = window_start_x;
-            svbool_t pg = svwhilelt_b8(x, window_end_x);
-
-            const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2);
-            const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))), voffset2)), vscale2);
-            const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2);
-            const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))), voffset2)), vscale2);
-
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x);
+                const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());
 
-                const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1);
-                const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1);
-                const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1);
-                const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1);
+                const uint8_t   broadcast_value     = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+                const svuint8_t broadcast_value_vec = svdup_n_u8(broadcast_value);
 
-                const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
-                const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
-                const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
-                const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+                int      x  = window_start_x;
+                svbool_t pg = svwhilelt_b8(x, window_end_x);
 
-                const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
-                const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
+                const auto bf_0 = svmul_f32_z(
+                    pg,
+                    svcvt_f32_s32_z(
+                        pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(broadcast_value_vec))),
+                                        voffset2)),
+                    vscale2);
+                const auto bf_1 = svmul_f32_z(
+                    pg,
+                    svcvt_f32_s32_z(
+                        pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(broadcast_value_vec))),
+                                        voffset2)),
+                    vscale2);
+                const auto bf_2 = svmul_f32_z(
+                    pg,
+                    svcvt_f32_s32_z(
+                        pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(broadcast_value_vec))),
+                                        voffset2)),
+                    vscale2);
+                const auto bf_3 = svmul_f32_z(
+                    pg,
+                    svcvt_f32_s32_z(
+                        pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(broadcast_value_vec))),
+                                        voffset2)),
+                    vscale2);
 
-                const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
-                svst1_u8(pg, output_ptr + x, res);
+                do
+                {
+                    const svuint8_t a = svld1_u8(pg, non_broadcast_input_ptr + x);
 
-                x += svcntb();
-                pg = svwhilelt_b8(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        broadcast_input, non_broadcast_input, output);
+                    const auto af_0 = svmul_f32_z(
+                        pg,
+                        svcvt_f32_s32_z(pg,
+                                        svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)),
+                        vscale1);
+                    const auto af_1 = svmul_f32_z(
+                        pg,
+                        svcvt_f32_s32_z(pg,
+                                        svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)),
+                        vscale1);
+                    const auto af_2 = svmul_f32_z(
+                        pg,
+                        svcvt_f32_s32_z(pg,
+                                        svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)),
+                        vscale1);
+                    const auto af_3 = svmul_f32_z(
+                        pg,
+                        svcvt_f32_s32_z(pg,
+                                        svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)),
+                        vscale1);
+
+                    const auto rf_0 =
+                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+                    const auto rf_1 =
+                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+                    const auto rf_2 =
+                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+                    const auto rf_3 =
+                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+                    const auto pa = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
+                    const auto pb = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
+
+                    const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
+                    svst1_u8(pg, output_ptr + x, res);
+
+                    x += svcntb();
+                    pg = svwhilelt_b8(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -136,45 +180,82 @@
         const auto voffset1 = svdup_n_s32(iq1_info.offset);
         const auto voffset2 = svdup_n_s32(iq2_info.offset);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
-            int      x  = window_start_x;
-            svbool_t pg = svwhilelt_b8(x, window_end_x);
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto a    = svld1_u8(pg, input1_ptr + x);
-                const auto b    = svld1_u8(pg, input2_ptr + x);
-                const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)), vscale1);
-                const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)), vscale1);
-                const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)), vscale1);
-                const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)), vscale1);
+                const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
 
-                const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)), vscale2);
-                const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)), vscale2);
-                const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)), vscale2);
-                const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)), vscale2);
+                int      x  = window_start_x;
+                svbool_t pg = svwhilelt_b8(x, window_end_x);
+                do
+                {
+                    const auto a    = svld1_u8(pg, input1_ptr + x);
+                    const auto b    = svld1_u8(pg, input2_ptr + x);
+                    const auto af_0 = svmul_f32_z(
+                        pg,
+                        svcvt_f32_s32_z(pg,
+                                        svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(a))), voffset1)),
+                        vscale1);
+                    const auto af_1 = svmul_f32_z(
+                        pg,
+                        svcvt_f32_s32_z(pg,
+                                        svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(a))), voffset1)),
+                        vscale1);
+                    const auto af_2 = svmul_f32_z(
+                        pg,
+                        svcvt_f32_s32_z(pg,
+                                        svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(a))), voffset1)),
+                        vscale1);
+                    const auto af_3 = svmul_f32_z(
+                        pg,
+                        svcvt_f32_s32_z(pg,
+                                        svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(a))), voffset1)),
+                        vscale1);
 
-                const auto rf_0 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
-                const auto rf_1 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
-                const auto rf_2 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
-                const auto rf_3 = svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+                    const auto bf_0 = svmul_f32_z(
+                        pg,
+                        svcvt_f32_s32_z(pg,
+                                        svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlb_u16(b))), voffset2)),
+                        vscale2);
+                    const auto bf_1 = svmul_f32_z(
+                        pg,
+                        svcvt_f32_s32_z(pg,
+                                        svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlb_u16(b))), voffset2)),
+                        vscale2);
+                    const auto bf_2 = svmul_f32_z(
+                        pg,
+                        svcvt_f32_s32_z(pg,
+                                        svsub_s32_z(pg, svreinterpret_s32_u32(svmovlb_u32(svmovlt_u16(b))), voffset2)),
+                        vscale2);
+                    const auto bf_3 = svmul_f32_z(
+                        pg,
+                        svcvt_f32_s32_z(pg,
+                                        svsub_s32_z(pg, svreinterpret_s32_u32(svmovlt_u32(svmovlt_u16(b))), voffset2)),
+                        vscale2);
 
-                const auto pa  = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
-                const auto pb  = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
-                const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
+                    const auto rf_0 =
+                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+                    const auto rf_1 =
+                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+                    const auto rf_2 =
+                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+                    const auto rf_3 =
+                        svcvt_u32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
 
-                svst1_u8(pg, output_ptr + x, res);
+                    const auto pa  = svqxtnt_u32(svqxtnb_u32(rf_0), rf_1);
+                    const auto pb  = svqxtnt_u32(svqxtnb_u32(rf_2), rf_3);
+                    const auto res = svqxtnt_u16(svqxtnb_u16(pa), pb);
 
-                x += svcntb();
-                pg = svwhilelt_b8(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        input1, input2, output);
+                    svst1_u8(pg, output_ptr + x, res);
+
+                    x += svcntb();
+                    pg = svwhilelt_b8(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            input1, input2, output);
     }
 }
 } // namespace cpu
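
[Editor's note: the re-indented lambda above is the usual SVE tail-predicated loop: build a predicate with svwhilelt, process one vector's worth of lanes, advance by the hardware vector length, and stop once no lane is active. A minimal sketch of that control flow, assuming an SVE-enabled compiler (e.g. -march=armv8.2-a+sve); the plain add stands in for the kernel's dequantize/requantize arithmetic:

    #include <arm_sve.h>
    #include <cstdint>

    void add_u8(const uint8_t *a, const uint8_t *b, uint8_t *dst, int64_t n)
    {
        int64_t  x  = 0;
        svbool_t pg = svwhilelt_b8(x, n); // lanes with x + lane < n are active
        do
        {
            // Predicated loads/stores: inactive lanes are neither read nor written.
            const svuint8_t va = svld1_u8(pg, a + x);
            const svuint8_t vb = svld1_u8(pg, b + x);
            svst1_u8(pg, dst + x, svadd_u8_z(pg, va, vb));

            x += svcntb();           // advance by the vector length in bytes
            pg = svwhilelt_b8(x, n); // shrink the predicate for the tail
        } while (svptest_any(svptrue_b8(), pg)); // loop while any lane is active
    }
]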
diff --git a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp
index dae8899..2e58511 100644
--- a/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/add/generic/sve2/qasymm8_signed.cpp
@@ -26,15 +26,18 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+
 #include "src/core/NEON/SVEMath.h"
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include <arm_sve.h>
 
 namespace arm_compute
 {
 namespace cpu
 {
-void add_qasymm8_signed_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qasymm8_signed_sve2(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     ARM_COMPUTE_UNUSED(policy);
 
@@ -57,7 +60,7 @@
     const auto invvscaleo = svdup_n_f32(1.f / oq_info.scale);
     const auto voffseto   = svdup_n_f32(oq_info.offset);
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -78,46 +81,63 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(dst, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<int8_t *>(output.ptr());
-
-            const int8_t broadcast_value     = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
-            const auto   broadcast_value_vec = svdup_n_s8(broadcast_value);
-
-            int        x    = window_start_x;
-            svbool_t   pg   = svwhilelt_b8(x, window_end_x);
-            const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
-            const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)), vscale2);
-            const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
-            const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)), vscale2);
-
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto a    = svld1_s8(pg, non_broadcast_input_ptr + x);
-                const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
-                const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
-                const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
-                const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
+                const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<int8_t *>(output.ptr());
 
-                const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
-                const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
-                const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
-                const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+                const int8_t broadcast_value     = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+                const auto   broadcast_value_vec = svdup_n_s8(broadcast_value);
 
-                const auto pa  = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
-                const auto pb  = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
-                const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
+                int        x    = window_start_x;
+                svbool_t   pg   = svwhilelt_b8(x, window_end_x);
+                const auto bf_0 = svmul_f32_z(
+                    pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(broadcast_value_vec)), voffset2)),
+                    vscale2);
+                const auto bf_1 = svmul_f32_z(
+                    pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(broadcast_value_vec)), voffset2)),
+                    vscale2);
+                const auto bf_2 = svmul_f32_z(
+                    pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(broadcast_value_vec)), voffset2)),
+                    vscale2);
+                const auto bf_3 = svmul_f32_z(
+                    pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(broadcast_value_vec)), voffset2)),
+                    vscale2);
 
-                svst1_s8(pg, output_ptr + x, res);
+                do
+                {
+                    const auto a    = svld1_s8(pg, non_broadcast_input_ptr + x);
+                    const auto af_0 = svmul_f32_z(
+                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
+                    const auto af_1 = svmul_f32_z(
+                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
+                    const auto af_2 = svmul_f32_z(
+                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
+                    const auto af_3 = svmul_f32_z(
+                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
 
-                x += svcntb();
-                pg = svwhilelt_b8(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        broadcast_input, non_broadcast_input, output);
+                    const auto rf_0 =
+                        svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+                    const auto rf_1 =
+                        svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+                    const auto rf_2 =
+                        svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+                    const auto rf_3 =
+                        svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+
+                    const auto pa  = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+                    const auto pb  = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
+                    const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
+
+                    svst1_s8(pg, output_ptr + x, res);
+
+                    x += svcntb();
+                    pg = svwhilelt_b8(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -134,46 +154,59 @@
         const auto voffset1 = svdup_n_s32(iq1_info.offset);
         const auto voffset2 = svdup_n_s32(iq2_info.offset);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
-            int      x  = window_start_x;
-            svbool_t pg = svwhilelt_b8(x, window_end_x);
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto a = svld1_s8(pg, input1_ptr + x);
-                const auto b = svld1_s8(pg, input2_ptr + x);
+                const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
 
-                const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
-                const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
-                const auto af_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
-                const auto af_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
+                int      x  = window_start_x;
+                svbool_t pg = svwhilelt_b8(x, window_end_x);
+                do
+                {
+                    const auto a = svld1_s8(pg, input1_ptr + x);
+                    const auto b = svld1_s8(pg, input2_ptr + x);
 
-                const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2);
-                const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2);
-                const auto bf_2 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2);
-                const auto bf_3 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2);
+                    const auto af_0 = svmul_f32_z(
+                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(a)), voffset1)), vscale1);
+                    const auto af_1 = svmul_f32_z(
+                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(a)), voffset1)), vscale1);
+                    const auto af_2 = svmul_f32_z(
+                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(a)), voffset1)), vscale1);
+                    const auto af_3 = svmul_f32_z(
+                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(a)), voffset1)), vscale1);
 
-                const auto rf_0 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
-                const auto rf_1 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
-                const auto rf_2 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
-                const auto rf_3 = svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
+                    const auto bf_0 = svmul_f32_z(
+                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlb_s16(b)), voffset2)), vscale2);
+                    const auto bf_1 = svmul_f32_z(
+                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlb_s16(b)), voffset2)), vscale2);
+                    const auto bf_2 = svmul_f32_z(
+                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlb_s32(svmovlt_s16(b)), voffset2)), vscale2);
+                    const auto bf_3 = svmul_f32_z(
+                        pg, svcvt_f32_s32_z(pg, svsub_s32_z(pg, svmovlt_s32(svmovlt_s16(b)), voffset2)), vscale2);
 
-                const auto pa  = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
-                const auto pb  = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
-                const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
+                    const auto rf_0 =
+                        svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+                    const auto rf_1 =
+                        svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+                    const auto rf_2 =
+                        svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_2, bf_2), invvscaleo));
+                    const auto rf_3 =
+                        svcvt_s32_f32_z(pg, svmla_f32_z(pg, voffseto, svadd_f32_z(pg, af_3, bf_3), invvscaleo));
 
-                svst1_s8(pg, output_ptr + x, res);
+                    const auto pa  = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+                    const auto pb  = svqxtnt_s32(svqxtnb_s32(rf_2), rf_3);
+                    const auto res = svqxtnt_s16(svqxtnb_s16(pa), pb);
 
-                x += svcntb();
-                pg = svwhilelt_b8(x, window_end_x);
-            }
-            while(svptest_any(svptrue_b8(), pg));
-        },
-        input1, input2, output);
+                    svst1_s8(pg, output_ptr + x, res);
+
+                    x += svcntb();
+                    pg = svwhilelt_b8(x, window_end_x);
+                } while (svptest_any(svptrue_b8(), pg));
+            },
+            input1, input2, output);
     }
 }
 } // namespace cpu
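
[Editor's note: the arithmetic these SVE2 kernels vectorise is the standard asymmetric-quantization add: dequantize each input with its own scale and offset, add in float, then requantize into the output's domain. A scalar model of one element, with illustrative names; the rounding and saturation details of the intrinsics (svcvt, svqxtnb/svqxtnt) may differ slightly:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    int8_t add_qasymm8_signed_scalar(int8_t a, float scale1, int32_t offset1,
                                     int8_t b, float scale2, int32_t offset2,
                                     float out_scale, int32_t out_offset)
    {
        const float af  = (a - offset1) * scale1; // dequantize input 1
        const float bf  = (b - offset2) * scale2; // dequantize input 2
        const float sum = af + bf;

        // Requantize: out_offset + sum / out_scale, then saturate to the
        // int8 range (the kernel narrows with svqxtnb/svqxtnt).
        const int32_t q = static_cast<int32_t>(std::lround(out_offset + sum / out_scale));
        return static_cast<int8_t>(std::clamp(q, -128, 127));
    }
]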
diff --git a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp
index 8c48ded..17a42c2 100644
--- a/src/cpu/kernels/add/generic/sve2/qsymm16.cpp
+++ b/src/cpu/kernels/add/generic/sve2/qsymm16.cpp
@@ -26,15 +26,18 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+
 #include "src/core/NEON/SVEMath.h"
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include <arm_sve.h>
 
 namespace arm_compute
 {
 namespace cpu
 {
-void add_qsymm16_sve2(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void add_qsymm16_sve2(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     ARM_COMPUTE_UNUSED(policy);
 
@@ -59,7 +62,7 @@
     const auto invvscaleo  = svdup_n_f32(1.f / oq_info.scale);
     const auto all_true_pg = svptrue_b16();
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -74,39 +77,40 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(dst, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<int16_t *>(output.ptr());
-
-            const int16_t broadcast_value     = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
-            const auto    broadcast_value_vec = svdup_n_s16(broadcast_value);
-
-            int      x  = window_start_x;
-            svbool_t pg = svwhilelt_b16(x, window_end_x);
-
-            const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2);
-            const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2);
-
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto a    = svld1_s16(pg, non_broadcast_input_ptr + x);
-                const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
-                const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
+                const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<int16_t *>(output.ptr());
 
-                const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
-                const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+                const int16_t broadcast_value     = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
+                const auto    broadcast_value_vec = svdup_n_s16(broadcast_value);
 
-                const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+                int      x  = window_start_x;
+                svbool_t pg = svwhilelt_b16(x, window_end_x);
 
-                svst1_s16(pg, output_ptr + x, res);
+                const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(broadcast_value_vec)), vscale2);
+                const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(broadcast_value_vec)), vscale2);
 
-                x += svcnth();
-                pg = svwhilelt_b16(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        broadcast_input, non_broadcast_input, output);
+                do
+                {
+                    const auto a    = svld1_s16(pg, non_broadcast_input_ptr + x);
+                    const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
+                    const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
+
+                    const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+                    const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+
+                    const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+
+                    svst1_s16(pg, output_ptr + x, res);
+
+                    x += svcnth();
+                    pg = svwhilelt_b16(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -118,37 +122,38 @@
         Iterator input2(src1, input2_win);
         Iterator output(dst, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-            int      x  = window_start_x;
-            svbool_t pg = svwhilelt_b16(x, window_end_x);
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                auto a = svld1_s16(pg, input1_ptr + x);
-                auto b = svld1_s16(pg, input2_ptr + x);
+                const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
 
-                const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
-                const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
+                int      x  = window_start_x;
+                svbool_t pg = svwhilelt_b16(x, window_end_x);
+                do
+                {
+                    auto a = svld1_s16(pg, input1_ptr + x);
+                    auto b = svld1_s16(pg, input2_ptr + x);
 
-                const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2);
-                const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2);
+                    const auto af_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(a)), vscale1);
+                    const auto af_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(a)), vscale1);
 
-                const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
-                const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
+                    const auto bf_0 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlb_s32(b)), vscale2);
+                    const auto bf_1 = svmul_f32_z(pg, svcvt_f32_s32_z(pg, svmovlt_s32(b)), vscale2);
 
-                const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
-                svst1_s16(pg, output_ptr + x, res);
+                    const auto rf_0 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_0, bf_0), invvscaleo));
+                    const auto rf_1 = svcvt_s32_f32_z(pg, svmul_f32_z(pg, svadd_f32_z(pg, af_1, bf_1), invvscaleo));
 
-                x += svcnth();
-                pg = svwhilelt_b16(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        input1, input2, output);
+                    const auto res = svqxtnt_s32(svqxtnb_s32(rf_0), rf_1);
+                    svst1_s16(pg, output_ptr + x, res);
+
+                    x += svcnth();
+                    pg = svwhilelt_b16(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            input1, input2, output);
     }
 }
 } // namespace cpu
diff --git a/src/cpu/kernels/add/list.h b/src/cpu/kernels/add/list.h
index 7cdb70f..1040c39 100644
--- a/src/cpu/kernels/add/list.h
+++ b/src/cpu/kernels/add/list.h
@@ -31,8 +31,9 @@
 {
 namespace cpu
 {
-#define DECLARE_ADD_KERNEL(func_name) \
-    void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+#define DECLARE_ADD_KERNEL(func_name)                                                                   \
+    void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \
+                   const Window &window)
 
 DECLARE_ADD_KERNEL(add_qasymm8_neon);
 DECLARE_ADD_KERNEL(add_qasymm8_signed_neon);
@@ -55,4 +56,4 @@
 
 } // namespace cpu
 } // namespace arm_compute
-#endif // SRC_CORE_KERNELS_ADD_LIST_H
\ No newline at end of file
+#endif // SRC_CORE_KERNELS_ADD_LIST_H
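
[Editor's note: the realigned backslash continuations above change only the macro's layout; its expansion is identical. A self-contained sketch of what DECLARE_ADD_KERNEL produces, with stand-in forward declarations for the arm_compute types:

    // Stand-ins so the sketch compiles on its own; the library gets these
    // from its own headers.
    class ITensor;
    class Window;
    enum class ConvertPolicy { WRAP, SATURATE };

    #define DECLARE_ADD_KERNEL(func_name)                                      \
        void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, \
                       const ConvertPolicy &policy, const Window &window)

    DECLARE_ADD_KERNEL(add_u8_sve);
    // Expands to:
    // void add_u8_sve(const ITensor *src0, const ITensor *src1, ITensor *dst,
    //                 const ConvertPolicy &policy, const Window &window);
]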
diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp
index d8e5f69..b4b81aa 100644
--- a/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/addmuladd/generic/neon/fp16.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/cpu/CpuTypes.h"
 
 #include <cstddef>
@@ -38,16 +39,20 @@
 {
 using arm_compute::float16_t;
 
-void a64_add_bn_clamp_direct_fp16_2x32(
-    float16_t *out, size_t out_stride,
-    float16_t *out_direct, size_t out_direct_stride,
-    const float16_t *in0, size_t in0_stride,
-    const float16_t *in1, size_t in1_stride,
-    const float16_t *bn_mul,
-    const float16_t *bn_add,
-    const float16_t  minval,
-    const float16_t  maxval,
-    size_t width, size_t height)
+void a64_add_bn_clamp_direct_fp16_2x32(float16_t       *out,
+                                       size_t           out_stride,
+                                       float16_t       *out_direct,
+                                       size_t           out_direct_stride,
+                                       const float16_t *in0,
+                                       size_t           in0_stride,
+                                       const float16_t *in1,
+                                       size_t           in1_stride,
+                                       const float16_t *bn_mul,
+                                       const float16_t *bn_add,
+                                       const float16_t  minval,
+                                       const float16_t  maxval,
+                                       size_t           width,
+                                       size_t           height)
 {
     struct KernelArgs
     {
@@ -858,9 +863,14 @@
         "subs x20, x20, #0x2\n"
         "bgt 8b\n"
         "58:" // odd columns skip
-        : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
-        : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
-        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+        : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+          [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+        : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+          [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+          [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
+        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+          "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+          "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
 }
 
 } // namespace
@@ -869,8 +879,15 @@
 {
 namespace cpu
 {
-void add_mul_add_fp16_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
-                           ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+void add_mul_add_fp16_neon(const ITensor             *input1,
+                           const ITensor             *input2,
+                           const ITensor             *bn_mul,
+                           const ITensor             *bn_add,
+                           ITensor                   *add_output,
+                           ITensor                   *final_output,
+                           ConvertPolicy              policy,
+                           const ActivationLayerInfo &act_info,
+                           const Window              &window)
 {
     ARM_COMPUTE_UNUSED(policy);
 
@@ -882,16 +899,16 @@
     float16_t minval = std::numeric_limits<half>::lowest();
     float16_t maxval = std::numeric_limits<half>::max();
 
-    if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+    if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
     {
         minval = static_cast<float16_t>(0.f);
     }
-    else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+    else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
     {
         minval = static_cast<float16_t>(0.f);
         maxval = static_cast<float16_t>(act_info.a());
     }
-    else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+    else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
     {
         minval = static_cast<float16_t>(act_info.b());
         maxval = static_cast<float16_t>(act_info.a());
@@ -909,42 +926,37 @@
     const size_t width  = window.num_iterations(0);
     const size_t height = window.num_iterations(1);
 
-    if(add_output != nullptr)
+    if (add_output != nullptr)
     {
         Iterator add_out_it(add_output, window);
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            a64_add_bn_clamp_direct_fp16_2x32(
-                reinterpret_cast<float16_t *>(out_it.ptr()), out_stride,
-                reinterpret_cast<float16_t *>(add_out_it.ptr()), out_direct_stride,
-                reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride,
-                reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
-                reinterpret_cast<float16_t *>(bn_mul->buffer()),
-                reinterpret_cast<float16_t *>(bn_add->buffer()),
-                minval,
-                maxval,
-                width, height);
-        },
-        in1_it, in2_it, add_out_it, out_it);
+            win,
+            [&](const Coordinates &)
+            {
+                a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride,
+                                                  reinterpret_cast<float16_t *>(add_out_it.ptr()), out_direct_stride,
+                                                  reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride,
+                                                  reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
+                                                  reinterpret_cast<float16_t *>(bn_mul->buffer()),
+                                                  reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval,
+                                                  width, height);
+            },
+            in1_it, in2_it, add_out_it, out_it);
     }
     else
     {
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            a64_add_bn_clamp_direct_fp16_2x32(
-                reinterpret_cast<float16_t *>(out_it.ptr()), out_stride,
-                nullptr, out_direct_stride,
-                reinterpret_cast<float16_t *>(in1_it.ptr()), in0_stride,
-                reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
-                reinterpret_cast<float16_t *>(bn_mul->buffer()),
-                reinterpret_cast<float16_t *>(bn_add->buffer()),
-                minval,
-                maxval,
-                width, height);
-        },
-        in1_it, in2_it, out_it);
+            win,
+            [&](const Coordinates &)
+            {
+                a64_add_bn_clamp_direct_fp16_2x32(reinterpret_cast<float16_t *>(out_it.ptr()), out_stride, nullptr,
+                                                  out_direct_stride, reinterpret_cast<float16_t *>(in1_it.ptr()),
+                                                  in0_stride, reinterpret_cast<float16_t *>(in2_it.ptr()), in1_stride,
+                                                  reinterpret_cast<float16_t *>(bn_mul->buffer()),
+                                                  reinterpret_cast<float16_t *>(bn_add->buffer()), minval, maxval,
+                                                  width, height);
+            },
+            in1_it, in2_it, out_it);
     }
 }
 } // namespace cpu
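
[Editor's note: the operand and clobber lists being rewrapped above follow GCC's extended-asm syntax: named "+&r" operands are read-write early-clobber registers, "I"-constrained inputs are immediates (here offsetof values folded into addressing), and the final list names everything the assembly clobbers. A minimal AArch64-only sketch with the same ingredients, using a hypothetical Args struct:

    #include <cstddef>
    #include <cstdint>

    struct Args
    {
        int64_t bias;
    };

    int64_t add_bias(int64_t x, const Args *args) // AArch64 only
    {
        __asm__ volatile(
            "add x9, %x[args], %[offsetof_bias]\n" // x9 = &args->bias
            "ldr x9, [x9]\n"                       // x9 = args->bias
            "add %x[x], %x[x], x9\n"               // x  += bias
            : [x] "+&r"(x)                                                // read-write output
            : [args] "r"(args), [offsetof_bias] "I"(offsetof(Args, bias)) // inputs
            : "memory", "x9");                                            // clobbers
        return x;
    }
]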
diff --git a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp
index b0c487e..f0444b6 100644
--- a/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/addmuladd/generic/neon/fp32.cpp
@@ -35,16 +35,20 @@
 #ifdef __aarch64__
 namespace
 {
-void a64_add_bn_clamp_direct_fp32_2x16(
-    float *out, size_t out_stride,
-    float *out_direct, size_t out_direct_stride,
-    const float *in0, size_t in0_stride,
-    const float *in1, size_t in1_stride,
-    const float *bn_mul,
-    const float *bn_add,
-    const float  minval,
-    const float  maxval,
-    size_t width, size_t height)
+void a64_add_bn_clamp_direct_fp32_2x16(float       *out,
+                                       size_t       out_stride,
+                                       float       *out_direct,
+                                       size_t       out_direct_stride,
+                                       const float *in0,
+                                       size_t       in0_stride,
+                                       const float *in1,
+                                       size_t       in1_stride,
+                                       const float *bn_mul,
+                                       const float *bn_add,
+                                       const float  minval,
+                                       const float  maxval,
+                                       size_t       width,
+                                       size_t       height)
 {
     struct KernelArgs
     {
@@ -631,18 +635,30 @@
         "subs x20, x20, #0x2\n"
         "bgt 8b\n"
         "34:" // odd columns skip
-        : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
-        : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
-        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+        : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+          [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+        : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+          [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+          [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
+        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+          "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+          "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
 }
-}
+} // namespace
 
 namespace arm_compute
 {
 namespace cpu
 {
-void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
-                           ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+void add_mul_add_fp32_neon(const ITensor             *input1,
+                           const ITensor             *input2,
+                           const ITensor             *bn_mul,
+                           const ITensor             *bn_add,
+                           ITensor                   *add_output,
+                           ITensor                   *final_output,
+                           ConvertPolicy              policy,
+                           const ActivationLayerInfo &act_info,
+                           const Window              &window)
 {
     ARM_COMPUTE_UNUSED(policy);
 
@@ -654,16 +670,16 @@
     float minval = std::numeric_limits<float>::lowest();
     float maxval = std::numeric_limits<float>::max();
 
-    if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+    if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
     {
         minval = 0.f;
     }
-    else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+    else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
     {
         minval = 0.f;
         maxval = act_info.a();
     }
-    else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+    else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
     {
         minval = act_info.b();
         maxval = act_info.a();
@@ -681,42 +697,34 @@
     const size_t width  = window.num_iterations(0);
     const size_t height = window.num_iterations(1);
 
-    if(add_output != nullptr)
+    if (add_output != nullptr)
     {
         Iterator add_out_it(add_output, window);
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            a64_add_bn_clamp_direct_fp32_2x16(
-                reinterpret_cast<float *>(out_it.ptr()), out_stride,
-                reinterpret_cast<float *>(add_out_it.ptr()), out_direct_stride,
-                reinterpret_cast<float *>(in1_it.ptr()), in0_stride,
-                reinterpret_cast<float *>(in2_it.ptr()), in1_stride,
-                reinterpret_cast<float *>(bn_mul->buffer()),
-                reinterpret_cast<float *>(bn_add->buffer()),
-                minval,
-                maxval,
-                width, height);
-        },
-        in1_it, in2_it, add_out_it, out_it);
+            win,
+            [&](const Coordinates &)
+            {
+                a64_add_bn_clamp_direct_fp32_2x16(
+                    reinterpret_cast<float *>(out_it.ptr()), out_stride, reinterpret_cast<float *>(add_out_it.ptr()),
+                    out_direct_stride, reinterpret_cast<float *>(in1_it.ptr()), in0_stride,
+                    reinterpret_cast<float *>(in2_it.ptr()), in1_stride, reinterpret_cast<float *>(bn_mul->buffer()),
+                    reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height);
+            },
+            in1_it, in2_it, add_out_it, out_it);
     }
     else
     {
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            a64_add_bn_clamp_direct_fp32_2x16(
-                reinterpret_cast<float *>(out_it.ptr()), out_stride,
-                nullptr, out_direct_stride,
-                reinterpret_cast<float *>(in1_it.ptr()), in0_stride,
-                reinterpret_cast<float *>(in2_it.ptr()), in1_stride,
-                reinterpret_cast<float *>(bn_mul->buffer()),
-                reinterpret_cast<float *>(bn_add->buffer()),
-                minval,
-                maxval,
-                width, height);
-        },
-        in1_it, in2_it, out_it);
+            win,
+            [&](const Coordinates &)
+            {
+                a64_add_bn_clamp_direct_fp32_2x16(
+                    reinterpret_cast<float *>(out_it.ptr()), out_stride, nullptr, out_direct_stride,
+                    reinterpret_cast<float *>(in1_it.ptr()), in0_stride, reinterpret_cast<float *>(in2_it.ptr()),
+                    in1_stride, reinterpret_cast<float *>(bn_mul->buffer()),
+                    reinterpret_cast<float *>(bn_add->buffer()), minval, maxval, width, height);
+            },
+            in1_it, in2_it, out_it);
     }
 }
 } // namespace cpu
diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp
index f7448a6..035805c 100644
--- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8.cpp
@@ -36,22 +36,30 @@
 #ifdef __aarch64__
 namespace
 {
-void a64_add_bn_clamp_direct_u8_fp32_2x16(
-    uint8_t *out, size_t out_stride,
-    uint8_t *out_direct, size_t out_direct_stride,
-    const uint8_t *in0, size_t in0_stride,
-    const uint8_t *in1, size_t in1_stride,
-    const float *bn_mul,
-    const float *bn_add,
-    const uint8_t minval,
-    const uint8_t maxval,
-    int32_t out_zeropt, float out_scale,
-    int32_t out_direct_zeropt, float out_direct_scale,
-    int32_t in0_zeropt, float in0_scale,
-    int32_t in1_zeropt, float in1_scale,
-    size_t width, size_t height)
+void a64_add_bn_clamp_direct_u8_fp32_2x16(uint8_t       *out,
+                                          size_t         out_stride,
+                                          uint8_t       *out_direct,
+                                          size_t         out_direct_stride,
+                                          const uint8_t *in0,
+                                          size_t         in0_stride,
+                                          const uint8_t *in1,
+                                          size_t         in1_stride,
+                                          const float   *bn_mul,
+                                          const float   *bn_add,
+                                          const uint8_t  minval,
+                                          const uint8_t  maxval,
+                                          int32_t        out_zeropt,
+                                          float          out_scale,
+                                          int32_t        out_direct_zeropt,
+                                          float          out_direct_scale,
+                                          int32_t        in0_zeropt,
+                                          float          in0_scale,
+                                          int32_t        in1_zeropt,
+                                          float          in1_scale,
+                                          size_t         width,
+                                          size_t         height)
 {
-    float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale };
+    float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale};
     struct KernelArgs
     {
         const float *scales;
@@ -709,9 +717,19 @@
         "subs x23, x23, #0x2\n"
         "bgt 6b\n"
         "32:" // odd columns skip
-        : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
-        : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
-        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+        : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+          [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+        : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+          [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)),
+          [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)),
+          [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+          [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)),
+          [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)),
+          [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride),
+          [out_stride] "r"(out_stride)
+        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+          "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+          "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
 }
 
 } // namespace
@@ -720,8 +738,15 @@
 {
 namespace cpu
 {
-void add_mul_add_u8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
-                         ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+void add_mul_add_u8_neon(const ITensor             *input1,
+                         const ITensor             *input2,
+                         const ITensor             *bn_mul,
+                         const ITensor             *bn_add,
+                         ITensor                   *add_output,
+                         ITensor                   *final_output,
+                         ConvertPolicy              policy,
+                         const ActivationLayerInfo &act_info,
+                         const Window              &window)
 {
     ARM_COMPUTE_UNUSED(policy);
 
@@ -739,24 +764,25 @@
     uint8_t maxval = std::numeric_limits<uint8_t>::max();
 
     const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform();
-    if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+    if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
     {
         minval = quantize_qasymm8(0.f, final_output_qinfo);
     }
-    else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+    else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
     {
         minval = quantize_qasymm8(0.f, final_output_qinfo);
         maxval = quantize_qasymm8(act_info.a(), final_output_qinfo);
     }
-    else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+    else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
     {
         minval = quantize_qasymm8(act_info.b(), final_output_qinfo);
         maxval = quantize_qasymm8(act_info.a(), final_output_qinfo);
     }
 
-    const UniformQuantizationInfo in1_qinfo        = input1_info->quantization_info().uniform();
-    const UniformQuantizationInfo in2_qinfo        = input2_info->quantization_info().uniform();
-    const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
+    const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
+    const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
+    const UniformQuantizationInfo add_output_qinfo =
+        (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
 
     const int32_t in1_offset        = in1_qinfo.offset;
     const int32_t in2_offset        = in2_qinfo.offset;
@@ -783,50 +809,35 @@
     const size_t width  = window.num_iterations(0);
     const size_t height = window.num_iterations(1);
 
-    if(add_output != nullptr)
+    if (add_output != nullptr)
     {
         Iterator add_out_it(add_output, window);
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            a64_add_bn_clamp_direct_u8_fp32_2x16(
-                reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
-                reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride,
-                reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride,
-                reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride,
-                bn_mul_buffer,
-                bn_add_buffer,
-                minval,
-                maxval,
-                out_offset, out_scale,
-                out_direct_offset, out_direct_scale,
-                in1_offset, in1_scale,
-                in2_offset, in2_scale,
-                width, height);
-        },
-        in1_it, in2_it, add_out_it, out_it);
+            win,
+            [&](const Coordinates &)
+            {
+                a64_add_bn_clamp_direct_u8_fp32_2x16(
+                    reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
+                    reinterpret_cast<uint8_t *>(add_out_it.ptr()), out_direct_stride,
+                    reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()),
+                    in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset,
+                    out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height);
+            },
+            in1_it, in2_it, add_out_it, out_it);
     }
     else
     {
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            a64_add_bn_clamp_direct_u8_fp32_2x16(
-                reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride,
-                nullptr, out_direct_stride,
-                reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride,
-                reinterpret_cast<uint8_t *>(in2_it.ptr()), in1_stride,
-                bn_mul_buffer,
-                bn_add_buffer,
-                minval,
-                maxval,
-                out_offset, out_scale,
-                out_direct_offset, out_direct_scale,
-                in1_offset, in1_scale,
-                in2_offset, in2_scale,
-                width, height);
-        },
-        in1_it, in2_it, out_it);
+            win,
+            [&](const Coordinates &)
+            {
+                a64_add_bn_clamp_direct_u8_fp32_2x16(
+                    reinterpret_cast<uint8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride,
+                    reinterpret_cast<uint8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<uint8_t *>(in2_it.ptr()),
+                    in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset,
+                    out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height);
+            },
+            in1_it, in2_it, out_it);
     }
 }
 } // namespace cpu
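
Note: the minval/maxval selection above quantizes the activation's float bounds once, so the assembly kernel can clamp entirely in the uint8 domain. The same logic in isolation (clamp_bounds is an illustrative name; the include that provides ActivationLayerInfo varies between library versions):

    #include <cstdint>
    #include <limits>
    #include <utility>

    #include "arm_compute/core/QuantizationInfo.h"
    #include "arm_compute/core/Types.h" // ActivationLayerInfo

    std::pair<uint8_t, uint8_t> clamp_bounds(const arm_compute::ActivationLayerInfo &act,
                                             const arm_compute::UniformQuantizationInfo &qinfo)
    {
        using AF       = arm_compute::ActivationLayerInfo::ActivationFunction;
        uint8_t minval = std::numeric_limits<uint8_t>::min();
        uint8_t maxval = std::numeric_limits<uint8_t>::max();
        if (act.activation() == AF::RELU)
        {
            minval = arm_compute::quantize_qasymm8(0.f, qinfo); // lower bound at real 0
        }
        else if (act.activation() == AF::BOUNDED_RELU)
        {
            minval = arm_compute::quantize_qasymm8(0.f, qinfo);
            maxval = arm_compute::quantize_qasymm8(act.a(), qinfo); // upper bound at a
        }
        else if (act.activation() == AF::LU_BOUNDED_RELU)
        {
            minval = arm_compute::quantize_qasymm8(act.b(), qinfo); // lower bound at b
            maxval = arm_compute::quantize_qasymm8(act.a(), qinfo);
        }
        return {minval, maxval};
    }
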
diff --git a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp
index 1ae2cb7..e1a45b4 100644
--- a/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/addmuladd/generic/neon/qasymm8_signed.cpp
@@ -36,22 +36,30 @@
 #ifdef __aarch64__
 namespace
 {
-void a64_add_bn_clamp_direct_s8_fp32_2x16(
-    int8_t *out, size_t out_stride,
-    int8_t *out_direct, size_t out_direct_stride,
-    const int8_t *in0, size_t in0_stride,
-    const int8_t *in1, size_t in1_stride,
-    const float *bn_mul,
-    const float *bn_add,
-    const int8_t minval,
-    const int8_t maxval,
-    int32_t out_zeropt, float out_scale,
-    int32_t out_direct_zeropt, float out_direct_scale,
-    int32_t in0_zeropt, float in0_scale,
-    int32_t in1_zeropt, float in1_scale,
-    size_t width, size_t height)
+void a64_add_bn_clamp_direct_s8_fp32_2x16(int8_t       *out,
+                                          size_t        out_stride,
+                                          int8_t       *out_direct,
+                                          size_t        out_direct_stride,
+                                          const int8_t *in0,
+                                          size_t        in0_stride,
+                                          const int8_t *in1,
+                                          size_t        in1_stride,
+                                          const float  *bn_mul,
+                                          const float  *bn_add,
+                                          const int8_t  minval,
+                                          const int8_t  maxval,
+                                          int32_t       out_zeropt,
+                                          float         out_scale,
+                                          int32_t       out_direct_zeropt,
+                                          float         out_direct_scale,
+                                          int32_t       in0_zeropt,
+                                          float         in0_scale,
+                                          int32_t       in1_zeropt,
+                                          float         in1_scale,
+                                          size_t        width,
+                                          size_t        height)
 {
-    float scales[4] = { in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale };
+    float scales[4] = {in0_scale, in1_scale, 1.0f / out_scale, 1.0f / out_direct_scale};
     struct KernelArgs
     {
         const float *scales;
@@ -709,9 +717,19 @@
         "subs x23, x23, #0x2\n"
         "bgt 6b\n"
         "32:" // odd columns skip
-        : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out), [out_direct] "+&r"(out_direct), [width] "+&r"(width)
-        : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride), [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)), [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)), [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)), [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)), [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)), [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride), [out_stride] "r"(out_stride)
-        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9", "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
+        : [bn_add] "+&r"(bn_add), [bn_mul] "+&r"(bn_mul), [in0] "+&r"(in0), [in1] "+&r"(in1), [out] "+&r"(out),
+          [out_direct] "+&r"(out_direct), [width] "+&r"(width)
+        : [args_ptr] "r"(&ka), [height] "r"(height), [in0_stride] "r"(in0_stride), [in1_stride] "r"(in1_stride),
+          [offsetof_in0_zeropt] "I"(offsetof(KernelArgs, in0_zeropt)),
+          [offsetof_in1_zeropt] "I"(offsetof(KernelArgs, in1_zeropt)),
+          [offsetof_maxval] "I"(offsetof(KernelArgs, maxval)), [offsetof_minval] "I"(offsetof(KernelArgs, minval)),
+          [offsetof_out_direct_zeropt] "I"(offsetof(KernelArgs, out_direct_zeropt)),
+          [offsetof_out_zeropt] "I"(offsetof(KernelArgs, out_zeropt)),
+          [offsetof_scales] "I"(offsetof(KernelArgs, scales)), [out_direct_stride] "r"(out_direct_stride),
+          [out_stride] "r"(out_stride)
+        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16",
+          "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x9",
+          "x10", "x11", "x12", "x20", "x21", "x22", "x23", "x24", "x25", "x26", "x27", "x28");
 }
 
 } // namespace
@@ -720,8 +738,15 @@
 {
 namespace cpu
 {
-void add_mul_add_s8_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add,
-                         ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+void add_mul_add_s8_neon(const ITensor             *input1,
+                         const ITensor             *input2,
+                         const ITensor             *bn_mul,
+                         const ITensor             *bn_add,
+                         ITensor                   *add_output,
+                         ITensor                   *final_output,
+                         ConvertPolicy              policy,
+                         const ActivationLayerInfo &act_info,
+                         const Window              &window)
 {
     ARM_COMPUTE_UNUSED(policy);
 
@@ -739,24 +764,25 @@
     int8_t maxval = std::numeric_limits<int8_t>::max();
 
     const UniformQuantizationInfo final_output_qinfo = final_output_info->quantization_info().uniform();
-    if(act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
+    if (act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU)
     {
         minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
     }
-    else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
+    else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU)
     {
         minval = quantize_qasymm8_signed(0.f, final_output_qinfo);
         maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
     }
-    else if(act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
+    else if (act_info.activation() == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
     {
         minval = quantize_qasymm8_signed(act_info.b(), final_output_qinfo);
         maxval = quantize_qasymm8_signed(act_info.a(), final_output_qinfo);
     }
 
-    const UniformQuantizationInfo in1_qinfo        = input1_info->quantization_info().uniform();
-    const UniformQuantizationInfo in2_qinfo        = input2_info->quantization_info().uniform();
-    const UniformQuantizationInfo add_output_qinfo = (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
+    const UniformQuantizationInfo in1_qinfo = input1_info->quantization_info().uniform();
+    const UniformQuantizationInfo in2_qinfo = input2_info->quantization_info().uniform();
+    const UniformQuantizationInfo add_output_qinfo =
+        (add_output != nullptr) ? add_output_info->quantization_info().uniform() : UniformQuantizationInfo();
 
     const int32_t in1_offset        = in1_qinfo.offset;
     const int32_t in2_offset        = in2_qinfo.offset;
@@ -783,50 +809,35 @@
     const size_t width  = window.num_iterations(0);
     const size_t height = window.num_iterations(1);
 
-    if(add_output != nullptr)
+    if (add_output != nullptr)
     {
         Iterator add_out_it(add_output, window);
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            a64_add_bn_clamp_direct_s8_fp32_2x16(
-                reinterpret_cast<int8_t *>(out_it.ptr()), out_stride,
-                reinterpret_cast<int8_t *>(add_out_it.ptr()), out_direct_stride,
-                reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
-                reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride,
-                bn_mul_buffer,
-                bn_add_buffer,
-                minval,
-                maxval,
-                out_offset, out_scale,
-                out_direct_offset, out_direct_scale,
-                in1_offset, in1_scale,
-                in2_offset, in2_scale,
-                width, height);
-        },
-        in1_it, in2_it, add_out_it, out_it);
+            win,
+            [&](const Coordinates &)
+            {
+                a64_add_bn_clamp_direct_s8_fp32_2x16(
+                    reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, reinterpret_cast<int8_t *>(add_out_it.ptr()),
+                    out_direct_stride, reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
+                    reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval,
+                    out_offset, out_scale, out_direct_offset, out_direct_scale, in1_offset, in1_scale, in2_offset,
+                    in2_scale, width, height);
+            },
+            in1_it, in2_it, add_out_it, out_it);
     }
     else
     {
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            a64_add_bn_clamp_direct_s8_fp32_2x16(
-                reinterpret_cast<int8_t *>(out_it.ptr()), out_stride,
-                nullptr, out_direct_stride,
-                reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride,
-                reinterpret_cast<int8_t *>(in2_it.ptr()), in1_stride,
-                bn_mul_buffer,
-                bn_add_buffer,
-                minval,
-                maxval,
-                out_offset, out_scale,
-                out_direct_offset, out_direct_scale,
-                in1_offset, in1_scale,
-                in2_offset, in2_scale,
-                width, height);
-        },
-        in1_it, in2_it, out_it);
+            win,
+            [&](const Coordinates &)
+            {
+                a64_add_bn_clamp_direct_s8_fp32_2x16(
+                    reinterpret_cast<int8_t *>(out_it.ptr()), out_stride, nullptr, out_direct_stride,
+                    reinterpret_cast<int8_t *>(in1_it.ptr()), in0_stride, reinterpret_cast<int8_t *>(in2_it.ptr()),
+                    in1_stride, bn_mul_buffer, bn_add_buffer, minval, maxval, out_offset, out_scale, out_direct_offset,
+                    out_direct_scale, in1_offset, in1_scale, in2_offset, in2_scale, width, height);
+            },
+            in1_it, in2_it, out_it);
     }
 }
 } // namespace cpu
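
Note: the operand lists re-wrapped in both qasymm8 kernels use three constraint kinds: "+&r" (a read-write register operand with an early clobber), "r" (an input register) and "I" (an immediate the target instruction can encode), followed by an explicit clobber list. A minimal standalone example of the same vocabulary (add_one is illustrative, not from this patch):

    #include <cstdint>

    int64_t add_one(int64_t v)
    {
    #ifdef __aarch64__
        // %x[val] prints the 64-bit x-register form of the named operand.
        __asm__("add %x[val], %x[val], #1" : [val] "+&r"(v));
        return v;
    #else
        return v + 1;
    #endif
    }
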
diff --git a/src/cpu/kernels/addmuladd/list.h b/src/cpu/kernels/addmuladd/list.h
index a7c22c0..568003a 100644
--- a/src/cpu/kernels/addmuladd/list.h
+++ b/src/cpu/kernels/addmuladd/list.h
@@ -32,9 +32,10 @@
 {
 namespace cpu
 {
-#define DECLARE_ADD_MUL_ADD_KERNEL(func_name)                                                                          \
+#define DECLARE_ADD_MUL_ADD_KERNEL(func_name)                                                                  \
     void func_name(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul, const ITensor *bn_add, \
-                   ITensor *add_output, ITensor *final_output, ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window)
+                   ITensor *add_output, ITensor *final_output, ConvertPolicy policy,                           \
+                   const ActivationLayerInfo &act_info, const Window &window)
 
 DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp32_neon);
 DECLARE_ADD_MUL_ADD_KERNEL(add_mul_add_fp16_neon);
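
Note: re-wrapping the backslash continuations does not change what the macro emits; each DECLARE_ADD_MUL_ADD_KERNEL(name) still expands to a single prototype, e.g.:

    void add_mul_add_fp32_neon(const ITensor *input1, const ITensor *input2, const ITensor *bn_mul,
                               const ITensor *bn_add, ITensor *add_output, ITensor *final_output,
                               ConvertPolicy policy, const ActivationLayerInfo &act_info, const Window &window);
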
diff --git a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
index 10bf8e4..6e8f32e 100644
--- a/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/NEON/INEKernel.h"
 #include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp"
 
@@ -57,13 +58,12 @@
 public:
     /** Constructor
      */
-    CpuGemmAssemblyWrapperKernel()
-        : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel")
+    CpuGemmAssemblyWrapperKernel() : _kernel(nullptr), _name("CpuGemmAssemblyWrapperKernel")
     {
     }
 
-    CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &)  = delete;
-    CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&) = default;
+    CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &)            = delete;
+    CpuGemmAssemblyWrapperKernel(CpuGemmAssemblyWrapperKernel &&)           = default;
     CpuGemmAssemblyWrapperKernel &operator=(CpuGemmAssemblyWrapperKernel &) = delete;
 
     const char *name() const override
@@ -110,7 +110,7 @@
 
         INEKernel::configure(win);
 
-        if(!kernel_name_tag.empty())
+        if (!kernel_name_tag.empty())
         {
             _name += "/" + kernel_name_tag;
         }
@@ -132,7 +132,7 @@
 
 private:
     arm_gemm::GemmCommon<TypeInput, TypeOutput> *_kernel;
-    std::string _name;
+    std::string                                  _name;
 };
 } // namespace kernel
 } // namespace cpu
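
Note: the aligned "=" runs above come from the move-only wrapper idiom, where copying is deleted and moving stays defaulted, presumably because the wrapper holds a raw arm_gemm::GemmCommon pointer it should not duplicate. The idiom in miniature (MoveOnly is an illustrative name):

    struct MoveOnly
    {
        MoveOnly()                      = default;
        MoveOnly(MoveOnly &)            = delete;  // no copies
        MoveOnly(MoveOnly &&)           = default; // ownership can transfer
        MoveOnly &operator=(MoveOnly &) = delete;
    };
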
diff --git a/src/cpu/kernels/assembly/arm_gemm.hpp b/src/cpu/kernels/assembly/arm_gemm.hpp
index 4c127b4..9a913c5 100644
--- a/src/cpu/kernels/assembly/arm_gemm.hpp
+++ b/src/cpu/kernels/assembly/arm_gemm.hpp
@@ -23,13 +23,12 @@
  */
 #pragma once
 
+#include "arm_gemm_local.hpp"
+#include "gemm_common.hpp"
 #include <cstring>
 #include <memory>
 #include <vector>
 
-#include "arm_gemm_local.hpp"
-#include "gemm_common.hpp"
-
 namespace arm_gemm
 {
 enum class GemmMethod
@@ -111,8 +110,7 @@
     unsigned int outer_block_size = 0;
     WeightFormat weight_format    = WeightFormat::ANY;
 
-    GemmConfig(GemmMethod method)
-        : method(method)
+    GemmConfig(GemmMethod method) : method(method)
     {
     }
     GemmConfig()
@@ -133,8 +131,7 @@
     float param1;
     float param2;
 
-    Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f)
-        : type(type), param1(p1), param2(p2)
+    Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f) : type(type), param1(p1), param2(p2)
     {
     }
 };
@@ -156,12 +153,32 @@
     bool              _fast_mode;
     const GemmConfig *_cfg;
 
-    GemmArgs(const CPUInfo *ci, unsigned int M, unsigned int N,
-             unsigned int K, unsigned int Ksections, unsigned int nbatches,
-             unsigned int nmulti, bool indirect_input, Activation act, const int maxthreads,
-             bool fixed_format = false, bool fast_mode = false, const GemmConfig *cfg = nullptr)
-        : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _Ksections(Ksections), _nbatches(nbatches), _nmulti(nmulti), _indirect_input(indirect_input), _act(act), _maxthreads(maxthreads),
-          _fixed_format(fixed_format), _fast_mode(fast_mode), _cfg(cfg)
+    GemmArgs(const CPUInfo    *ci,
+             unsigned int      M,
+             unsigned int      N,
+             unsigned int      K,
+             unsigned int      Ksections,
+             unsigned int      nbatches,
+             unsigned int      nmulti,
+             bool              indirect_input,
+             Activation        act,
+             const int         maxthreads,
+             bool              fixed_format = false,
+             bool              fast_mode    = false,
+             const GemmConfig *cfg          = nullptr)
+        : _ci(ci),
+          _Msize(M),
+          _Nsize(N),
+          _Ksize(K),
+          _Ksections(Ksections),
+          _nbatches(nbatches),
+          _nmulti(nmulti),
+          _indirect_input(indirect_input),
+          _act(act),
+          _maxthreads(maxthreads),
+          _fixed_format(fixed_format),
+          _fast_mode(fast_mode),
+          _cfg(cfg)
     {
     }
 };
@@ -187,23 +204,51 @@
     Requantize32() = default;
 
     // Constructor for per-tensor quantization
-    Requantize32(const int32_t *bias, size_t bias_multi_stride,
-                 int32_t a_offset, int32_t b_offset, int32_t c_offset,
-                 int32_t requant_shift, int32_t requant_mul, int32_t minv, int32_t maxv)
-        : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_left_shift(std::max<int32_t>(requant_shift, 0)),
-          per_layer_right_shift(std::min<int32_t>(requant_shift, 0)), per_layer_mul(requant_mul), minval(minv), maxval(maxv)
+    Requantize32(const int32_t *bias,
+                 size_t         bias_multi_stride,
+                 int32_t        a_offset,
+                 int32_t        b_offset,
+                 int32_t        c_offset,
+                 int32_t        requant_shift,
+                 int32_t        requant_mul,
+                 int32_t        minv,
+                 int32_t        maxv)
+        : bias(bias),
+          bias_multi_stride(bias_multi_stride),
+          a_offset(a_offset),
+          b_offset(b_offset),
+          c_offset(c_offset),
+          per_channel_requant(false),
+          per_layer_left_shift(std::max<int32_t>(requant_shift, 0)),
+          per_layer_right_shift(std::min<int32_t>(requant_shift, 0)),
+          per_layer_mul(requant_mul),
+          minval(minv),
+          maxval(maxv)
     {
     }
 
     // Constructor for per-channel quantization
-    Requantize32(const int32_t *bias, size_t bias_multi_stride,
-                 int32_t a_offset, int32_t b_offset, int32_t c_offset,
+    Requantize32(const int32_t *bias,
+                 size_t         bias_multi_stride,
+                 int32_t        a_offset,
+                 int32_t        b_offset,
+                 int32_t        c_offset,
                  const int32_t *requant_left_shifts,
                  const int32_t *requant_right_shifts,
                  const int32_t *requant_muls,
-                 int32_t minv, int32_t maxv)
-        : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_left_shifts(requant_left_shifts),
-          per_channel_right_shifts(requant_right_shifts), per_channel_muls(requant_muls), minval(minv), maxval(maxv)
+                 int32_t        minv,
+                 int32_t        maxv)
+        : bias(bias),
+          bias_multi_stride(bias_multi_stride),
+          a_offset(a_offset),
+          b_offset(b_offset),
+          c_offset(c_offset),
+          per_channel_requant(true),
+          per_channel_left_shifts(requant_left_shifts),
+          per_channel_right_shifts(requant_right_shifts),
+          per_channel_muls(requant_muls),
+          minval(minv),
+          maxval(maxv)
     {
     }
 };
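
Note: the per-tensor Requantize32 constructor above stores one signed requant_shift as two components, a non-negative left shift and a non-positive right shift, so the kernel can apply both unconditionally. The split in isolation (names illustrative):

    #include <algorithm>
    #include <cstdint>

    struct SplitShift
    {
        int32_t left;  // max(shift, 0)
        int32_t right; // min(shift, 0)
    };

    SplitShift split_requant_shift(int32_t requant_shift)
    {
        return {std::max<int32_t>(requant_shift, 0), std::min<int32_t>(requant_shift, 0)};
    }
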
diff --git a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
index 718fcd1..0672e89 100644
--- a/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
+++ b/src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp
@@ -27,7 +27,6 @@
 #include "arm_compute/core/Window.h"
 
 #include "ndrange.hpp"
-
 #include <cassert>
 
 /* This file contains mapping between integral types used in arm_compute and arm_gemm
@@ -38,8 +37,7 @@
 namespace arm_gemm
 {
 //we want to unify the maximum number of dimensions used between arm_gemm and arm compute library
-constexpr std::size_t ndrange_max =
-    arm_compute::Dimensions<unsigned int>::num_max_dimensions;
+constexpr std::size_t ndrange_max = arm_compute::Dimensions<unsigned int>::num_max_dimensions;
 
 using ndrange_t = NDRange<ndrange_max>;
 using ndcoord_t = NDCoordinate<ndrange_max>;
@@ -56,7 +54,7 @@
 {
     arm_compute::Window win;
 
-    for(unsigned int i = 0; i != ndrange_max; ++i)
+    for (unsigned int i = 0; i != ndrange_max; ++i)
     {
         //populate the window with the dimensions of the NDRange
         win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
@@ -75,7 +73,7 @@
 {
     arm_compute::Window win;
 
-    for(unsigned int i = 0; i != ndrange_max; ++i)
+    for (unsigned int i = 0; i != ndrange_max; ++i)
     {
         const auto start = ndc.get_position(i);
         const auto size  = ndc.get_size(i);
@@ -98,15 +96,12 @@
  */
 inline ndrange_t to_ndrange(const arm_compute::Window &win)
 {
-    return
-    {
-        static_cast<unsigned int>(win[0].end() - win[0].start()),
-        static_cast<unsigned int>(win[1].end() - win[1].start()),
-        static_cast<unsigned int>(win[2].end() - win[2].start()),
-        static_cast<unsigned int>(win[3].end() - win[3].start()),
-        static_cast<unsigned int>(win[4].end() - win[4].start()),
-        static_cast<unsigned int>(win[5].end() - win[5].start())
-    };
+    return {static_cast<unsigned int>(win[0].end() - win[0].start()),
+            static_cast<unsigned int>(win[1].end() - win[1].start()),
+            static_cast<unsigned int>(win[2].end() - win[2].start()),
+            static_cast<unsigned int>(win[3].end() - win[3].start()),
+            static_cast<unsigned int>(win[4].end() - win[4].start()),
+            static_cast<unsigned int>(win[5].end() - win[5].start())};
 }
 
 /** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions
@@ -116,15 +111,12 @@
  */
 inline ndcoord_t to_ndcoord(const arm_compute::Window &win)
 {
-    return
-    {
-        { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) },
-        { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) },
-        { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) },
-        { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) },
-        { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) },
-        { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) }
-    };
+    return {{static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start())},
+            {static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start())},
+            {static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start())},
+            {static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start())},
+            {static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start())},
+            {static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start())}};
 }
 
 } //namespace arm_gemm
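
Note: the converters reformatted above translate between the two libraries' iteration spaces. A usage sketch (convert_example is illustrative; the include path matches the one CpuGemmAssemblyWrapperKernel.h uses for this header):

    #include "src/cpu/kernels/assembly/arm_gemm_compute_iface.hpp"

    void convert_example(const arm_compute::Window &win)
    {
        arm_gemm::ndrange_t sizes  = arm_gemm::to_ndrange(win);  // per-dimension extents only
        arm_gemm::ndcoord_t coords = arm_gemm::to_ndcoord(win);  // (start, size) per dimension
        arm_compute::Window back   = arm_gemm::to_window(coords); // rebuilds starts and ends
        (void)sizes;
        (void)back;
    }
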
diff --git a/src/cpu/kernels/assembly/gemm_common.hpp b/src/cpu/kernels/assembly/gemm_common.hpp
index 834cd10..6fe9f13 100644
--- a/src/cpu/kernels/assembly/gemm_common.hpp
+++ b/src/cpu/kernels/assembly/gemm_common.hpp
@@ -25,7 +25,6 @@
 
 #include "convolution_parameters.hpp"
 #include "ndrange.hpp"
-
 #include <cstddef>
 
 namespace arm_gemm
@@ -51,10 +50,19 @@
      * appropriately typed pointers.  If B is pretransposed (see below) then
      * the settings for B here are ignored.
      */
-    virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                                    const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
-                                    void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                                    const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
+    virtual void set_arrays_generic(const void                                   *A,
+                                    const int                                     lda,
+                                    const int                                     A_batch_stride,
+                                    const int                                     A_multi_stride,
+                                    const void                                   *B,
+                                    const int                                     ldb,
+                                    /* batches share B */ const int               B_multi_stride,
+                                    void                                         *C,
+                                    const int                                     ldc,
+                                    const int                                     C_batch_stride,
+                                    const int                                     C_multi_stride,
+                                    const void                                   *bias,
+                                    /* no row or batch stride needed */ const int bias_multi_stride) = 0;
 
     /** @returns an ndrange containing ranges of the compute space which can be
      * broken up and parallelised over
@@ -73,7 +81,7 @@
      * This has an empty default implementation, as GEMMs which don't care
      * about thread count can safely ignore this.
      */
-    virtual void set_nthreads(int) {};
+    virtual void set_nthreads(int){};
 
     /* Whether this GEMM can be dynamically scheduled or not. */
     virtual bool supports_dynamic_scheduling() const
@@ -95,7 +103,7 @@
         return 0;
     }
     /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
-    virtual void set_working_space(void *) {};
+    virtual void set_working_space(void *){};
 
     /*** "Pretransposed" interface (optional) ***/
     /* Is this object set up for pretranspose?  If so, pretranspose_array() needs to be called before execute(); */
@@ -122,7 +130,8 @@
     /* The "real" version of this depends on the templated operand type (see below).  */
     virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;
     /* Threaded version with window start/end parameters */
-    virtual void pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0;
+    virtual void
+    pretranspose_B_array_part_generic(void *, const void *, const int, const int, const size_t, const size_t) = 0;
 
     /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
     virtual void set_pretransposed_B_data(void *)
@@ -186,10 +195,19 @@
 public:
     /* Pass in the pointers to the arrays to be operated on and their
      * strides (templated version with appropriate types). */
-    virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                            const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
-                            Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                            const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride)
+    virtual void set_arrays(const To                                     *A,
+                            const int                                     lda,
+                            const int                                     A_batch_stride,
+                            const int                                     A_multi_stride,
+                            const To                                     *B,
+                            const int                                     ldb,
+                            /* batches share B */ const int               B_multi_stride,
+                            Tr                                           *C,
+                            const int                                     ldc,
+                            const int                                     C_batch_stride,
+                            const int                                     C_multi_stride,
+                            const Tr                                     *bias,
+                            /* no row or batch stride needed */ const int bias_multi_stride)
     {
         _Aptr              = A;
         _lda               = lda;
@@ -207,25 +225,33 @@
     }
 
     /* Implementation of the void * overload which casts its arguments to the appropriate type. */
-    void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                            const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
-                            void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                            const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override
+    void set_arrays_generic(const void                                   *A,
+                            const int                                     lda,
+                            const int                                     A_batch_stride,
+                            const int                                     A_multi_stride,
+                            const void                                   *B,
+                            const int                                     ldb,
+                            /* batches share B */ const int               B_multi_stride,
+                            void                                         *C,
+                            const int                                     ldc,
+                            const int                                     C_batch_stride,
+                            const int                                     C_multi_stride,
+                            const void                                   *bias,
+                            /* no row or batch stride needed */ const int bias_multi_stride) override
     {
-        set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
-                   static_cast<const To *>(B), ldb, B_multi_stride,
-                   static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
+        set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride, static_cast<const To *>(B), ldb,
+                   B_multi_stride, static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
                    static_cast<const Tr *>(bias), bias_multi_stride);
     }
 
     /*** "Pretransposed" interface ***/
 
     /* Compute col sums over all columns */
-    virtual void requantize_bias(void *, const To *, const int, const int) {};
+    virtual void requantize_bias(void *, const To *, const int, const int){};
 
     /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
     /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
-    virtual void pretranspose_B_array(void *, const To *, const int, const int) {};
+    virtual void pretranspose_B_array(void *, const To *, const int, const int){};
 
     /* Implementation of the void * overload which casts its arguments to the appropriate type. */
     void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override
@@ -237,12 +263,14 @@
      * The fallback/backwards compatible version of the threaded interface exposes a window size of 1 and
      * just calls the non-threaded functions to do the work.  This is valid as with window size of 1 the only
      * legal values for start and end are 0 and 1 respectively. */
-    virtual void pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)
+    virtual void
+    pretranspose_B_array_part(void *out, const To *in, const int row_stride, const int multi_stride, size_t, size_t)
     {
         pretranspose_B_array(out, in, row_stride, multi_stride);
     };
 
-    void pretranspose_B_array_part_generic(void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override
+    void pretranspose_B_array_part_generic(
+        void *out, const void *in, const int row_stride, const int multi_stride, size_t start, size_t end) override
     {
         pretranspose_B_array_part(out, static_cast<const To *>(in), row_stride, multi_stride, start, end);
     }
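
Note: set_arrays_generic/set_arrays above is a type-erasure pair: the non-template base class exposes a void * entry point, and the templated subclass casts and forwards to the typed method. The pattern in miniature (all names here are illustrative):

    struct IKernelBase
    {
        virtual void set_input_generic(const void *p) = 0;
        virtual ~IKernelBase()                        = default;
    };

    template <typename T>
    struct KernelBase : IKernelBase
    {
        void set_input(const T *p)
        {
            _ptr = p; // typed path used by callers that know T
        }
        void set_input_generic(const void *p) override
        {
            set_input(static_cast<const T *>(p)); // cast-and-forward, as in set_arrays_generic
        }
        const T *_ptr = nullptr;
    };
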
diff --git a/src/cpu/kernels/assembly/ndrange.hpp b/src/cpu/kernels/assembly/ndrange.hpp
index 1c8261a..baccdc0 100644
--- a/src/cpu/kernels/assembly/ndrange.hpp
+++ b/src/cpu/kernels/assembly/ndrange.hpp
@@ -45,8 +45,7 @@
         unsigned int   m_end = 0;
 
     public:
-        NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e)
-            : m_parent(p), m_pos(s), m_end(e)
+        NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e)
         {
         }
 
@@ -59,12 +58,12 @@
         {
             unsigned int r = m_pos;
 
-            if(d < (D - 1))
+            if (d < (D - 1))
             {
                 r %= m_parent.m_totalsizes[d];
             }
 
-            if(d > 0)
+            if (d > 0)
             {
                 r /= m_parent.m_totalsizes[d - 1];
             }
@@ -98,9 +97,9 @@
     {
         unsigned int t = 1;
 
-        for(unsigned int i = 0; i < D; i++)
+        for (unsigned int i = 0; i < D; i++)
         {
-            if(m_sizes[i] == 0)
+            if (m_sizes[i] == 0)
             {
                 m_sizes[i] = 1;
             }
@@ -116,14 +115,12 @@
     NDRange(const NDRange &rhs)            = default;
 
     template <typename... T>
-    NDRange(T... ts)
-        : m_sizes{ ts... }
+    NDRange(T... ts) : m_sizes{ts...}
     {
         set_totalsizes();
     }
 
-    NDRange(const std::array<unsigned int, D> &n)
-        : m_sizes(n)
+    NDRange(const std::array<unsigned int, D> &n) : m_sizes(n)
     {
         set_totalsizes();
     }
@@ -163,7 +160,7 @@
         std::array<int_t, N> sizes{};
 
         std::size_t i = 0;
-        for(auto &p : list)
+        for (auto &p : list)
         {
             m_positions[i] = p.first;
             sizes[i++]     = p.second;
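
Note: the coordinate decode in the NDRangeIterator hunk above turns a flat position into a per-dimension coordinate using the cumulative size products held in m_totalsizes. The same arithmetic as a free function (unflatten is an illustrative name):

    #include <array>
    #include <cstddef>

    template <std::size_t D>
    std::array<unsigned int, D> unflatten(unsigned int pos, const std::array<unsigned int, D> &totalsizes)
    {
        // totalsizes[d] is the product of sizes 0..d, so dimension d's coordinate is
        // (pos % totalsizes[d]) / totalsizes[d - 1], skipping the modulo for the last
        // dimension and the divide for the first.
        std::array<unsigned int, D> coord{};
        for (std::size_t d = 0; d < D; ++d)
        {
            unsigned int r = pos;
            if (d < D - 1)
            {
                r %= totalsizes[d];
            }
            if (d > 0)
            {
                r /= totalsizes[d - 1];
            }
            coord[d] = r;
        }
        return coord;
    }
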
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
index 5661479..dbdec5f 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp16.cpp
@@ -29,7 +29,11 @@
 {
 namespace cpu
 {
-void neon_fp16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void neon_fp16_boundingboxtransform(const ITensor           *boxes,
+                                    ITensor                 *pred_boxes,
+                                    const ITensor           *deltas,
+                                    BoundingBoxTransformInfo bbinfo,
+                                    const Window            &window)
 {
     return bounding_box_transform<float16_t>(boxes, pred_boxes, deltas, bbinfo, window);
 }
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
index 34ff922..0224b34 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/fp32.cpp
@@ -26,7 +26,11 @@
 {
 namespace cpu
 {
-void neon_fp32_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void neon_fp32_boundingboxtransform(const ITensor           *boxes,
+                                    ITensor                 *pred_boxes,
+                                    const ITensor           *deltas,
+                                    BoundingBoxTransformInfo bbinfo,
+                                    const Window            &window)
 {
     return bounding_box_transform<float>(boxes, pred_boxes, deltas, bbinfo, window);
 }
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp
index b3ffd0a..5a2939b 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.cpp
@@ -29,7 +29,11 @@
 {
 namespace cpu
 {
-void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void bounding_box_transform_qsymm16(const ITensor           *boxes,
+                                    ITensor                 *pred_boxes,
+                                    const ITensor           *deltas,
+                                    BoundingBoxTransformInfo bbinfo,
+                                    const Window            &window)
 
 {
     const size_t num_classes  = deltas->info()->tensor_shape()[0] >> 2;
@@ -41,7 +45,8 @@
     const auto scale_before = bbinfo.scale();
     const auto offset       = (bbinfo.correct_transform_coords() ? 1.f : 0.f);
 
-    auto pred_ptr  = reinterpret_cast<uint16_t *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes());
+    auto pred_ptr =
+        reinterpret_cast<uint16_t *>(pred_boxes->buffer() + pred_boxes->info()->offset_first_element_in_bytes());
     auto delta_ptr = reinterpret_cast<uint8_t *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes());
 
     const auto boxes_qinfo  = boxes->info()->quantization_info().uniform();
@@ -49,41 +54,49 @@
     const auto pred_qinfo   = pred_boxes->info()->quantization_info().uniform();
 
     Iterator box_it(boxes, window);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto  ptr    = reinterpret_cast<uint16_t *>(box_it.ptr());
-        const auto  b0     = dequantize_qasymm16(*ptr, boxes_qinfo);
-        const auto  b1     = dequantize_qasymm16(*(ptr + 1), boxes_qinfo);
-        const auto  b2     = dequantize_qasymm16(*(ptr + 2), boxes_qinfo);
-        const auto  b3     = dequantize_qasymm16(*(ptr + 3), boxes_qinfo);
-        const float width  = (b2 / scale_before) - (b0 / scale_before) + 1.f;
-        const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f;
-        const float ctr_x  = (b0 / scale_before) + 0.5f * width;
-        const float ctr_y  = (b1 / scale_before) + 0.5f * height;
-        for(size_t j = 0; j < num_classes; ++j)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            // Extract deltas
-            const size_t delta_id = id.y() * deltas_width + 4u * j;
-            const float  dx       = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0];
-            const float  dy       = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1];
-            float        dw       = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2];
-            float        dh       = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3];
-            // Clip dw and dh
-            dw = std::min(dw, bbinfo.bbox_xform_clip());
-            dh = std::min(dh, bbinfo.bbox_xform_clip());
-            // Determine the predictions
-            const float pred_ctr_x = dx * width + ctr_x;
-            const float pred_ctr_y = dy * height + ctr_y;
-            const float pred_w     = std::exp(dw) * width;
-            const float pred_h     = std::exp(dh) * height;
-            // Store the prediction into the output tensor
-            pred_ptr[delta_id]     = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo);
-            pred_ptr[delta_id + 1] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo);
-            pred_ptr[delta_id + 2] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f), pred_qinfo);
-            pred_ptr[delta_id + 3] = quantize_qasymm16(scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f), pred_qinfo);
-        }
-    },
-    box_it);
+            const auto  ptr    = reinterpret_cast<uint16_t *>(box_it.ptr());
+            const auto  b0     = dequantize_qasymm16(*ptr, boxes_qinfo);
+            const auto  b1     = dequantize_qasymm16(*(ptr + 1), boxes_qinfo);
+            const auto  b2     = dequantize_qasymm16(*(ptr + 2), boxes_qinfo);
+            const auto  b3     = dequantize_qasymm16(*(ptr + 3), boxes_qinfo);
+            const float width  = (b2 / scale_before) - (b0 / scale_before) + 1.f;
+            const float height = (b3 / scale_before) - (b1 / scale_before) + 1.f;
+            const float ctr_x  = (b0 / scale_before) + 0.5f * width;
+            const float ctr_y  = (b1 / scale_before) + 0.5f * height;
+            for (size_t j = 0; j < num_classes; ++j)
+            {
+                // Extract deltas
+                const size_t delta_id = id.y() * deltas_width + 4u * j;
+                const float  dx       = dequantize_qasymm8(delta_ptr[delta_id], deltas_qinfo) / bbinfo.weights()[0];
+                const float  dy       = dequantize_qasymm8(delta_ptr[delta_id + 1], deltas_qinfo) / bbinfo.weights()[1];
+                float        dw       = dequantize_qasymm8(delta_ptr[delta_id + 2], deltas_qinfo) / bbinfo.weights()[2];
+                float        dh       = dequantize_qasymm8(delta_ptr[delta_id + 3], deltas_qinfo) / bbinfo.weights()[3];
+                // Clip dw and dh
+                dw = std::min(dw, bbinfo.bbox_xform_clip());
+                dh = std::min(dh, bbinfo.bbox_xform_clip());
+                // Determine the predictions
+                const float pred_ctr_x = dx * width + ctr_x;
+                const float pred_ctr_y = dy * height + ctr_y;
+                const float pred_w     = std::exp(dw) * width;
+                const float pred_h     = std::exp(dh) * height;
+                // Store the prediction into the output tensor
+                pred_ptr[delta_id] = quantize_qasymm16(
+                    scale_after * utility::clamp<float>(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f), pred_qinfo);
+                pred_ptr[delta_id + 1] = quantize_qasymm16(
+                    scale_after * utility::clamp<float>(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f), pred_qinfo);
+                pred_ptr[delta_id + 2] = quantize_qasymm16(
+                    scale_after * utility::clamp<float>(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f),
+                    pred_qinfo);
+                pred_ptr[delta_id + 3] = quantize_qasymm16(
+                    scale_after * utility::clamp<float>(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f),
+                    pred_qinfo);
+            }
+        },
+        box_it);
 }
 } // namespace cpu
 } // namespace arm_compute
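
Note: stripped of quantization, the per-class delta weights and the scale_before/scale_after factors, the decode performed in the qsymm16 loop above (and in the templated float variant that follows) is the standard delta-to-box transform. A scalar restatement under those simplifications (decode_box is illustrative):

    #include <algorithm>
    #include <array>
    #include <cmath>

    std::array<float, 4> decode_box(const std::array<float, 4> &box,   // x1, y1, x2, y2
                                    const std::array<float, 4> &delta, // dx, dy, dw, dh
                                    float clip, float img_w, float img_h)
    {
        const float w   = box[2] - box[0] + 1.f;
        const float h   = box[3] - box[1] + 1.f;
        const float cx  = box[0] + 0.5f * w;
        const float cy  = box[1] + 0.5f * h;
        const float dw  = std::min(delta[2], clip); // bbox_xform_clip caps exp() growth
        const float dh  = std::min(delta[3], clip);
        const float pcx = delta[0] * w + cx;
        const float pcy = delta[1] * h + cy;
        const float pw  = std::exp(dw) * w;
        const float ph  = std::exp(dh) * h;
        auto clampf = [](float v, float lo, float hi) { return std::min(std::max(v, lo), hi); };
        return {clampf(pcx - 0.5f * pw, 0.f, img_w - 1.f), clampf(pcy - 0.5f * ph, 0.f, img_h - 1.f),
                clampf(pcx + 0.5f * pw, 0.f, img_w - 1.f), clampf(pcy + 0.5f * ph, 0.f, img_h - 1.f)};
    }
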
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h
index 7f99039..d8013c6 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/impl.h
@@ -30,7 +30,11 @@
 namespace cpu
 {
 template <typename T>
-void bounding_box_transform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void bounding_box_transform(const ITensor           *boxes,
+                            ITensor                 *pred_boxes,
+                            const ITensor           *deltas,
+                            BoundingBoxTransformInfo bbinfo,
+                            const Window            &window)
 {
     const size_t num_classes  = deltas->info()->tensor_shape()[0] >> 2;
     const size_t deltas_width = deltas->info()->tensor_shape()[0];
@@ -46,44 +50,53 @@
     auto delta_ptr = reinterpret_cast<T *>(deltas->buffer() + deltas->info()->offset_first_element_in_bytes());
 
     Iterator box_it(boxes, window);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto ptr    = reinterpret_cast<T *>(box_it.ptr());
-        const auto b0     = *ptr;
-        const auto b1     = *(ptr + 1);
-        const auto b2     = *(ptr + 2);
-        const auto b3     = *(ptr + 3);
-        const T    width  = (b2 / scale_before) - (b0 / scale_before) + T(1.f);
-        const T    height = (b3 / scale_before) - (b1 / scale_before) + T(1.f);
-        const T    ctr_x  = (b0 / scale_before) + T(0.5f) * width;
-        const T    ctr_y  = (b1 / scale_before) + T(0.5f) * height;
-        for(size_t j = 0; j < num_classes; ++j)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            // Extract deltas
-            const size_t delta_id = id.y() * deltas_width + 4u * j;
-            const T      dx       = delta_ptr[delta_id] / T(bbinfo.weights()[0]);
-            const T      dy       = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]);
-            T            dw       = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]);
-            T            dh       = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]);
-            // Clip dw and dh
-            dw = std::min(dw, T(bbinfo.bbox_xform_clip()));
-            dh = std::min(dh, T(bbinfo.bbox_xform_clip()));
-            // Determine the predictions
-            const T pred_ctr_x = dx * width + ctr_x;
-            const T pred_ctr_y = dy * height + ctr_y;
-            const T pred_w     = std::exp(dw) * width;
-            const T pred_h     = std::exp(dh) * height;
-            // Store the prediction into the output tensor
-            pred_ptr[delta_id]     = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
-            pred_ptr[delta_id + 1] = scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
-            pred_ptr[delta_id + 2] = scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
-            pred_ptr[delta_id + 3] = scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
-        }
-    },
-    box_it);
+            const auto ptr    = reinterpret_cast<T *>(box_it.ptr());
+            const auto b0     = *ptr;
+            const auto b1     = *(ptr + 1);
+            const auto b2     = *(ptr + 2);
+            const auto b3     = *(ptr + 3);
+            const T    width  = (b2 / scale_before) - (b0 / scale_before) + T(1.f);
+            const T    height = (b3 / scale_before) - (b1 / scale_before) + T(1.f);
+            const T    ctr_x  = (b0 / scale_before) + T(0.5f) * width;
+            const T    ctr_y  = (b1 / scale_before) + T(0.5f) * height;
+            for (size_t j = 0; j < num_classes; ++j)
+            {
+                // Extract deltas
+                const size_t delta_id = id.y() * deltas_width + 4u * j;
+                const T      dx       = delta_ptr[delta_id] / T(bbinfo.weights()[0]);
+                const T      dy       = delta_ptr[delta_id + 1] / T(bbinfo.weights()[1]);
+                T            dw       = delta_ptr[delta_id + 2] / T(bbinfo.weights()[2]);
+                T            dh       = delta_ptr[delta_id + 3] / T(bbinfo.weights()[3]);
+                // Clip dw and dh
+                dw = std::min(dw, T(bbinfo.bbox_xform_clip()));
+                dh = std::min(dh, T(bbinfo.bbox_xform_clip()));
+                // Determine the predictions
+                const T pred_ctr_x = dx * width + ctr_x;
+                const T pred_ctr_y = dy * height + ctr_y;
+                const T pred_w     = std::exp(dw) * width;
+                const T pred_h     = std::exp(dh) * height;
+                // Store the prediction into the output tensor
+                pred_ptr[delta_id] = scale_after * utility::clamp<T>(pred_ctr_x - T(0.5f) * pred_w, T(0), T(img_w - 1));
+                pred_ptr[delta_id + 1] =
+                    scale_after * utility::clamp<T>(pred_ctr_y - T(0.5f) * pred_h, T(0), T(img_h - 1));
+                pred_ptr[delta_id + 2] =
+                    scale_after * utility::clamp<T>(pred_ctr_x + T(0.5f) * pred_w - offset, T(0), T(img_w - 1));
+                pred_ptr[delta_id + 3] =
+                    scale_after * utility::clamp<T>(pred_ctr_y + T(0.5f) * pred_h - offset, T(0), T(img_h - 1));
+            }
+        },
+        box_it);
 }
 
-void bounding_box_transform_qsymm16(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window);
+void bounding_box_transform_qsymm16(const ITensor           *boxes,
+                                    ITensor                 *pred_boxes,
+                                    const ITensor           *deltas,
+                                    BoundingBoxTransformInfo bbinfo,
+                                    const Window            &window);
 } // namespace cpu
 } // namespace arm_compute
 #endif //define SRC_CORE_SVE_KERNELS_BOUNDINGBOXTRANFORM_IMPL_H
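
For reference while reviewing the hunk above: the loop body is the standard R-CNN box-delta decoding, and only its layout changes in this commit. A minimal scalar sketch of the same arithmetic (illustrative names, not library API; the kernel additionally divides inputs by scale_before and multiplies results by scale_after):

    #include <algorithm>
    #include <cmath>

    struct Box { float x1, y1, x2, y2; };

    // Per-class decoding as done in the loop above; dx/dy/dw/dh are the
    // deltas already divided by bbinfo.weights().
    Box decode_box(Box b, float dx, float dy, float dw, float dh,
                   float img_w, float img_h, float xform_clip, float offset)
    {
        const float width  = b.x2 - b.x1 + 1.f;
        const float height = b.y2 - b.y1 + 1.f;
        const float ctr_x  = b.x1 + 0.5f * width;
        const float ctr_y  = b.y1 + 0.5f * height;
        dw = std::min(dw, xform_clip); // clip so std::exp below cannot overflow
        dh = std::min(dh, xform_clip);
        const float pred_ctr_x = dx * width + ctr_x;
        const float pred_ctr_y = dy * height + ctr_y;
        const float pred_w     = std::exp(dw) * width;
        const float pred_h     = std::exp(dh) * height;
        const auto clamp = [](float v, float lo, float hi) { return std::max(lo, std::min(v, hi)); };
        return {clamp(pred_ctr_x - 0.5f * pred_w, 0.f, img_w - 1.f),
                clamp(pred_ctr_y - 0.5f * pred_h, 0.f, img_h - 1.f),
                clamp(pred_ctr_x + 0.5f * pred_w - offset, 0.f, img_w - 1.f),
                clamp(pred_ctr_y + 0.5f * pred_h - offset, 0.f, img_h - 1.f)};
    }
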
diff --git a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp
index b27c187..64ef815 100644
--- a/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp
+++ b/src/cpu/kernels/boundingboxtransform/generic/neon/qsymm16.cpp
@@ -26,7 +26,11 @@
 {
 namespace cpu
 {
-void neon_qu16_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+void neon_qu16_boundingboxtransform(const ITensor           *boxes,
+                                    ITensor                 *pred_boxes,
+                                    const ITensor           *deltas,
+                                    BoundingBoxTransformInfo bbinfo,
+                                    const Window            &window)
 {
     return bounding_box_transform_qsymm16(boxes, pred_boxes, deltas, bbinfo, window);
 }
diff --git a/src/cpu/kernels/boundingboxtransform/list.h b/src/cpu/kernels/boundingboxtransform/list.h
index 8f06acc..4da725a 100644
--- a/src/cpu/kernels/boundingboxtransform/list.h
+++ b/src/cpu/kernels/boundingboxtransform/list.h
@@ -27,8 +27,9 @@
 {
 namespace cpu
 {
-#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name) \
-    void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, const Window &window)
+#define DECLARE_BOUNDINGBOXTRANFORM_KERNEL(func_name)                                                                 \
+    void func_name(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, BoundingBoxTransformInfo bbinfo, \
+                   const Window &window)
 DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp32_boundingboxtransform);
 DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_fp16_boundingboxtransform);
 DECLARE_BOUNDINGBOXTRANFORM_KERNEL(neon_qu16_boundingboxtransform);
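
The macro reformat above is layout-only; each use still expands to a plain prototype. The first declaration, for example, expands to:

    void neon_fp32_boundingboxtransform(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas,
                                        BoundingBoxTransformInfo bbinfo, const Window &window);
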
diff --git a/src/cpu/kernels/cast/generic/neon/fp16.cpp b/src/cpu/kernels/cast/generic/neon/fp16.cpp
index 6cd0c85..2897f4b 100644
--- a/src/cpu/kernels/cast/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/cast/generic/neon/fp16.cpp
@@ -25,8 +25,9 @@
 
 #include "arm_compute/core/CPP/CPPTypes.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "src/cpu/kernels/CpuCastKernel.h"
+
 #include "src/cpu/kernels/cast/list.h"
+#include "src/cpu/kernels/CpuCastKernel.h"
 #include "support/SaturateCast.h"
 
 #include "arm_neon.h"
@@ -35,7 +36,8 @@
 {
 namespace cpu
 {
-void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_qasymm8_signed_to_fp16_cast(
+    const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_UNUSED(_policy);
@@ -49,42 +51,39 @@
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     Iterator src(_src, win);
     Iterator dst(_dst, win);
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
-        const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-        int        x       = window_start_x;
-
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
+            const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
+            const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+            int        x       = window_start_x;
 
-            const int16x8x2_t texels =
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                {
-                    vmovl_s8(vget_low_s8(texels_s8)),
-                    vmovl_s8(vget_high_s8(texels_s8))
-                }
-            };
-            vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
-            vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
-        }
+                const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
-        }
-    },
-    src, dst);
+                const int16x8x2_t texels = {{vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
+                vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
+                vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+            }
+        },
+        src, dst);
 }
 
-void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_s32_to_fp16_cast(
+    const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_UNUSED(_policy);
@@ -98,44 +97,41 @@
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     Iterator src(_src, win);
     Iterator dst(_dst, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
-        const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            const float32x4x4_t texels =
+            const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
+            const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                {
-                    vcvtq_f32_s32(vld1q_s32(src_ptr + x)),
-                    vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
-                    vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)),
-                    vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))
-                }
-            };
+                const float32x4x4_t texels = {
+                    {vcvtq_f32_s32(vld1q_s32(src_ptr + x)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
+                     vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)), vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))}};
 
-            vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
-            vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
-        }
+                vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+                vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+            }
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
-        }
-    },
-    src, dst);
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+            }
+        },
+        src, dst);
 }
 
-void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_fp32_to_fp16_cast(
+    const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_UNUSED(_policy);
@@ -149,44 +145,40 @@
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     Iterator src(_src, win);
     Iterator dst(_dst, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
-        const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            const float32x4x4_t texels =
+            const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
+            const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
+
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                {
-                    vld1q_f32(src_ptr + x),
-                    vld1q_f32(src_ptr + x + 4),
-                    vld1q_f32(src_ptr + x + 8),
-                    vld1q_f32(src_ptr + x + 12)
-                }
-            };
+                const float32x4x4_t texels = {{vld1q_f32(src_ptr + x), vld1q_f32(src_ptr + x + 4),
+                                               vld1q_f32(src_ptr + x + 8), vld1q_f32(src_ptr + x + 12)}};
 
-            vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
-            vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
-        }
+                vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
+                vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
+            }
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
-        }
-    },
-    src, dst);
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+            }
+        },
+        src, dst);
 }
 
-void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_fp16_to_other_dt_cast(
+    const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_UNUSED(_policy);
@@ -200,142 +192,133 @@
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     Iterator src(_src, win);
     Iterator dst(_dst, win);
-    switch(_dst->info()->data_type())
+    switch (_dst->info()->data_type())
     {
         case DataType::QASYMM8_SIGNED:
         {
             /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
-            execute_window_loop(win, [&](const Coordinates &)
-            {
-                const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
-                const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
-
-                int x = window_start_x;
-                for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            execute_window_loop(
+                win,
+                [&](const Coordinates &)
                 {
-                    const float16x8x2_t texels =
+                    const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+                    const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
+
+                    int x = window_start_x;
+                    for (; x <= (window_end_x - window_step_x); x += window_step_x)
                     {
-                        {
+                        const float16x8x2_t texels = {{
                             vld1q_f16(src_ptr + x),
                             vld1q_f16(src_ptr + x + 8),
-                        }
-                    };
+                        }};
 
-                    vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
-                }
+                        vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])),
+                                                          vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
+                    }
 
-                // Compute left-over elements
-                for(; x < window_end_x; ++x)
-                {
-                    *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
-                }
-            },
-            src, dst);
+                    // Compute left-over elements
+                    for (; x < window_end_x; ++x)
+                    {
+                        *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
+                    }
+                },
+                src, dst);
             break;
         }
         case DataType::QASYMM8:
         case DataType::U8:
         {
             /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
-            execute_window_loop(win, [&](const Coordinates &)
-            {
-                const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
-                const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
-
-                int x = window_start_x;
-                for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            execute_window_loop(
+                win,
+                [&](const Coordinates &)
                 {
-                    const float16x8x2_t texels =
+                    const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+                    const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
+
+                    int x = window_start_x;
+                    for (; x <= (window_end_x - window_step_x); x += window_step_x)
                     {
-                        {
+                        const float16x8x2_t texels = {{
                             vld1q_f16(src_ptr + x),
                             vld1q_f16(src_ptr + x + 8),
-                        }
-                    };
+                        }};
 
-                    vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
-                }
+                        vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])),
+                                                          vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
+                    }
 
-                // Compute left-over elements
-                for(; x < window_end_x; ++x)
-                {
-                    *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
-                }
-
-            },
-            src, dst);
+                    // Compute left-over elements
+                    for (; x < window_end_x; ++x)
+                    {
+                        *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
+                    }
+                },
+                src, dst);
             break;
         }
         case DataType::F32:
         {
             /* Up-conversion F16 -> F32 */
-            execute_window_loop(win, [&](const Coordinates &)
-            {
-                const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
-                const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
-
-                int x = window_start_x;
-                for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            execute_window_loop(
+                win,
+                [&](const Coordinates &)
                 {
-                    const float16x8x2_t texels =
+                    const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+                    const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
+
+                    int x = window_start_x;
+                    for (; x <= (window_end_x - window_step_x); x += window_step_x)
                     {
-                        {
-                            vld1q_f16(src_ptr + x),
-                            vld1q_f16(src_ptr + x + 8)
-                        }
-                    };
-                    vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
-                    vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
-                    vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
-                    vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
-                }
+                        const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}};
+                        vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
+                        vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
+                        vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
+                        vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
+                    }
 
-                // Compute left-over elements
-                for(; x < window_end_x; ++x)
-                {
-                    *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
-                }
-            },
-            src, dst);
+                    // Compute left-over elements
+                    for (; x < window_end_x; ++x)
+                    {
+                        *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
+                    }
+                },
+                src, dst);
             break;
         }
         case DataType::S32:
         {
             /* Up-conversion F16 -> S32 */
-            execute_window_loop(win, [&](const Coordinates &)
-            {
-                const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
-                const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
-
-                int x = window_start_x;
-                for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            execute_window_loop(
+                win,
+                [&](const Coordinates &)
                 {
-                    const float16x8x2_t texels =
+                    const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
+                    const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
+
+                    int x = window_start_x;
+                    for (; x <= (window_end_x - window_step_x); x += window_step_x)
                     {
-                        {
-                            vld1q_f16(src_ptr + x),
-                            vld1q_f16(src_ptr + x + 8)
-                        }
-                    };
+                        const float16x8x2_t texels = {{vld1q_f16(src_ptr + x), vld1q_f16(src_ptr + x + 8)}};
 
-                    vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
-                    vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
-                    vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
-                    vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
-                }
+                        vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
+                        vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
+                        vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
+                        vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
+                    }
 
-                // Compute left-over elements
-                for(; x < window_end_x; ++x)
-                {
-                    *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
-                }
-            },
-            src, dst);
+                    // Compute left-over elements
+                    for (; x < window_end_x; ++x)
+                    {
+                        *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
+                    }
+                },
+                src, dst);
             break;
         }
         default:
@@ -343,7 +326,8 @@
     }
 }
 
-void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
+void neon_u8_to_fp16_cast(
+    const ITensor *_src, ITensor *_dst, const ThreadInfo &info, ConvertPolicy _policy, const Window &window)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_UNUSED(_policy);
@@ -357,40 +341,37 @@
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
 
     Iterator src(_src, win);
     Iterator dst(_dst, win);
     /* Up-conversion U8 -> F16 */
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
-        const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
-
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
+            const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
+            const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
 
-            const int16x8x2_t texels =
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                {
-                    vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
-                    vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
-                }
-            };
-            vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
-            vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
-        }
+                const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
-        }
-    },
-    src, dst);
+                const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
+                                             vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
+                vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
+                vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
+            }
+        },
+        src, dst);
     return;
 }
 
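Every cast kernel in this file follows the same shape after reformatting: a NEON main loop consuming window_step_x (16) elements per iteration, then a scalar loop over the leftover tail. A standalone sketch of the idiom, mirroring the U8 -> F16 path (hypothetical function, assumes an AArch64 toolchain with FP16 support):

    #include <arm_neon.h>
    #include <cstdint>

    void u8_to_f16_1d(const uint8_t *src, float16_t *dst, int start, int end)
    {
        const int step = 16; // elements per vector iteration, as in the kernels above
        int       x    = start;
        for (; x <= end - step; x += step)
        {
            const uint8x16_t  texels_u8 = vld1q_u8(src + x);
            const int16x8x2_t texels    = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
                                            vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
            vst1q_f16(dst + x, vcvtq_f16_s16(texels.val[0]));
            vst1q_f16(dst + x + 8, vcvtq_f16_s16(texels.val[1]));
        }
        for (; x < end; ++x) // leftover elements that do not fill a full vector
        {
            dst[x] = static_cast<float16_t>(src[x]);
        }
    }
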
diff --git a/src/cpu/kernels/cast/list.h b/src/cpu/kernels/cast/list.h
index ffd82d5..5e634fc 100644
--- a/src/cpu/kernels/cast/list.h
+++ b/src/cpu/kernels/cast/list.h
@@ -27,8 +27,9 @@
 {
 namespace cpu
 {
-#define DECLARE_CAST_KERNEL(func_name) \
-    void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
+#define DECLARE_CAST_KERNEL(func_name)                                                                  \
+    void func_name(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, \
+                   const Window &window)
 
 DECLARE_CAST_KERNEL(neon_fp32_to_fp16_cast);
 DECLARE_CAST_KERNEL(neon_u8_to_fp16_cast);
@@ -41,4 +42,4 @@
 #undef DECLARE_CAST_KERNEL
 } // namespace cpu
 } // namespace arm_compute
-#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H
\ No newline at end of file
+#endif //SRC_CORE_NEON_KERNELS_CAST_LIST_H
diff --git a/src/cpu/kernels/conv3d/neon/list.h b/src/cpu/kernels/conv3d/neon/list.h
index 3bfa124..082c60b 100644
--- a/src/cpu/kernels/conv3d/neon/list.h
+++ b/src/cpu/kernels/conv3d/neon/list.h
@@ -27,8 +27,9 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/kernels/conv3d/neon/quantized.h"
 
 namespace arm_compute
@@ -36,7 +37,12 @@
 namespace cpu
 {
 template <typename T>
-void directconv3d_float_neon_ndhwc(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo &conv_info, const Window &window)
+void directconv3d_float_neon_ndhwc(const ITensor    *src0,
+                                   const ITensor    *src1,
+                                   const ITensor    *src2,
+                                   ITensor          *dst,
+                                   const Conv3dInfo &conv_info,
+                                   const Window     &window)
 {
     const ITensor *src     = src0;
     const ITensor *weights = src1;
@@ -88,91 +94,104 @@
     Iterator wei(weights, window_w);
 
     const T *biases_ptr = nullptr;
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         biases_ptr = reinterpret_cast<T *>(biases->buffer() + biases->info()->offset_first_element_in_bytes());
     }
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        // We are computing the theoretical input starting points
-        const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
-        const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
-        const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
-        const int in_w_end_t   = in_w_start_t + kernel_dim_w;
-        const int in_h_end_t   = in_h_start_t + kernel_dim_h;
-        const int in_d_end_t   = in_d_start_t + kernel_dim_d;
-
-        // We are computing the valid initial and ending input points by checking the borders
-        const int in_w_start = std::max(in_w_start_t, 0);
-        const int in_h_start = std::max(in_h_start_t, 0);
-        const int in_d_start = std::max(in_d_start_t, 0);
-        const int in_w_end   = std::min(in_w_end_t, input_dim_w);
-        const int in_h_end   = std::min(in_h_end_t, input_dim_h);
-        const int in_d_end   = std::min(in_d_end_t, input_dim_d);
-
-        // We use the input points to select the valid weight points to use
-        const int wei_w_start = in_w_start - in_w_start_t;
-        const int wei_h_start = in_h_start - in_h_start_t;
-        const int wei_d_start = in_d_start - in_d_start_t;
-        const int wei_w_end   = kernel_dim_w - (in_w_end_t - in_w_end);
-        const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);
-        const int wei_d_end   = kernel_dim_d - (in_d_end_t - in_d_end);
-
-        const int      index_c_out_end = weights->info()->dimension(0);
-        const int      index_c_in_end  = weights->info()->dimension(1);
-        const T *const in_ptr_start    = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[4] * input_stride_n;
-
-        execute_window_loop(window_w, [&](const Coordinates & id_w)
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
         {
-            /*
+            // We are computing the theoretical input starting points
+            const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+            const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+            const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
+            const int in_w_end_t   = in_w_start_t + kernel_dim_w;
+            const int in_h_end_t   = in_h_start_t + kernel_dim_h;
+            const int in_d_end_t   = in_d_start_t + kernel_dim_d;
+
+            // We are computing the valid initial and ending input points by checking the borders
+            const int in_w_start = std::max(in_w_start_t, 0);
+            const int in_h_start = std::max(in_h_start_t, 0);
+            const int in_d_start = std::max(in_d_start_t, 0);
+            const int in_w_end   = std::min(in_w_end_t, input_dim_w);
+            const int in_h_end   = std::min(in_h_end_t, input_dim_h);
+            const int in_d_end   = std::min(in_d_end_t, input_dim_d);
+
+            // We use the input points to select the valid weight points to use
+            const int wei_w_start = in_w_start - in_w_start_t;
+            const int wei_h_start = in_h_start - in_h_start_t;
+            const int wei_d_start = in_d_start - in_d_start_t;
+            const int wei_w_end   = kernel_dim_w - (in_w_end_t - in_w_end);
+            const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);
+            const int wei_d_end   = kernel_dim_d - (in_d_end_t - in_d_end);
+
+            const int      index_c_out_end = weights->info()->dimension(0);
+            const int      index_c_in_end  = weights->info()->dimension(1);
+            const T *const in_ptr_start =
+                reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+                id[4] * input_stride_n;
+
+            execute_window_loop(
+                window_w,
+                [&](const Coordinates &id_w)
+                {
+                    /*
             * This is the loop over the weights; it goes along the OFM (output feature map) dimension
             */
-            const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
-            T          out_temp          = static_cast<T>(0);
-            T         *out_ptr           = reinterpret_cast<T *>(out.ptr());
-            for(int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; ++index_wei_d, ++index_in_d)
-            {
-                const auto in_ptr_d      = in_ptr_start + index_in_d * input_stride_d;
-                const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
-                for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
-                {
-                    const T *const in_ptr_row      = in_ptr_d + index_in_h * input_stride_h;
-                    const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
-                    for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
+                    const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+                    T          out_temp          = static_cast<T>(0);
+                    T         *out_ptr           = reinterpret_cast<T *>(out.ptr());
+                    for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end;
+                         ++index_wei_d, ++index_in_d)
                     {
-                        const T    *in_ptr_mover      = in_ptr_row + index_in_w * input_stride_w;
-                        const T    *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
-                        int         index_c_in        = 0;
-                        vector_type out_temp_vec      = wrapper::vdup_n(static_cast<T>(0), tag_type());
-                        vector_type w_vec             = wrapper::vdup_n(static_cast<T>(0), tag_type());
-                        for(; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
-                            index_c_in += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
+                        const auto in_ptr_d      = in_ptr_start + index_in_d * input_stride_d;
+                        const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
+                        for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+                             ++index_wei_h, ++index_in_h)
                         {
-                            const auto src_vec = wrapper::vloadq(in_ptr_mover);
-                            //Load Cin weights
-                            for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end)
+                            const T *const in_ptr_row      = in_ptr_d + index_in_h * input_stride_h;
+                            const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
+                            for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end;
+                                 ++index_wei_w, ++index_in_w)
                             {
-                                w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
+                                const T    *in_ptr_mover      = in_ptr_row + index_in_w * input_stride_w;
+                                const T    *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+                                int         index_c_in        = 0;
+                                vector_type out_temp_vec      = wrapper::vdup_n(static_cast<T>(0), tag_type());
+                                vector_type w_vec             = wrapper::vdup_n(static_cast<T>(0), tag_type());
+                                for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
+                                     index_c_in += num_elems_read_per_iteration,
+                                     in_ptr_mover += num_elems_read_per_iteration)
+                                {
+                                    const auto src_vec = wrapper::vloadq(in_ptr_mover);
+                                    //Load Cin weights
+                                    for (int k = 0; k < num_elems_read_per_iteration;
+                                         ++k, weights_ptr_mover += index_c_out_end)
+                                    {
+                                        w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
+                                    }
+                                    out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+                                }
+                                out_temp += vreduce(out_temp_vec);
+                                for (; index_c_in < index_c_in_end;
+                                     ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
+                                {
+                                    const auto src_val = *(in_ptr_mover);
+                                    const auto w_val   = *(weights_ptr_mover);
+                                    out_temp += src_val * w_val;
+                                }
                             }
-                            out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
-                        }
-                        out_temp += vreduce(out_temp_vec);
-                        for(; index_c_in < index_c_in_end; ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
-                        {
-                            const auto src_val = *(in_ptr_mover);
-                            const auto w_val   = *(weights_ptr_mover);
-                            out_temp += src_val * w_val;
                         }
                     }
-                }
-            }
-            *(reinterpret_cast<T *>(out_ptr + id_w[0])) = (biases_ptr != nullptr) ? out_temp + biases_ptr[id_w[0]] : out_temp;
+                    *(reinterpret_cast<T *>(out_ptr + id_w[0])) =
+                        (biases_ptr != nullptr) ? out_temp + biases_ptr[id_w[0]] : out_temp;
+                },
+                wei);
         },
-        wei);
-    },
-    out);
+        out);
 }
 
 } // namespace cpu
 } // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H
\ No newline at end of file
+#endif // SRC_CORE_NEON_KERNELS_CONV3D_LIST_H
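
The loop nest reformatted above derives, per output coordinate, the valid input range and the matching weight range by clamping a theoretical window against the tensor borders. For one axis in isolation, the relationship is (a sketch, not library code):

    #include <algorithm>

    struct AxisRange
    {
        int in_start, in_end;   // valid input indices [in_start, in_end)
        int wei_start, wei_end; // matching kernel indices [wei_start, wei_end)
    };

    AxisRange valid_range(int out_idx, int stride, int pad, int kernel_dim, int input_dim)
    {
        const int in_start_t = out_idx * stride - pad;  // theoretical start, may be negative
        const int in_end_t   = in_start_t + kernel_dim; // theoretical end, may exceed input_dim
        const int in_start   = std::max(in_start_t, 0);
        const int in_end     = std::min(in_end_t, input_dim);
        return {in_start, in_end,
                in_start - in_start_t,             // kernel taps skipped at the leading border
                kernel_dim - (in_end_t - in_end)}; // kernel taps dropped at the trailing border
    }
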
diff --git a/src/cpu/kernels/conv3d/neon/quantized.h b/src/cpu/kernels/conv3d/neon/quantized.h
index a8165b4..f0fc9b5 100644
--- a/src/cpu/kernels/conv3d/neon/quantized.h
+++ b/src/cpu/kernels/conv3d/neon/quantized.h
@@ -28,16 +28,22 @@
 #include "arm_compute/core/utils/misc/Traits.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
+
+#include "src/core/helpers/WindowHelpers.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
 template <typename T>
-void directconv3d_quantized_neon_ndhwc(const ITensor *src0, const ITensor *src1, const ITensor *src2, ITensor *dst, const Conv3dInfo &conv_info, const Window &window)
+void directconv3d_quantized_neon_ndhwc(const ITensor    *src0,
+                                       const ITensor    *src1,
+                                       const ITensor    *src2,
+                                       ITensor          *dst,
+                                       const Conv3dInfo &conv_info,
+                                       const Window     &window)
 {
     const ITensor *src     = src0;
     const ITensor *weights = src1;
@@ -104,153 +110,166 @@
     Iterator wei(weights, window_w);
 
     const int32_t *biases_ptr = nullptr;
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         biases_ptr = reinterpret_cast<int32_t *>(biases->buffer() + biases->info()->offset_first_element_in_bytes());
     }
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        // We are computing the theoretical input starting points
-        const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
-        const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
-        const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
-        const int in_w_end_t   = in_w_start_t + kernel_dim_w;
-        const int in_h_end_t   = in_h_start_t + kernel_dim_h;
-        const int in_d_end_t   = in_d_start_t + kernel_dim_d;
-
-        // We are computing the valid initial and ending input points by checking the borders
-        const int in_w_start = std::max(in_w_start_t, 0);
-        const int in_h_start = std::max(in_h_start_t, 0);
-        const int in_d_start = std::max(in_d_start_t, 0);
-        const int in_w_end   = std::min(in_w_end_t, input_dim_w);
-        const int in_h_end   = std::min(in_h_end_t, input_dim_h);
-        const int in_d_end   = std::min(in_d_end_t, input_dim_d);
-
-        // We use the input points to select the valid weight points to use
-        const int wei_w_start = in_w_start - in_w_start_t;
-        const int wei_h_start = in_h_start - in_h_start_t;
-        const int wei_d_start = in_d_start - in_d_start_t;
-        const int wei_w_end   = kernel_dim_w - (in_w_end_t - in_w_end);
-        const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);
-        const int wei_d_end   = kernel_dim_d - (in_d_end_t - in_d_end);
-
-        const int      index_c_out_end = weights->info()->dimension(0);
-        const int      index_c_in_end  = weights->info()->dimension(1);
-        const T *const in_ptr_start    = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[4] * input_stride_n;
-
-        execute_window_loop(window_w, [&](const Coordinates & id_w)
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
         {
-            /*
+            // We are computing the theoretical input starting points
+            const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+            const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+            const int in_d_start_t = static_cast<int>(id[3]) * conv_stride_d - conv_pad_front;
+            const int in_w_end_t   = in_w_start_t + kernel_dim_w;
+            const int in_h_end_t   = in_h_start_t + kernel_dim_h;
+            const int in_d_end_t   = in_d_start_t + kernel_dim_d;
+
+            // We are computing the valid initial and ending input points by checking the borders
+            const int in_w_start = std::max(in_w_start_t, 0);
+            const int in_h_start = std::max(in_h_start_t, 0);
+            const int in_d_start = std::max(in_d_start_t, 0);
+            const int in_w_end   = std::min(in_w_end_t, input_dim_w);
+            const int in_h_end   = std::min(in_h_end_t, input_dim_h);
+            const int in_d_end   = std::min(in_d_end_t, input_dim_d);
+
+            // We use the input points to select the valid weight points to use
+            const int wei_w_start = in_w_start - in_w_start_t;
+            const int wei_h_start = in_h_start - in_h_start_t;
+            const int wei_d_start = in_d_start - in_d_start_t;
+            const int wei_w_end   = kernel_dim_w - (in_w_end_t - in_w_end);
+            const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);
+            const int wei_d_end   = kernel_dim_d - (in_d_end_t - in_d_end);
+
+            const int      index_c_out_end = weights->info()->dimension(0);
+            const int      index_c_in_end  = weights->info()->dimension(1);
+            const T *const in_ptr_start =
+                reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+                id[4] * input_stride_n;
+
+            execute_window_loop(
+                window_w,
+                [&](const Coordinates &id_w)
+                {
+                    /*
             * This is the loop over the weights; it goes along the OFM (output feature map) dimension
             */
-            const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
-            int32_t    acc               = static_cast<int32_t>(0);
-            T         *out_ptr           = reinterpret_cast<T *>(out.ptr());
-            for(int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end; ++index_wei_d, ++index_in_d)
-            {
-                const auto in_ptr_d      = in_ptr_start + index_in_d * input_stride_d;
-                const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
-                for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
-                {
-                    const T *const in_ptr_row      = in_ptr_d + index_in_h * input_stride_h;
-                    const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
-                    for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
+                    const auto weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+                    int32_t    acc               = static_cast<int32_t>(0);
+                    T         *out_ptr           = reinterpret_cast<T *>(out.ptr());
+                    for (int index_wei_d = wei_d_start, index_in_d = in_d_start; index_wei_d < wei_d_end;
+                         ++index_wei_d, ++index_in_d)
                     {
-                        const T    *in_ptr_mover      = in_ptr_row + index_in_w * input_stride_w;
-                        const T    *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
-                        int         index_c_in        = 0;
-                        vector_type w_vec             = wrapper::vdup_n(static_cast<T>(0), tag_type());
-
-                        q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
-                        q32x4_t acc_q32_1 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
-                        q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
-                        q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
-
-                        for(; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
-                            index_c_in += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
+                        const auto in_ptr_d      = in_ptr_start + index_in_d * input_stride_d;
+                        const auto weights_ptr_d = weights_ptr_start + index_wei_d * kernel_stride_d;
+                        for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+                             ++index_wei_h, ++index_in_h)
                         {
-                            const auto src_vec = wrapper::vloadq(in_ptr_mover);
-                            //Load Cin weights
-                            for(int k = 0; k < num_elems_read_per_iteration; ++k, weights_ptr_mover += index_c_out_end)
+                            const T *const in_ptr_row      = in_ptr_d + index_in_h * input_stride_h;
+                            const T *const weights_ptr_row = weights_ptr_d + index_wei_h * kernel_stride_h;
+                            for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end;
+                                 ++index_wei_w, ++index_in_w)
                             {
-                                w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
-                            }
-                            q32x4_t src_q32_0 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
-                            q32x4_t src_q32_1 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
-                            q32x4_t src_q32_2 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
-                            q32x4_t src_q32_3 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+                                const T    *in_ptr_mover      = in_ptr_row + index_in_w * input_stride_w;
+                                const T    *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+                                int         index_c_in        = 0;
+                                vector_type w_vec             = wrapper::vdup_n(static_cast<T>(0), tag_type());
 
-                            q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
-                            q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
-                            q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
-                            q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+                                q32x4_t acc_q32_0 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
+                                q32x4_t acc_q32_1 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
+                                q32x4_t acc_q32_2 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
+                                q32x4_t acc_q32_3 = wrapper::vdup_n(static_cast<q32_t>(0), tag_type());
 
-                            const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec));
-                            const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec));
-                            const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec));
-                            const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec));
+                                for (; index_c_in <= index_c_in_end - num_elems_read_per_iteration;
+                                     index_c_in += num_elems_read_per_iteration,
+                                     in_ptr_mover += num_elems_read_per_iteration)
+                                {
+                                    const auto src_vec = wrapper::vloadq(in_ptr_mover);
+                                    //Load Cin weights
+                                    for (int k = 0; k < num_elems_read_per_iteration;
+                                         ++k, weights_ptr_mover += index_c_out_end)
+                                    {
+                                        w_vec = wrapper::vsetlane(*weights_ptr_mover, w_vec, k);
+                                    }
+                                    q32x4_t src_q32_0 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+                                    q32x4_t src_q32_1 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+                                    q32x4_t src_q32_2 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
+                                    q32x4_t src_q32_3 = wrapper::vdup_n(static_cast<q32_t>(input_offset), tag_type());
 
-                            src_q32_0 = wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0)));
-                            src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0)));
-                            src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1)));
-                            src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1)));
+                                    q32x4_t wei_q32_0 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+                                    q32x4_t wei_q32_1 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+                                    q32x4_t wei_q32_2 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
+                                    q32x4_t wei_q32_3 = wrapper::vdup_n(static_cast<q32_t>(weights_offset), tag_type());
 
-                            wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0)));
-                            wei_q32_1 = wrapper::vadd(wei_q32_1, wrapper::vmovl(wrapper::vgethigh(wei_q16_0)));
-                            wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1)));
-                            wei_q32_3 = wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1)));
+                                    const auto src_q16_0 = wrapper::vmovl(wrapper::vgetlow(src_vec));
+                                    const auto src_q16_1 = wrapper::vmovl(wrapper::vgethigh(src_vec));
+                                    const auto wei_q16_0 = wrapper::vmovl(wrapper::vgetlow(w_vec));
+                                    const auto wei_q16_1 = wrapper::vmovl(wrapper::vgethigh(w_vec));
 
-                            acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0);
-                            acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1);
-                            acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2);
-                            acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3);
-                        }
+                                    src_q32_0 = wrapper::vadd(src_q32_0, wrapper::vmovl(wrapper::vgetlow(src_q16_0)));
+                                    src_q32_1 = wrapper::vadd(src_q32_1, wrapper::vmovl(wrapper::vgethigh(src_q16_0)));
+                                    src_q32_2 = wrapper::vadd(src_q32_2, wrapper::vmovl(wrapper::vgetlow(src_q16_1)));
+                                    src_q32_3 = wrapper::vadd(src_q32_3, wrapper::vmovl(wrapper::vgethigh(src_q16_1)));
+
+                                    wei_q32_0 = wrapper::vadd(wei_q32_0, wrapper::vmovl(wrapper::vgetlow(wei_q16_0)));
+                                    wei_q32_1 = wrapper::vadd(wei_q32_1, wrapper::vmovl(wrapper::vgethigh(wei_q16_0)));
+                                    wei_q32_2 = wrapper::vadd(wei_q32_2, wrapper::vmovl(wrapper::vgetlow(wei_q16_1)));
+                                    wei_q32_3 = wrapper::vadd(wei_q32_3, wrapper::vmovl(wrapper::vgethigh(wei_q16_1)));
+
+                                    acc_q32_0 = wrapper::vmla(acc_q32_0, wei_q32_0, src_q32_0);
+                                    acc_q32_1 = wrapper::vmla(acc_q32_1, wei_q32_1, src_q32_1);
+                                    acc_q32_2 = wrapper::vmla(acc_q32_2, wei_q32_2, src_q32_2);
+                                    acc_q32_3 = wrapper::vmla(acc_q32_3, wei_q32_3, src_q32_3);
+                                }
 #if defined(__aarch64__)
-                        acc += wrapper::vaddv(acc_q32_0);
-                        acc += wrapper::vaddv(acc_q32_1);
-                        acc += wrapper::vaddv(acc_q32_2);
-                        acc += wrapper::vaddv(acc_q32_3);
+                                acc += wrapper::vaddv(acc_q32_0);
+                                acc += wrapper::vaddv(acc_q32_1);
+                                acc += wrapper::vaddv(acc_q32_2);
+                                acc += wrapper::vaddv(acc_q32_3);
 #else // __aarch64__
-                        auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0));
-                        temp      = wrapper::vpadd(temp, temp);
-                        acc       += wrapper::vgetlane(temp, 0);
+                                auto temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_0), wrapper::vgetlow(acc_q32_0));
+                                temp      = wrapper::vpadd(temp, temp);
+                                acc += wrapper::vgetlane(temp, 0);
 
-                        temp      = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1));
-                        temp      = wrapper::vpadd(temp, temp);
-                        acc       += wrapper::vgetlane(temp, 0);
+                                temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_1), wrapper::vgetlow(acc_q32_1));
+                                temp = wrapper::vpadd(temp, temp);
+                                acc += wrapper::vgetlane(temp, 0);
 
-                        temp      = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2));
-                        temp      = wrapper::vpadd(temp, temp);
-                        acc       += wrapper::vgetlane(temp, 0);
+                                temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_2), wrapper::vgetlow(acc_q32_2));
+                                temp = wrapper::vpadd(temp, temp);
+                                acc += wrapper::vgetlane(temp, 0);
 
-                        temp      = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3));
-                        temp      = wrapper::vpadd(temp, temp);
-                        acc       += wrapper::vgetlane(temp, 0);
+                                temp = wrapper::vpadd(wrapper::vgethigh(acc_q32_3), wrapper::vgetlow(acc_q32_3));
+                                temp = wrapper::vpadd(temp, temp);
+                                acc += wrapper::vgetlane(temp, 0);
 
 #endif // __aarch64__
 
-                        for(; index_c_in < index_c_in_end; ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
-                        {
-                            const auto src_val = *(in_ptr_mover) + input_offset;
-                            const auto w_val   = *(weights_ptr_mover) + weights_offset;
-                            acc += src_val * w_val;
+                                for (; index_c_in < index_c_in_end;
+                                     ++index_c_in, ++in_ptr_mover, weights_ptr_mover += index_c_out_end)
+                                {
+                                    const auto src_val = *(in_ptr_mover) + input_offset;
+                                    const auto w_val   = *(weights_ptr_mover) + weights_offset;
+                                    acc += src_val * w_val;
+                                }
+                            }
                         }
                     }
-                }
-            }
 
-            if(biases)
-            {
-                acc += *reinterpret_cast<const int32_t *>(biases_ptr + id_w[0]);
-            }
+                    if (biases)
+                    {
+                        acc += *reinterpret_cast<const int32_t *>(biases_ptr + id_w[0]);
+                    }
 
-            T out_val                                   = finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false);
-            *(reinterpret_cast<T *>(out_ptr + id_w[0])) = out_val;
+                    T out_val =
+                        finalize_quantization(acc, output_multiplier, output_shift, output_offset, T(0), T(0), false);
+                    *(reinterpret_cast<T *>(out_ptr + id_w[0])) = out_val;
+                },
+                wei);
         },
-        wei);
-    },
-    out);
+        out);
 }
 } // namespace cpu
 } // namespace arm_compute
-#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H
\ No newline at end of file
+#endif // SRC_CORE_NEON_KERNELS_CONV3D_QUANTIZED_H
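Note on the hunk above: the wholesale re-indentation of the conv3d loop body is clang-format reflowing the execute_window_loop call, treating the lambda as an ordinary trailing argument. The callable moves onto its own continuation line, its body indents relative to the call, and the remaining iterator arguments follow the closing "},". The reference binding also tightens from "& id" to "&id". A minimal before/after sketch of the shape (assuming the revised configuration keeps the clang-format 14 default LambdaBodyIndentation: Signature):

    // before: lambda at call-site indent, trailing args dangling
    execute_window_loop(win, [&](const Coordinates & id)
    {
        // ... body at function indent ...
    },
    out);

    // after: lambda is a continuation argument, body indented under it
    execute_window_loop(
        win,
        [&](const Coordinates &id)
        {
            // ... body indented under the lambda signature ...
        },
        out);
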
diff --git a/src/cpu/kernels/crop/generic/neon/crop_helper.h b/src/cpu/kernels/crop/generic/neon/crop_helper.h
index 1fe8e11..8fb7ad2 100644
--- a/src/cpu/kernels/crop/generic/neon/crop_helper.h
+++ b/src/cpu/kernels/crop/generic/neon/crop_helper.h
@@ -80,7 +80,7 @@
 {
     return vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(wrapper::vload(ptr)))));
 }
-}
+} // namespace cpu
 } // namespace arm_compute
 
-#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H
\ No newline at end of file
+#endif //SRC_CORE_NEON_KERNELS_CROP_CROP_HELPER_H
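The recurring one-line change from "}" to "} // namespace cpu" is FixNamespaceComments at work (it is enabled by default in the LLVM base style): clang-format appends, or repairs, the trailing comment on every namespace-closing brace, e.g.:

    namespace arm_compute
    {
    namespace cpu
    {
    // ...
    } // namespace cpu
    } // namespace arm_compute
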
diff --git a/src/cpu/kernels/crop/generic/neon/fp16.cpp b/src/cpu/kernels/crop/generic/neon/fp16.cpp
index 218ebba..3739c9d 100644
--- a/src/cpu/kernels/crop/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/crop/generic/neon/fp16.cpp
@@ -29,12 +29,19 @@
 {
 namespace cpu
 {
-void fp16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
-                                int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void fp16_in_bounds_crop_window(const ITensor *input,
+                                const ITensor *output,
+                                float         *output_ptr,
+                                Coordinates    input_offset,
+                                int32_t        window_step_x,
+                                int32_t        output_width_start,
+                                int32_t        output_width_limit,
+                                bool           input_has_single_channel,
+                                bool           is_width_flipped)
 {
-    return in_bounds_crop_window<float16_t>(input, output, output_ptr, input_offset,
-                                            window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+    return in_bounds_crop_window<float16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+                                            output_width_limit, input_has_single_channel, is_width_flipped);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/crop/generic/neon/fp32.cpp b/src/cpu/kernels/crop/generic/neon/fp32.cpp
index 16d0218..f665c36 100644
--- a/src/cpu/kernels/crop/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/crop/generic/neon/fp32.cpp
@@ -28,11 +28,18 @@
 {
 namespace cpu
 {
-void fp32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
-                                int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void fp32_in_bounds_crop_window(const ITensor *input,
+                                const ITensor *output,
+                                float         *output_ptr,
+                                Coordinates    input_offset,
+                                int32_t        window_step_x,
+                                int32_t        output_width_start,
+                                int32_t        output_width_limit,
+                                bool           input_has_single_channel,
+                                bool           is_width_flipped)
 {
-    return in_bounds_crop_window<float32_t>(input, output, output_ptr, input_offset,
-                                            window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+    return in_bounds_crop_window<float32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+                                            output_width_limit, input_has_single_channel, is_width_flipped);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
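The signature rewrites in the two crop kernels above all follow one pattern: when a parameter list no longer fits the column limit, every parameter gets its own line and the parameter names are padded into a common column. That is the output of BinPackParameters: false combined with AlignConsecutiveDeclarations; both option values are assumptions about the revised configuration, which is not part of this delivery. Illustrated on a hypothetical declaration:

    // hypothetical declaration, shaped as the revised config would emit it
    void example_crop_window(const ITensor *input,
                             const ITensor *output,
                             float         *output_ptr,    // names aligned into a column
                             int32_t        window_step_x,
                             bool           is_width_flipped);
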
diff --git a/src/cpu/kernels/crop/generic/neon/impl.h b/src/cpu/kernels/crop/generic/neon/impl.h
index a59588b..b90ba9d 100644
--- a/src/cpu/kernels/crop/generic/neon/impl.h
+++ b/src/cpu/kernels/crop/generic/neon/impl.h
@@ -26,8 +26,9 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/kernels/crop/generic/neon/crop_helper.h"
 
 namespace arm_compute
@@ -35,19 +36,26 @@
 namespace cpu
 {
 template <typename T>
-void in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
-                           int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void in_bounds_crop_window(const ITensor *input,
+                           const ITensor *output,
+                           float         *output_ptr,
+                           Coordinates    input_offset,
+                           int32_t        window_step_x,
+                           int32_t        output_width_start,
+                           int32_t        output_width_limit,
+                           bool           input_has_single_channel,
+                           bool           is_width_flipped)
 {
     // Reverse elements if width flipped.
-    if(is_width_flipped)
+    if (is_width_flipped)
     {
         // Collapse first dimension if possible.
-        if(input_has_single_channel)
+        if (input_has_single_channel)
         {
             int32_t     x = output_width_start;
             Coordinates negative_offset(input_offset);
             negative_offset.set(1, negative_offset[1] - window_step_x + 1);
-            for(; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
+            for (; x <= output_width_limit - window_step_x; x += window_step_x, negative_offset[1] -= window_step_x)
             {
                 auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(negative_offset)));
 
@@ -57,25 +65,27 @@
                 wrapper::vstore(output_ptr + x, in);
             }
             input_offset[1] = negative_offset[1] + window_step_x - 1;
-            for(; x < output_width_limit; ++x, --input_offset[1])
+            for (; x < output_width_limit; ++x, --input_offset[1])
             {
                 *(output_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
             }
         }
         else
         {
-            for(int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
+            for (int32_t x = output_width_start; x < output_width_limit; ++x, --input_offset[1])
             {
                 input_offset.set(0, 0);
                 int32_t c = 0;
-                for(; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x; c += window_step_x, input_offset[0] += window_step_x)
+                for (; c <= static_cast<int32_t>(input->info()->dimension(0)) - window_step_x;
+                     c += window_step_x, input_offset[0] += window_step_x)
                 {
                     auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
                     wrapper::vstore(output_ptr + x * output->info()->dimension(0) + c, in);
                 }
-                for(; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
+                for (; c < static_cast<int32_t>(input->info()->dimension(0)); ++c, ++input_offset[0])
                 {
-                    *(output_ptr + x * output->info()->dimension(0) + c) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+                    *(output_ptr + x * output->info()->dimension(0) + c) =
+                        static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
                 }
             }
         }
@@ -83,25 +93,28 @@
     else
     {
         // Use memcpy if the elements don't need converting to float.
-        if(std::is_same<T, float>::value)
+        if (std::is_same<T, float>::value)
         {
             memcpy(static_cast<void *>(output_ptr + output_width_start * output->info()->dimension(0)),
                    reinterpret_cast<const void *>(input->ptr_to_element(input_offset)),
-                   (output_width_limit - output_width_start) * output->info()->dimension(0) * output->info()->element_size());
+                   (output_width_limit - output_width_start) * output->info()->dimension(0) *
+                       output->info()->element_size());
         }
         else
         {
-            int32_t x                = 0;
-            int32_t limit            = (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
+            int32_t x = 0;
+            int32_t limit =
+                (output_width_limit - output_width_start) * static_cast<int32_t>(output->info()->dimension(0));
             float *output_start_ptr = output_ptr + output_width_start * output->info()->dimension(0);
-            for(; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
+            for (; x <= limit - window_step_x; x += window_step_x, input_offset[0] += window_step_x)
             {
                 auto in = load_as_f32(reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
                 wrapper::vstore(output_start_ptr + x, in);
             }
-            for(; x < limit; ++x, ++input_offset[0])
+            for (; x < limit; ++x, ++input_offset[0])
             {
-                *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
+                *(output_start_ptr + x) =
+                    static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
             }
         }
     }
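Two smaller rules account for most of the remaining churn in this file: control-flow keywords now take a space before the parenthesis ("if (", "for (", "switch ("), matching SpaceBeforeParens: ControlStatements, and statements that overflow the column limit now break before the right-hand side of an assignment rather than inside the expression. Taken directly from the hunk above:

    // before
    *(output_start_ptr + x) = static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));

    // after
    *(output_start_ptr + x) =
        static_cast<float>(*reinterpret_cast<T *>(input->ptr_to_element(input_offset)));
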
diff --git a/src/cpu/kernels/crop/generic/neon/integer.cpp b/src/cpu/kernels/crop/generic/neon/integer.cpp
index ebf2c1f..602434f 100644
--- a/src/cpu/kernels/crop/generic/neon/integer.cpp
+++ b/src/cpu/kernels/crop/generic/neon/integer.cpp
@@ -29,46 +29,88 @@
 {
 namespace cpu
 {
-void u8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
-                              int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void u8_in_bounds_crop_window(const ITensor *input,
+                              const ITensor *output,
+                              float         *output_ptr,
+                              Coordinates    input_offset,
+                              int32_t        window_step_x,
+                              int32_t        output_width_start,
+                              int32_t        output_width_limit,
+                              bool           input_has_single_channel,
+                              bool           is_width_flipped)
 {
-    return in_bounds_crop_window<uint8_t>(input, output, output_ptr, input_offset,
-                                          window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+    return in_bounds_crop_window<uint8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+                                          output_width_limit, input_has_single_channel, is_width_flipped);
 }
 
-void u16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
-                               int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void u16_in_bounds_crop_window(const ITensor *input,
+                               const ITensor *output,
+                               float         *output_ptr,
+                               Coordinates    input_offset,
+                               int32_t        window_step_x,
+                               int32_t        output_width_start,
+                               int32_t        output_width_limit,
+                               bool           input_has_single_channel,
+                               bool           is_width_flipped)
 {
-    return in_bounds_crop_window<uint16_t>(input, output, output_ptr, input_offset,
-                                           window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+    return in_bounds_crop_window<uint16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+                                           output_width_limit, input_has_single_channel, is_width_flipped);
 }
 
-void u32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
-                               int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void u32_in_bounds_crop_window(const ITensor *input,
+                               const ITensor *output,
+                               float         *output_ptr,
+                               Coordinates    input_offset,
+                               int32_t        window_step_x,
+                               int32_t        output_width_start,
+                               int32_t        output_width_limit,
+                               bool           input_has_single_channel,
+                               bool           is_width_flipped)
 {
-    return in_bounds_crop_window<uint32_t>(input, output, output_ptr, input_offset,
-                                           window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+    return in_bounds_crop_window<uint32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+                                           output_width_limit, input_has_single_channel, is_width_flipped);
 }
 
-void s8_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
-                              int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void s8_in_bounds_crop_window(const ITensor *input,
+                              const ITensor *output,
+                              float         *output_ptr,
+                              Coordinates    input_offset,
+                              int32_t        window_step_x,
+                              int32_t        output_width_start,
+                              int32_t        output_width_limit,
+                              bool           input_has_single_channel,
+                              bool           is_width_flipped)
 {
-    return in_bounds_crop_window<int8_t>(input, output, output_ptr, input_offset,
-                                         window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+    return in_bounds_crop_window<int8_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+                                         output_width_limit, input_has_single_channel, is_width_flipped);
 }
 
-void s16_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
-                               int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void s16_in_bounds_crop_window(const ITensor *input,
+                               const ITensor *output,
+                               float         *output_ptr,
+                               Coordinates    input_offset,
+                               int32_t        window_step_x,
+                               int32_t        output_width_start,
+                               int32_t        output_width_limit,
+                               bool           input_has_single_channel,
+                               bool           is_width_flipped)
 {
-    return in_bounds_crop_window<int16_t>(input, output, output_ptr, input_offset,
-                                          window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+    return in_bounds_crop_window<int16_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+                                          output_width_limit, input_has_single_channel, is_width_flipped);
 }
 
-void s32_in_bounds_crop_window(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset,
-                               int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+void s32_in_bounds_crop_window(const ITensor *input,
+                               const ITensor *output,
+                               float         *output_ptr,
+                               Coordinates    input_offset,
+                               int32_t        window_step_x,
+                               int32_t        output_width_start,
+                               int32_t        output_width_limit,
+                               bool           input_has_single_channel,
+                               bool           is_width_flipped)
 {
-    return in_bounds_crop_window<int32_t>(input, output, output_ptr, input_offset,
-                                          window_step_x, output_width_start, output_width_limit, input_has_single_channel, is_width_flipped);
+    return in_bounds_crop_window<int32_t>(input, output, output_ptr, input_offset, window_step_x, output_width_start,
+                                          output_width_limit, input_has_single_channel, is_width_flipped);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/crop/list.h b/src/cpu/kernels/crop/list.h
index a6b8321..9cb7726 100644
--- a/src/cpu/kernels/crop/list.h
+++ b/src/cpu/kernels/crop/list.h
@@ -26,8 +26,9 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/kernels/crop/generic/neon/impl.h"
 
 namespace arm_compute
@@ -36,7 +37,8 @@
 {
 #define DECLARE_CROP_KERNEL(func_name)                                                                       \
     void func_name(const ITensor *input, const ITensor *output, float *output_ptr, Coordinates input_offset, \
-                   int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit, bool input_has_single_channel, bool is_width_flipped)
+                   int32_t window_step_x, int32_t output_width_start, int32_t output_width_limit,            \
+                   bool input_has_single_channel, bool is_width_flipped)
 
 DECLARE_CROP_KERNEL(fp16_in_bounds_crop_window);
 DECLARE_CROP_KERNEL(fp32_in_bounds_crop_window);
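Two more rules are visible in list.h. First, the include block is re-sorted and regrouped: Registrars.h now precedes src/core/NEON/wrapper/wrapper.h, which only happens under a case-insensitive sort ("common" < "NEON" once case is ignored), and a blank line now separates the public arm_compute/ headers from the internal src/ ones; SortIncludes: CaseInsensitive with regrouped IncludeCategories would produce exactly this, though the concrete option values are an assumption. Second, the continuation backslashes of DECLARE_CROP_KERNEL are re-aligned to a single column, as AlignEscapedNewlines does. The include ordering, sketched:

    #include "arm_compute/core/Helpers.h"
    #include "arm_compute/core/TensorInfo.h"

    #include "src/core/common/Registrars.h"    // 'common' sorts before 'NEON' case-insensitively
    #include "src/core/NEON/wrapper/wrapper.h"
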
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp
index e85a166..293e606 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp16.cpp
@@ -29,11 +29,16 @@
 {
 namespace cpu
 {
-void neon_fp16_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
-                                    ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_fp16_deptwiseconv2dnative(const ITensor         *src,
+                                    const ITensor         *weights,
+                                    const ITensor         *bias,
+                                    ITensor               *dst,
+                                    const Window          &window,
+                                    bool                   has_biases,
+                                    const ConvolutionInfo &info)
 {
     return run_depthwise_float<float16_t, float16_t>(src, weights, bias, dst, window, has_biases, info);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
 #endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp
index b2333a3..c6fa479 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/fp32.cpp
@@ -26,10 +26,15 @@
 {
 namespace cpu
 {
-void neon_fp32_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
-                                    ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_fp32_deptwiseconv2dnative(const ITensor         *src,
+                                    const ITensor         *weights,
+                                    const ITensor         *bias,
+                                    ITensor               *dst,
+                                    const Window          &window,
+                                    bool                   has_biases,
+                                    const ConvolutionInfo &info)
 {
     return run_depthwise_float<float, float>(src, weights, bias, dst, window, has_biases, info);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
index a2ae556..d08e973 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.cpp
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h"
+
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/function_info/ConvolutionInfo.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
@@ -65,8 +67,16 @@
 namespace
 {
 template <typename T, typename TW>
-void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
-                                          const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
+void depthwise_loop_multiplier1_quantized(const ITensor       *src,
+                                          const ITensor       *weights,
+                                          const ITensor       *biases,
+                                          ITensor             *dst,
+                                          const PadStrideInfo &conv_info,
+                                          const Size2D        &dilation,
+                                          std::vector<int>     output_multiplier,
+                                          std::vector<int>     output_shift,
+                                          const Window        &window,
+                                          bool                 has_biases) // NOLINT
 {
     ARM_COMPUTE_UNUSED(output_multiplier, output_shift);
     constexpr auto element_per_vector = vector_size / sizeof(T);
@@ -75,7 +85,8 @@
     using AccType                     = int32_t;
     using AccArrayType                = std::array<AccType, element_per_vector>;
 
-    const auto out_of_bound_value  = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
+    const auto out_of_bound_value =
+        PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
     const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
 
     const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);
@@ -104,152 +115,175 @@
     Iterator output_it(dst, win_output);
     Iterator biases_it{};
 
-    if(has_biases)
+    if (has_biases)
     {
         biases_it = Iterator(biases, win_weights);
     }
 
-    execute_window_loop(execution_window, [&](const Coordinates & id)
-    {
-        const int32_t input_y           = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
-        const int32_t input_z           = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
-        const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-        auto const    base_weights_ptr  = weights_it.ptr();
-        size_t        x                 = run_info.x_start;
-
-        for(; x < run_info.x_leftover_start; x += run_info.x_step)
+    execute_window_loop(
+        execution_window,
+        [&](const Coordinates &id)
         {
-            AccArrayType acc{};
-            AccArrayType in_sum{};
-            AccArrayType we_sum{};
+            const int32_t input_y           = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+            const int32_t input_z           = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+            const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+            auto const    base_weights_ptr  = weights_it.ptr();
+            size_t        x                 = run_info.x_start;
 
-            auto weights_ptr  = base_weights_ptr;
-            auto input_offset = base_input_offset;
-
-            for(size_t h = 0; h < run_info.weights_height; ++h)
+            for (; x < run_info.x_leftover_start; x += run_info.x_step)
             {
-                int64_t offs = input_offset + x * sizeof(T);
-                for(size_t w = 0; w < run_info.weights_width; ++w)
-                {
-                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
-                    const auto input_vals      = is_valid_region ?
-                                                 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
-                                                 out_of_bound_vector;
-                    const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+                AccArrayType acc{};
+                AccArrayType in_sum{};
+                AccArrayType we_sum{};
 
-                    for(size_t i = 0; i < element_per_vector; ++i)
+                auto weights_ptr  = base_weights_ptr;
+                auto input_offset = base_input_offset;
+
+                for (size_t h = 0; h < run_info.weights_height; ++h)
+                {
+                    int64_t offs = input_offset + x * sizeof(T);
+                    for (size_t w = 0; w < run_info.weights_width; ++w)
                     {
-                        acc.at(i) += input_vals[i] * weights_vals[i];
-                        in_sum.at(i) += input_vals[i];
-                        we_sum.at(i) += weights_vals[i];
+                        const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+                        const auto input_vals =
+                            is_valid_region
+                                ? wrapper::vload(reinterpret_cast<T *>(
+                                      input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)))
+                                : out_of_bound_vector;
+                        const auto weights_vals =
+                            wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+                        for (size_t i = 0; i < element_per_vector; ++i)
+                        {
+                            acc.at(i) += input_vals[i] * weights_vals[i];
+                            in_sum.at(i) += input_vals[i];
+                            we_sum.at(i) += weights_vals[i];
+                        }
+
+                        offs += dilation.x() * run_info.input_stride_y;
                     }
 
-                    offs += dilation.x() * run_info.input_stride_y;
+                    weights_ptr += run_info.weights_stride_z;
+                    input_offset += dilation.y() * run_info.input_stride_z;
                 }
 
-                weights_ptr += run_info.weights_stride_z;
-                input_offset += dilation.y() * run_info.input_stride_z;
+                VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
+                for (size_t i = 0; i < element_per_vector; ++i)
+                {
+                    acc.at(i) -= in_sum.at(i) * weights_qoffset;
+                    acc.at(i) -= we_sum.at(i) * input_qoffset;
+                    acc.at(i) += k_offset;
+
+                    if (has_biases)
+                    {
+                        acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
+                    }
+
+                    const int32_t out_mul   = output_multiplier.at(x + i);
+                    const int32_t out_shift = output_shift.at(x + i);
+                    if (out_shift < 0)
+                    {
+                        acc.at(i) =
+                            saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
+                    }
+                    else
+                    {
+                        acc.at(i) =
+                            rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) +
+                            output_qoffset;
+                    }
+                    out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
+                }
+
+                wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
             }
 
-            VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
-            for(size_t i = 0; i < element_per_vector; ++i)
+            // left-over
+            for (; x < run_info.x_end; ++x)
             {
-                acc.at(i) -= in_sum.at(i) * weights_qoffset;
-                acc.at(i) -= we_sum.at(i) * input_qoffset;
-                acc.at(i) += k_offset;
+                AccType acc    = 0;
+                AccType in_sum = 0;
+                AccType we_sum = 0;
 
-                if(has_biases)
+                auto weights_ptr  = base_weights_ptr;
+                auto input_offset = base_input_offset;
+
+                for (size_t h = 0; h < run_info.weights_height; ++h)
                 {
-                    acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
+                    int64_t offs = input_offset + x * sizeof(T);
+                    for (size_t w = 0; w < run_info.weights_width; ++w)
+                    {
+                        const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+                        const auto input_val =
+                            is_valid_region
+                                ? *reinterpret_cast<T *>(input_it.ptr() +
+                                                         std::min(static_cast<size_t>(offs), run_info.input_max_offset))
+                                : out_of_bound_value;
+                        const auto weights_val =
+                            *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+                        acc += input_val * weights_val;
+                        in_sum += input_val;
+                        we_sum += weights_val;
+
+                        offs += dilation.x() * run_info.input_stride_y;
+                    }
+
+                    weights_ptr += run_info.weights_stride_z;
+                    input_offset += dilation.y() * run_info.input_stride_z;
                 }
 
-                const int32_t out_mul   = output_multiplier.at(x + i);
-                const int32_t out_shift = output_shift.at(x + i);
-                if(out_shift < 0)
+                T out_vals{0};
+
+                acc -= in_sum * weights_qoffset;
+                acc -= we_sum * input_qoffset;
+                acc += k_offset;
+
+                if (has_biases)
                 {
-                    acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
+                    acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
+                }
+
+                const int32_t out_mul   = output_multiplier.at(x);
+                const int32_t out_shift = output_shift.at(x);
+
+                if (out_shift < 0)
+                {
+                    acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
                 }
                 else
                 {
-                    acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
-                }
-                out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
-            }
-
-            wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
-        }
-
-        // left-over
-        for(; x < run_info.x_end; ++x)
-        {
-            AccType acc    = 0;
-            AccType in_sum = 0;
-            AccType we_sum = 0;
-
-            auto weights_ptr  = base_weights_ptr;
-            auto input_offset = base_input_offset;
-
-            for(size_t h = 0; h < run_info.weights_height; ++h)
-            {
-                int64_t offs = input_offset + x * sizeof(T);
-                for(size_t w = 0; w < run_info.weights_width; ++w)
-                {
-                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
-                    const auto input_val       = is_valid_region ?
-                                                 *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
-                                                 out_of_bound_value;
-                    const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
-                    acc += input_val * weights_val;
-                    in_sum += input_val;
-                    we_sum += weights_val;
-
-                    offs += dilation.x() * run_info.input_stride_y;
+                    acc =
+                        rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
                 }
 
-                weights_ptr += run_info.weights_stride_z;
-                input_offset += dilation.y() * run_info.input_stride_z;
+                out_vals                                      = static_cast<T>(utility::clamp<AccType, T>(acc));
+                *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
             }
-
-            T out_vals{ 0 };
-
-            acc -= in_sum * weights_qoffset;
-            acc -= we_sum * input_qoffset;
-            acc += k_offset;
-
-            if(has_biases)
-            {
-                acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
-            }
-
-            const int32_t out_mul   = output_multiplier.at(x);
-            const int32_t out_shift = output_shift.at(x);
-
-            if(out_shift < 0)
-            {
-                acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
-            }
-            else
-            {
-                acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
-            }
-
-            out_vals                                      = static_cast<T>(utility::clamp<AccType, T>(acc));
-            *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
-        }
-    },
-    input_it, weights_it, biases_it, output_it);
+        },
+        input_it, weights_it, biases_it, output_it);
 }
 
 template <typename T, typename TW>
-void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
-                                      const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
+void depthwise_loop_generic_quantized(const ITensor       *src,
+                                      const ITensor       *weights,
+                                      const ITensor       *biases,
+                                      ITensor             *dst,
+                                      const PadStrideInfo &conv_info,
+                                      const Size2D        &dilation,
+                                      unsigned int         depth_multiplier,
+                                      std::vector<int>     output_multiplier,
+                                      std::vector<int>     output_shift,
+                                      const Window        &window,
+                                      bool                 has_biases) // NOLINT
 {
     using AccType = int32_t;
 
-    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+    const auto run_info =
+        DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
 
-    const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
+    const auto out_of_bound_value =
+        PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
 
     const int32_t input_qoffset   = src->info()->quantization_info().uniform().offset;
     const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
@@ -277,76 +311,93 @@
     Iterator output_it(dst, win_output);
     Iterator biases_it{};
 
-    if(has_biases)
+    if (has_biases)
     {
         biases_it = Iterator(biases, win_weights);
     }
 
-    execute_window_loop(execution_window, [&](const Coordinates & id)
-    {
-        std::vector<AccType> acc(depth_multiplier, 0);
-        std::vector<AccType> we_sum(depth_multiplier, 0);
-        AccType              in_sum = 0;
-
-        const int32_t input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
-        const int32_t input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
-        int64_t       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
-        auto weights_ptr = weights_it.ptr();
-        for(size_t h = 0; h < run_info.weights_height; ++h)
+    execute_window_loop(
+        execution_window,
+        [&](const Coordinates &id)
         {
-            int offs = input_offset;
-            for(size_t w = 0; w < run_info.weights_width; ++w)
+            std::vector<AccType> acc(depth_multiplier, 0);
+            std::vector<AccType> we_sum(depth_multiplier, 0);
+            AccType              in_sum = 0;
+
+            const int32_t input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+            const int32_t input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+            int64_t       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+
+            auto weights_ptr = weights_it.ptr();
+            for (size_t h = 0; h < run_info.weights_height; ++h)
             {
-                const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
-                const auto input_val       = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
-
-                for(size_t m = 0; m < depth_multiplier; ++m)
+                int offs = input_offset;
+                for (size_t w = 0; w < run_info.weights_width; ++w)
                 {
-                    const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
-                    acc.at(m) += input_val * weights_val;
+                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+                    const auto input_val =
+                        is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs),
+                                                                                            run_info.input_max_offset)))
+                                        : out_of_bound_value;
 
-                    we_sum.at(m) += weights_val;
+                    for (size_t m = 0; m < depth_multiplier; ++m)
+                    {
+                        const auto weights_val =
+                            *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+                        acc.at(m) += input_val * weights_val;
+
+                        we_sum.at(m) += weights_val;
+                    }
+
+                    offs += dilation.x() * run_info.input_stride_y;
+                    in_sum += input_val;
                 }
 
-                offs += dilation.x() * run_info.input_stride_y;
-                in_sum += input_val;
+                weights_ptr += run_info.weights_stride_z;
+                input_offset += dilation.y() * run_info.input_stride_z;
             }
 
-            weights_ptr += run_info.weights_stride_z;
-            input_offset += dilation.y() * run_info.input_stride_z;
-        }
-
-        for(size_t m = 0; m < depth_multiplier; ++m)
-        {
-            acc.at(m) -= in_sum * weights_qoffset;
-            acc.at(m) -= we_sum.at(m) * input_qoffset;
-            acc.at(m) += k_offset;
-
-            if(has_biases)
+            for (size_t m = 0; m < depth_multiplier; ++m)
             {
-                acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
-            }
+                acc.at(m) -= in_sum * weights_qoffset;
+                acc.at(m) -= we_sum.at(m) * input_qoffset;
+                acc.at(m) += k_offset;
 
-            const int32_t out_mul   = output_multiplier.at(id.x() * depth_multiplier + m);
-            const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
-            if(out_shift < 0)
-            {
-                acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
+                if (has_biases)
+                {
+                    acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+                }
+
+                const int32_t out_mul   = output_multiplier.at(id.x() * depth_multiplier + m);
+                const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
+                if (out_shift < 0)
+                {
+                    acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
+                }
+                else
+                {
+                    acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) +
+                                output_qoffset;
+                }
+                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) =
+                    static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
             }
-            else
-            {
-                acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
-            }
-            *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
-        }
-    },
-    input_it, weights_it, biases_it, output_it);
+        },
+        input_it, weights_it, biases_it, output_it);
 }
 
 template <typename T, typename TW>
-void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
-                                              const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases) // NOLINT
+void depthwise_loop_pow2_quantized_per_tensor(const ITensor       *src,
+                                              const ITensor       *weights,
+                                              const ITensor       *biases,
+                                              ITensor             *dst,
+                                              const PadStrideInfo &conv_info,
+                                              const Size2D        &dilation,
+                                              unsigned int         depth_multiplier,
+                                              std::vector<int>     output_multiplier,
+                                              std::vector<int>     output_shift,
+                                              const Window        &window,
+                                              bool                 has_biases) // NOLINT
 {
     constexpr int half_vec = vector_size / 2;
 
@@ -355,11 +406,15 @@
     using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
     using TagType          = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
 
-    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+    const auto run_info =
+        DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
 
-    const auto input_qoffset_vec   = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
-    const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
-    const auto output_qoffset_vec  = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset, arm_compute::wrapper::traits::vector_128_tag{});
+    const auto input_qoffset_vec = wrapper::vreinterpret(
+        wrapper::vmovl(wrapper::vdup_n(static_cast<T>(src->info()->quantization_info().uniform().offset), TagType{})));
+    const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(
+        wrapper::vdup_n(static_cast<TW>(weights->info()->quantization_info().uniform().offset), TagType{})));
+    const auto output_qoffset_vec  = wrapper::vdup_n(dst->info()->quantization_info().uniform().offset,
+                                                     arm_compute::wrapper::traits::vector_128_tag{});
 
     const auto lower = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::lowest()), AccVectorTagType{});
     const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
@@ -389,7 +444,7 @@
     Iterator output_it(dst, win_output);
     Iterator biases_it{};
 
-    if(has_biases)
+    if (has_biases)
     {
         biases_it = Iterator(biases, win_weights);
     }
@@ -397,95 +452,117 @@
     std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
     std::vector<AccVectorType> acc1(depth_multiplier / vector_size);
 
-    execute_window_loop(execution_window, [&](const Coordinates & id)
-    {
-        std::fill(begin(acc0), end(acc0), zero);
-        std::fill(begin(acc1), end(acc1), zero);
-
-        const int32_t input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
-        const int32_t input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
-        int64_t       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
-        auto weights_ptr = weights_it.ptr();
-        for(size_t h = 0; h < run_info.weights_height; ++h)
+    execute_window_loop(
+        execution_window,
+        [&](const Coordinates &id)
         {
-            const int32_t current_h = input_z + h * dilation.y();
-            if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
+            std::fill(begin(acc0), end(acc0), zero);
+            std::fill(begin(acc1), end(acc1), zero);
+
+            const int32_t input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+            const int32_t input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+            int64_t       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+
+            auto weights_ptr = weights_it.ptr();
+            for (size_t h = 0; h < run_info.weights_height; ++h)
             {
-                int offs = input_offset;
-                for(size_t w = 0; w < run_info.weights_width; ++w)
+                const int32_t current_h = input_z + h * dilation.y();
+                if (current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
                 {
-                    const int32_t current_w = input_y + w * dilation.x();
-                    if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
+                    int offs = input_offset;
+                    for (size_t w = 0; w < run_info.weights_width; ++w)
                     {
-                        const auto input_8x8     = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
-                        const auto input_s16x8   = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
-                        const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
-
-                        for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
+                        const int32_t current_w = input_y + w * dilation.x();
+                        if (current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
                         {
-                            const auto weights_8x8     = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
-                            const auto weights_s16x8   = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
-                            const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
+                            const auto input_8x8 = wrapper::vdup_n(
+                                *(reinterpret_cast<T *>(
+                                    input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))),
+                                TagType{});
+                            const auto input_s16x8   = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
+                            const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);
 
-                            acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));
-                            acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));
+                            for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
+                            {
+                                const auto weights_8x8     = wrapper::vload(reinterpret_cast<TW *>(
+                                    weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+                                const auto weights_s16x8   = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
+                                const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);
+
+                                acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs),
+                                                            wrapper::vgetlow(weights_no_offs));
+                                acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs),
+                                                            wrapper::vgethigh(weights_no_offs));
+                            }
                         }
-                    }
 
-                    offs += dilation.x() * run_info.input_stride_y;
+                        offs += dilation.x() * run_info.input_stride_y;
+                    }
+                }
+
+                weights_ptr += run_info.weights_stride_z;
+                input_offset += dilation.y() * run_info.input_stride_z;
+            }
+
+            for (size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
+            {
+                if (has_biases)
+                {
+                    const auto bias_val0 =
+                        wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
+                    const auto bias_val1 = wrapper::vloadq(
+                        reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
+
+                    acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
+                    acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
+                }
+
+                if (out_shift < 0)
+                {
+                    acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul),
+                                               output_qoffset_vec);
+                    acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul),
+                                               output_qoffset_vec);
+                }
+                else
+                {
+                    acc0.at(i) = wrapper::vadd(
+                        rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift),
+                        output_qoffset_vec);
+                    acc1.at(i) = wrapper::vadd(
+                        rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift),
+                        output_qoffset_vec);
+                }
+
+                acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
+                acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
+
+                const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)), wrapper::vmovn(acc1.at(i)));
+
+                if (std::is_same<T, uint8_t>::value)
+                {
+                    wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)),
+                                    wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
+                }
+                else
+                {
+                    wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)),
+                                    wrapper::vqmovn(out_val));
                 }
             }
-
-            weights_ptr += run_info.weights_stride_z;
-            input_offset += dilation.y() * run_info.input_stride_z;
-        }
-
-        for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
-        {
-            if(has_biases)
-            {
-                const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
-                const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));
-
-                acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
-                acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
-            }
-
-            if(out_shift < 0)
-            {
-                acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
-                acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
-            }
-            else
-            {
-                acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
-                acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
-            }
-
-            acc0.at(i) = wrapper::vmin(wrapper::vmax(acc0.at(i), lower), upper);
-            acc1.at(i) = wrapper::vmin(wrapper::vmax(acc1.at(i), lower), upper);
-
-            const auto out_val = wrapper::vcombine(wrapper::vmovn(acc0.at(i)),
-                                                   wrapper::vmovn(acc1.at(i)));
-
-            if(std::is_same<T, uint8_t>::value)
-            {
-                wrapper::vstore(reinterpret_cast<uint8_t *>(output_it.ptr() + m * sizeof(uint8_t)), wrapper::vqmovn(vreinterpretq_u16_s16(out_val)));
-            }
-            else
-            {
-                wrapper::vstore(reinterpret_cast<int8_t *>(output_it.ptr() + m * sizeof(int8_t)), wrapper::vqmovn(out_val));
-            }
-        }
-    },
-    input_it, weights_it, biases_it, output_it);
+        },
+        input_it, weights_it, biases_it, output_it);
 }
 } // namespace
 
 template <typename T, typename TW>
-void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases,
-                                  ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void run_depthwise_quanitized8bit(const ITensor         *src,
+                                  const ITensor         *weights,
+                                  const ITensor         *biases,
+                                  ITensor               *dst,
+                                  const Window          &window,
+                                  bool                   has_biases,
+                                  const ConvolutionInfo &info)
 {
     PadStrideInfo    conv_info        = info.pad_stride_info;
     unsigned int     depth_multiplier = info.depth_multiplier;
@@ -497,15 +574,15 @@
     const auto output_scale  = dst->info()->quantization_info().uniform().scale;
     auto       weights_scale = weights->info()->quantization_info().scale();
 
-    if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
+    if (!is_data_type_quantized_per_channel(weights->info()->data_type()))
     {
-        for(size_t i = 1; i < weights->info()->dimension(channel_idx); ++i)
+        for (size_t i = 1; i < weights->info()->dimension(channel_idx); ++i)
         {
             weights_scale.push_back(weights_scale.front());
         }
     }
 
-    for(const auto &s : weights_scale)
+    for (const auto &s : weights_scale)
     {
         int32_t     out_mult   = 0;
         int32_t     out_shift  = 0;
@@ -516,30 +593,49 @@
         output_shift.push_back(out_shift);
     }
 
-    if(depth_multiplier == 1)
+    if (depth_multiplier == 1)
     {
-        depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier, output_shift, window, has_biases);
+        depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier,
+                                                    output_shift, window, has_biases);
     }
     else
     {
         const bool is_pow2                 = ((depth_multiplier & (depth_multiplier - 1)) == 0);
         const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));
 
-        if(is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8)
+        if (is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8)
         {
-            depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
+            depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation,
+                                                            depth_multiplier, output_multiplier, output_shift, window,
+                                                            has_biases);
         }
         else
         {
-            depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
+            depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier,
+                                                    output_multiplier, output_shift, window, has_biases);
         }
     }
 }
-template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
-                                                             ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
-template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
-                                                           ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
-template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
-                                                            ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<uint8_t, uint8_t>(const ITensor         *src,
+                                                             const ITensor         *weights,
+                                                             const ITensor         *biases,
+                                                             ITensor               *dst,
+                                                             const Window          &window,
+                                                             bool                   has_biases,
+                                                             const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<int8_t, int8_t>(const ITensor         *src,
+                                                           const ITensor         *weights,
+                                                           const ITensor         *biases,
+                                                           ITensor               *dst,
+                                                           const Window          &window,
+                                                           bool                   has_biases,
+                                                           const ConvolutionInfo &info);
+template void run_depthwise_quanitized8bit<uint8_t, int8_t>(const ITensor         *src,
+                                                            const ITensor         *weights,
+                                                            const ITensor         *biases,
+                                                            ITensor               *dst,
+                                                            const Window          &window,
+                                                            bool                   has_biases,
+                                                            const ConvolutionInfo &info);
 } // namespace cpu
 } // namespace arm_compute
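Note for reviewers: the requantization reflowed above follows the usual gemmlowp fixed-point scheme: each int32 accumulator is scaled by (out_mul, out_shift), offset by the output zero point, then clamped and narrowed to 8 bits. A minimal scalar sketch, assuming saturating_doubling_high_mul and rounding_divide_by_exp2 carry the standard gemmlowp semantics; requantize() is illustrative only and not part of this patch:

#include <cstdint>

static int32_t saturating_doubling_high_mul(int32_t a, int32_t b)
{
    // High 32 bits of 2*a*b with rounding, saturating the single overflow case.
    if (a == INT32_MIN && b == INT32_MIN)
        return INT32_MAX;
    const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int64_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    return static_cast<int32_t>((ab + nudge) / (int64_t(1) << 31));
}

static int32_t rounding_divide_by_exp2(int32_t x, int exponent)
{
    // Round-to-nearest division by 2^exponent.
    const int32_t mask      = (int32_t(1) << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
    return (x >> exponent) + (remainder > threshold ? 1 : 0);
}

static int32_t requantize(int32_t acc, int32_t out_mul, int out_shift, int32_t out_offset)
{
    // A negative shift multiplies by 2^(-shift) before the doubling-high-mul;
    // a non-negative shift divides (with rounding) after it, as in the kernel.
    const int32_t v = out_shift < 0
                          ? saturating_doubling_high_mul(acc * (1 << -out_shift), out_mul)
                          : rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift);
    return v + out_offset; // then clamped to [lower, upper] and narrowed to 8 bits
}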
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
index 8410cdb..3fa5c58 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/impl.h
@@ -24,6 +24,7 @@
 #ifndef SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H
 #define SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
@@ -63,15 +64,21 @@
     const size_t   input_width;
     const size_t   input_depth;
 
-    DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1) // NOLINT
-        : num_read_elements_per_iteration((depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
+    DepthwiseConvolutionRunInfo(const ITensorInfo   &input,
+                                const ITensorInfo   &weights,
+                                const PadStrideInfo &conv_info,
+                                const Window        &w,
+                                uint32_t             depth_multiplier = 1) // NOLINT
+        : num_read_elements_per_iteration(
+              (depth_multiplier == 1 ? (vector_size / element_size_from_data_type(input.data_type())) : 1)),
           x_start(w.x().start()),
           x_end(w.x().end()),
           x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
           x_leftover_start(std::max(static_cast<int32_t>(w.x().end() + 1) - static_cast<int32_t>(x_step), int32_t(0))),
           input_stride_y(input.strides_in_bytes().y()),
           input_stride_z(input.strides_in_bytes().z()),
-          input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
+          input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) -
+                           (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
           weights_width(weights.dimension(width_idx)),
           weights_height(weights.dimension(height_idx)),
           weights_stride_y(weights.strides_in_bytes().y()),
@@ -87,7 +94,12 @@
     }
 };
 
-inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
+inline bool is_valid_input_region(int32_t                            base_w,
+                                  uint32_t                           base_h,
+                                  uint32_t                           w,
+                                  uint32_t                           h,
+                                  const DepthwiseConvolutionRunInfo &run_info,
+                                  const Size2D                      &dilation)
 {
     const int32_t current_h  = base_h + h * dilation.y();
     const bool    is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
@@ -99,8 +111,14 @@
 }
 
 template <typename T>
-void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
-                                   const Size2D &dilation, const Window &window, bool has_biases)
+void depthwise_loop_multiplier1_fp(const ITensor       *src,
+                                   const ITensor       *weights,
+                                   const ITensor       *biases,
+                                   ITensor             *dst,
+                                   const PadStrideInfo &conv_info,
+                                   const Size2D        &dilation,
+                                   const Window        &window,
+                                   bool                 has_biases)
 {
     constexpr auto element_per_vector = vector_size / sizeof(T);
     using VectorType                  = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
@@ -129,94 +147,112 @@
     Iterator output_it(dst, win_output);
     Iterator biases_it{};
 
-    if(has_biases)
+    if (has_biases)
     {
         biases_it = Iterator(biases, win_weights);
     }
 
-    execute_window_loop(execution_window, [&](const Coordinates & id)
-    {
-        const int32_t input_y           = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
-        const int32_t input_z           = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
-        const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
-        auto const base_weights_ptr = weights_it.ptr();
-        uint32_t   x                = run_info.x_start;
-
-        for(; x < run_info.x_leftover_start; x += run_info.x_step)
+    execute_window_loop(
+        execution_window,
+        [&](const Coordinates &id)
         {
-            VectorType acc          = zero_vector;
-            auto       weights_ptr  = base_weights_ptr;
-            int64_t    input_offset = base_input_offset;
+            const int32_t input_y           = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+            const int32_t input_z           = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+            const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
 
-            for(uint32_t h = 0; h < run_info.weights_height; ++h)
+            auto const base_weights_ptr = weights_it.ptr();
+            uint32_t   x                = run_info.x_start;
+
+            for (; x < run_info.x_leftover_start; x += run_info.x_step)
             {
-                int64_t offs = input_offset + x * sizeof(T);
-                for(uint32_t w = 0; w < run_info.weights_width; ++w)
-                {
-                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
-                    const auto input_vals      = is_valid_region ?
-                                                 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
-                                                 zero_vector;
-                    const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
-                    acc                     = wrapper::vmla(acc, weights_vals, input_vals);
+                VectorType acc          = zero_vector;
+                auto       weights_ptr  = base_weights_ptr;
+                int64_t    input_offset = base_input_offset;
 
-                    offs += dilation.x() * run_info.input_stride_y;
+                for (uint32_t h = 0; h < run_info.weights_height; ++h)
+                {
+                    int64_t offs = input_offset + x * sizeof(T);
+                    for (uint32_t w = 0; w < run_info.weights_width; ++w)
+                    {
+                        const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+                        const auto input_vals =
+                            is_valid_region
+                                ? wrapper::vload(reinterpret_cast<T *>(
+                                      input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)))
+                                : zero_vector;
+                        const auto weights_vals =
+                            wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
+                        acc = wrapper::vmla(acc, weights_vals, input_vals);
+
+                        offs += dilation.x() * run_info.input_stride_y;
+                    }
+
+                    weights_ptr += run_info.weights_stride_z;
+                    input_offset += dilation.y() * run_info.input_stride_z;
                 }
 
-                weights_ptr += run_info.weights_stride_z;
-                input_offset += dilation.y() * run_info.input_stride_z;
-            }
-
-            if(has_biases)
-            {
-                const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
-                acc                    = wrapper::vadd(acc, biases_vals);
-            }
-
-            wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
-        }
-
-        for(; x < run_info.x_end; ++x)
-        {
-            auto    acc_scalar   = T{ 0 };
-            auto    weights_ptr  = base_weights_ptr;
-            int64_t input_offset = base_input_offset;
-
-            for(size_t h = 0; h < run_info.weights_height; ++h)
-            {
-                int64_t offs = input_offset + x * sizeof(T);
-                for(size_t w = 0; w < run_info.weights_width; ++w)
+                if (has_biases)
                 {
-                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
-                    const auto input_vals      = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
-                    const auto weights_vals    = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
-
-                    acc_scalar += (input_vals * weights_vals);
-
-                    offs += dilation.x() * run_info.input_stride_y;
+                    const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
+                    acc                    = wrapper::vadd(acc, biases_vals);
                 }
 
-                weights_ptr += run_info.weights_stride_z;
-                input_offset += dilation.y() * run_info.input_stride_z;
+                wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
             }
 
-            if(has_biases)
+            for (; x < run_info.x_end; ++x)
             {
-                const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
-                acc_scalar += biases_vals;
+                auto    acc_scalar   = T{0};
+                auto    weights_ptr  = base_weights_ptr;
+                int64_t input_offset = base_input_offset;
+
+                for (size_t h = 0; h < run_info.weights_height; ++h)
+                {
+                    int64_t offs = input_offset + x * sizeof(T);
+                    for (size_t w = 0; w < run_info.weights_width; ++w)
+                    {
+                        const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+                        const auto input_vals =
+                            is_valid_region
+                                ? *reinterpret_cast<T *>(input_it.ptr() +
+                                                         std::min(static_cast<size_t>(offs), run_info.input_max_offset))
+                                : 0;
+                        const auto weights_vals =
+                            *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
+
+                        acc_scalar += (input_vals * weights_vals);
+
+                        offs += dilation.x() * run_info.input_stride_y;
+                    }
+
+                    weights_ptr += run_info.weights_stride_z;
+                    input_offset += dilation.y() * run_info.input_stride_z;
+                }
+
+                if (has_biases)
+                {
+                    const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
+                    acc_scalar += biases_vals;
+                }
+                *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
             }
-            *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
-        }
-    },
-    input_it, weights_it, biases_it, output_it);
+        },
+        input_it, weights_it, biases_it, output_it);
 }
 
 template <typename T>
-void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
-                               const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
+void depthwise_loop_generic_fp(const ITensor       *src,
+                               const ITensor       *weights,
+                               const ITensor       *biases,
+                               ITensor             *dst,
+                               const PadStrideInfo &conv_info,
+                               const Size2D        &dilation,
+                               unsigned int         depth_multiplier,
+                               const Window        &window,
+                               bool                 has_biases)
 {
-    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
+    const auto run_info =
+        DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);
 
     Window execution_window = window;
     execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
@@ -240,81 +276,98 @@
     Iterator output_it(dst, win_output);
     Iterator biases_it{};
 
-    if(has_biases)
+    if (has_biases)
     {
         biases_it = Iterator(biases, win_weights);
     }
 
-    execute_window_loop(execution_window, [&](const Coordinates & id)
-    {
-        std::vector<T> acc(depth_multiplier, static_cast<T>(0));
-
-        const int input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
-        const int input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
-        int       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
-
-        auto weights_ptr = weights_it.ptr();
-        for(size_t h = 0; h < run_info.weights_height; ++h)
+    execute_window_loop(
+        execution_window,
+        [&](const Coordinates &id)
         {
-            int offs = input_offset;
-            for(size_t w = 0; w < run_info.weights_width; ++w)
-            {
-                const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
-                const auto input_val       = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
+            std::vector<T> acc(depth_multiplier, static_cast<T>(0));
 
-                for(size_t m = 0; m < depth_multiplier; ++m)
+            const int input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
+            const int input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
+            int       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
+
+            auto weights_ptr = weights_it.ptr();
+            for (size_t h = 0; h < run_info.weights_height; ++h)
+            {
+                int offs = input_offset;
+                for (size_t w = 0; w < run_info.weights_width; ++w)
                 {
-                    const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
-                    acc.at(m)              = support::cpp11::fma(weights_val, input_val, acc.at(m));
+                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
+                    const auto input_val =
+                        is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs),
+                                                                                            run_info.input_max_offset)))
+                                        : T(0);
+
+                    for (size_t m = 0; m < depth_multiplier; ++m)
+                    {
+                        const auto weights_val =
+                            *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
+                        acc.at(m) = support::cpp11::fma(weights_val, input_val, acc.at(m));
+                    }
+
+                    offs += dilation.x() * run_info.input_stride_y;
                 }
 
-                offs += dilation.x() * run_info.input_stride_y;
+                weights_ptr += run_info.weights_stride_z;
+                input_offset += dilation.y() * run_info.input_stride_z;
             }
 
-            weights_ptr += run_info.weights_stride_z;
-            input_offset += dilation.y() * run_info.input_stride_z;
-        }
-
-        if(has_biases)
-        {
-            for(size_t m = 0; m < depth_multiplier; ++m)
+            if (has_biases)
             {
-                const auto biases_val                                     = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
-                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
+                for (size_t m = 0; m < depth_multiplier; ++m)
+                {
+                    const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
+                    *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
+                }
             }
-        }
-        else
-        {
-            for(size_t m = 0; m < depth_multiplier; ++m)
+            else
             {
-                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
+                for (size_t m = 0; m < depth_multiplier; ++m)
+                {
+                    *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
+                }
             }
-        }
-    },
-    input_it, weights_it, biases_it, output_it);
+        },
+        input_it, weights_it, biases_it, output_it);
 }
 
 template <typename T, typename TW>
-void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases,
-                         ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void run_depthwise_float(const ITensor         *src,
+                         const ITensor         *weights,
+                         const ITensor         *biases,
+                         ITensor               *dst,
+                         const Window          &window,
+                         bool                   has_biases,
+                         const ConvolutionInfo &info)
 {
     PadStrideInfo conv_info        = info.pad_stride_info;
     unsigned int  depth_multiplier = info.depth_multiplier;
     Size2D        dilation         = info.dilation;
 
-    if(depth_multiplier == 1)
+    if (depth_multiplier == 1)
     {
         depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, conv_info, dilation, window, has_biases);
     }
     else
     {
-        depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, has_biases);
+        depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window,
+                                     has_biases);
     }
 }
 
 template <typename T, typename TW>
-void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases,
-                                  ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
+void run_depthwise_quanitized8bit(const ITensor         *src,
+                                  const ITensor         *weights,
+                                  const ITensor         *biases,
+                                  ITensor               *dst,
+                                  const Window          &window,
+                                  bool                   has_biases,
+                                  const ConvolutionInfo &info);
 
 } // namespace cpu
 } // namespace arm_compute
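For context on the is_valid_input_region helper reformatted above: every kernel tap is bounds-checked against the input extents with dilation applied per tap, and out-of-range offsets are additionally clamped to run_info.input_max_offset before any load so guarded reads stay inside the tensor's allocation. A self-contained sketch of the border test under the same assumptions (tap_in_bounds is a hypothetical name used only for illustration):

#include <cstdint>

static bool tap_in_bounds(int32_t base_w, int32_t base_h, uint32_t w, uint32_t h,
                          int32_t input_width, int32_t input_height,
                          uint32_t dilation_x, uint32_t dilation_y)
{
    // Position of kernel tap (w, h) in input coordinates, including dilation.
    const int32_t current_h = base_h + static_cast<int32_t>(h * dilation_y);
    const int32_t current_w = base_w + static_cast<int32_t>(w * dilation_x);
    return current_h >= 0 && current_h < input_height && current_w >= 0 && current_w < input_width;
}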
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp
index 1bf7ad7..d32847c 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8.cpp
@@ -26,16 +26,26 @@
 {
 namespace cpu
 {
-void neon_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
-                                   ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_qu8_deptwiseconv2dnative(const ITensor         *src,
+                                   const ITensor         *weights,
+                                   const ITensor         *bias,
+                                   ITensor               *dst,
+                                   const Window          &window,
+                                   bool                   has_biases,
+                                   const ConvolutionInfo &info)
 {
     return run_depthwise_quanitized8bit<uint8_t, uint8_t>(src, weights, bias, dst, window, has_biases, info);
 }
 
-void neon_qp8_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
-                                       ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_qp8_qu8_deptwiseconv2dnative(const ITensor         *src,
+                                       const ITensor         *weights,
+                                       const ITensor         *bias,
+                                       ITensor               *dst,
+                                       const Window          &window,
+                                       bool                   has_biases,
+                                       const ConvolutionInfo &info)
 {
     return run_depthwise_quanitized8bit<uint8_t, int8_t>(src, weights, bias, dst, window, has_biases, info);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp
index 58f7536..682fad0 100644
--- a/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/depthwiseconv2d/generic/neon/qasymm8_signed.cpp
@@ -26,16 +26,26 @@
 {
 namespace cpu
 {
-void neon_qs8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
-                                   ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_qs8_deptwiseconv2dnative(const ITensor         *src,
+                                   const ITensor         *weights,
+                                   const ITensor         *bias,
+                                   ITensor               *dst,
+                                   const Window          &window,
+                                   bool                   has_biases,
+                                   const ConvolutionInfo &info)
 {
     return run_depthwise_quanitized8bit<int8_t, int8_t>(src, weights, bias, dst, window, has_biases, info);
 }
 
-void neon_qp8_qs8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias,
-                                       ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+void neon_qp8_qs8_deptwiseconv2dnative(const ITensor         *src,
+                                       const ITensor         *weights,
+                                       const ITensor         *bias,
+                                       ITensor               *dst,
+                                       const Window          &window,
+                                       bool                   has_biases,
+                                       const ConvolutionInfo &info)
 {
     return run_depthwise_quanitized8bit<int8_t, int8_t>(src, weights, bias, dst, window, has_biases, info);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/depthwiseconv2d/list.h b/src/cpu/kernels/depthwiseconv2d/list.h
index 44f055d..cf80608 100644
--- a/src/cpu/kernels/depthwiseconv2d/list.h
+++ b/src/cpu/kernels/depthwiseconv2d/list.h
@@ -27,9 +27,9 @@
 {
 namespace cpu
 {
-#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name)                                   \
-    void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, \
-                   ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
+#define DECLARE_DEPTHWISECONV2D_KERNEL(func_name)                                                 \
+    void func_name(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst, \
+                   const Window &window, bool has_biases, const ConvolutionInfo &info)
 DECLARE_DEPTHWISECONV2D_KERNEL(neon_qu8_deptwiseconv2dnative);
 DECLARE_DEPTHWISECONV2D_KERNEL(neon_qs8_deptwiseconv2dnative);
 DECLARE_DEPTHWISECONV2D_KERNEL(neon_fp16_deptwiseconv2dnative);
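The reflowed DECLARE_DEPTHWISECONV2D_KERNEL macro stamps out one declaration per kernel with this shared signature. As an illustration (a sketch relying on the library's ITensor, Window and ConvolutionInfo types), DECLARE_DEPTHWISECONV2D_KERNEL(neon_qu8_deptwiseconv2dnative); expands to:

void neon_qu8_deptwiseconv2dnative(const ITensor *src, const ITensor *weights, const ITensor *bias, ITensor *dst,
                                   const Window &window, bool has_biases, const ConvolutionInfo &info);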
diff --git a/src/cpu/kernels/directconv2d/list.h b/src/cpu/kernels/directconv2d/list.h
index 9a04726..5cbf7a3 100644
--- a/src/cpu/kernels/directconv2d/list.h
+++ b/src/cpu/kernels/directconv2d/list.h
@@ -32,8 +32,9 @@
 {
 namespace kernels
 {
-#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \
-    void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+#define DECLARE_DIRECT_CONV2D_KERNEL(func_name)                                                    \
+    void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, \
+                   const PadStrideInfo &conv_info)
 
 DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nhwc_directconv2d);
 DECLARE_DIRECT_CONV2D_KERNEL(neon_fp16_nchw_directconv2d);
diff --git a/src/cpu/kernels/directconv2d/nchw/all.cpp b/src/cpu/kernels/directconv2d/nchw/all.cpp
index a719fa5..218a4b7 100644
--- a/src/cpu/kernels/directconv2d/nchw/all.cpp
+++ b/src/cpu/kernels/directconv2d/nchw/all.cpp
@@ -22,18 +22,17 @@
  * SOFTWARE.
  */
 
-#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
-
-#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
 
 #include <algorithm>
 
@@ -44,22 +43,26 @@
 namespace kernels
 {
 template <typename T>
-void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+void convolve_nchw(
+    const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
 
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-void neon_fp16_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void neon_fp16_nchw_directconv2d(
+    const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
 {
     convolve_nchw<float16_t>(window, src, weights, dst, conv_info);
 }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
-void neon_fp32_nchw_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void neon_fp32_nchw_directconv2d(
+    const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
 {
     convolve_nchw<float>(window, src, weights, dst, conv_info);
 }
 
 template <typename T>
-void convolve_nchw(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void convolve_nchw(
+    const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_UNUSED(conv_info);
 
@@ -107,72 +110,81 @@
 
     constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
 
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        // We are computing the theoretical starting input starting points
-        const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left;
-        const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top;
-        const int in_w_end_t   = in_w_start_t + kernel_dim_w;
-        const int in_h_end_t   = in_h_start_t + kernel_dim_h;
-
-        // We are computing the valid initial and ending input points by checking the borders
-        const int in_w_start = std::max(in_w_start_t, 0);
-        const int in_h_start = std::max(in_h_start_t, 0);
-        const int in_w_end   = std::min(in_w_end_t, input_dim_w);
-        const int in_h_end   = std::min(in_h_end_t, input_dim_h);
-
-        // We use the input points to select the valid weight points to use
-        const int wei_w_start = in_w_start - in_w_start_t;
-        const int wei_h_start = in_h_start - in_h_start_t;
-        const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);
-
-        const int      index_c_end  = weights->info()->dimension(2);
-        const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
-        execute_window_loop(window_w, [&](const Coordinates & id_w)
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
         {
-            const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
-            uint8_t       *out_ptr           = out.ptr() + id_w[3] * output_stride_c;
-            T              out_temp          = static_cast<T>(0);
+            // We are computing the theoretical input starting points
+            const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left;
+            const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top;
+            const int in_w_end_t   = in_w_start_t + kernel_dim_w;
+            const int in_h_end_t   = in_h_start_t + kernel_dim_h;
 
-            for(int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c)
-            {
-                const T *const in_ptr_row_0      = in_ptr_start + index_in_c * input_stride_c;
-                const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c;
-                for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
+            // We are computing the valid initial and ending input points by checking the borders
+            const int in_w_start = std::max(in_w_start_t, 0);
+            const int in_h_start = std::max(in_h_start_t, 0);
+            const int in_w_end   = std::min(in_w_end_t, input_dim_w);
+            const int in_h_end   = std::min(in_h_end_t, input_dim_h);
+
+            // We use the input points to select the valid weight points to use
+            const int wei_w_start = in_w_start - in_w_start_t;
+            const int wei_h_start = in_h_start - in_h_start_t;
+            const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);
+
+            const int      index_c_end = weights->info()->dimension(2);
+            const T *const in_ptr_start =
+                reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+                id[3] * input_stride_n;
+            execute_window_loop(
+                window_w,
+                [&](const Coordinates &id_w)
                 {
-                    const T    *in_ptr_row      = in_ptr_row_0 + index_in_h * input_stride_h;
-                    const T    *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h;
-                    int         index_w         = in_w_start;
-                    int         index_wei_w     = wei_w_start;
-                    vector_type out_temp_vec    = wrapper::vdup_n(static_cast<T>(0), tag_type());
-                    for(; index_w <= ((in_w_end - num_elems_read_per_iteration)); index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration)
-                    {
-                        const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w);
-                        const auto w_vec   = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w);
-                        out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
-                    }
-                    out_temp += vreduce(out_temp_vec);
-                    for(; index_w < in_w_end; ++index_w, ++index_wei_w)
-                    {
-                        const auto src_val = *(in_ptr_row + index_w * input_stride_w);
-                        const auto w_val   = *(weights_ptr_row + index_wei_w * kernel_stride_w);
-                        out_temp += src_val * w_val;
-                    }
-                }
-            }
-            *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+                    const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+                    uint8_t       *out_ptr           = out.ptr() + id_w[3] * output_stride_c;
+                    T              out_temp          = static_cast<T>(0);
 
+                    for (int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c)
+                    {
+                        const T *const in_ptr_row_0      = in_ptr_start + index_in_c * input_stride_c;
+                        const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c;
+                        for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+                             ++index_wei_h, ++index_in_h)
+                        {
+                            const T    *in_ptr_row      = in_ptr_row_0 + index_in_h * input_stride_h;
+                            const T    *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h;
+                            int         index_w         = in_w_start;
+                            int         index_wei_w     = wei_w_start;
+                            vector_type out_temp_vec    = wrapper::vdup_n(static_cast<T>(0), tag_type());
+                            for (; index_w <= ((in_w_end - num_elems_read_per_iteration));
+                                 index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration)
+                            {
+                                const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w);
+                                const auto w_vec   = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w);
+                                out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+                            }
+                            out_temp += vreduce(out_temp_vec);
+                            for (; index_w < in_w_end; ++index_w, ++index_wei_w)
+                            {
+                                const auto src_val = *(in_ptr_row + index_w * input_stride_w);
+                                const auto w_val   = *(weights_ptr_row + index_wei_w * kernel_stride_w);
+                                out_temp += src_val * w_val;
+                            }
+                        }
+                    }
+                    *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+                },
+                wei);
         },
-        wei);
-    },
-    out);
+        out);
 }
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-template void convolve_nchw<float16_t>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+template void convolve_nchw<float16_t>(
+    const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
-template void convolve_nchw<float>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+template void convolve_nchw<float>(
+    const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
 
 } // namespace kernels
 } // namespace cpu
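The convolve_nchw inner loop reformatted above uses the common vectorize-then-tail pattern: multiply-accumulate full vectors across the width, horizontally reduce, then finish the remainder with scalar multiply-adds. A self-contained AArch64 NEON sketch of the same pattern, assuming unit element stride for brevity (dot_row is a hypothetical helper, not part of this patch):

#include <arm_neon.h>

static float dot_row(const float *x, const float *w, int n)
{
    float32x4_t acc_v = vdupq_n_f32(0.f);
    int         i     = 0;
    // Vector body: 4 lanes of fused multiply-accumulate per iteration.
    for (; i <= n - 4; i += 4)
        acc_v = vmlaq_f32(acc_v, vld1q_f32(w + i), vld1q_f32(x + i));
    float acc = vaddvq_f32(acc_v); // horizontal reduce (AArch64-only intrinsic)
    // Scalar tail for the leftover elements.
    for (; i < n; ++i)
        acc += x[i] * w[i];
    return acc;
}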
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
index 9982431..36a8e76 100644
--- a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp
@@ -30,10 +30,11 @@
 {
 namespace kernels
 {
-void neon_fp32_nhwc_directconv2d(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void neon_fp32_nhwc_directconv2d(
+    const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
 {
     convolve_nhwc<float>(window, src, weights, dst, conv_info);
 }
 } // namespace kernels
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
index 500ad1b..f235167 100644
--- a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp
@@ -24,16 +24,16 @@
 
 #include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"
 
-#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
-#include "src/core/NEON/wrapper/wrapper.h"
-
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <algorithm>
 
@@ -49,12 +49,14 @@
 {
 bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights)
 {
-    return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0);
+    return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 &&
+            weights->padding().right == 0);
 }
-}
+} // namespace
 
 template <typename T>
-void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
+void convolve_nhwc(
+    const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
 {
     // Declare useful types
     using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
@@ -97,7 +99,7 @@
     constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
 
     // nhwc optimized
-    if(have_zero_x_internal_padding(src->info(), weights->info()))
+    if (have_zero_x_internal_padding(src->info(), weights->info()))
     {
         // This function assumes that input and weights have no padding in the channel dimension
 
@@ -114,138 +116,154 @@
         * multiplication works on the correct input/weight elements.
         */
         execute_window_loop(
-            window_out, [&](const Coordinates & id)
-        {
-            /*
+            window_out,
+            [&](const Coordinates &id)
+            {
+                /*
             * Here we create theoretical indexes, which we then validate for both
             * the inputs and the weights.
             * As a reminder, this loop takes each output point in NHW; C is
             * treated in the weights loop.
             */
-            // We are computing the theoretical starting input starting points
-            const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
-            const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
-            const int in_w_end_t   = in_w_start_t + kernel_dim_w;
-            const int in_h_end_t   = in_h_start_t + kernel_dim_h;
+                // We are computing the theoretical input starting points
+                const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+                const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+                const int in_w_end_t   = in_w_start_t + kernel_dim_w;
+                const int in_h_end_t   = in_h_start_t + kernel_dim_h;
 
-            // We are computing the valid initial and ending input points by checking the borders
-            const int in_w_start = std::max(in_w_start_t, 0);
-            const int in_h_start = std::max(in_h_start_t, 0);
-            const int in_w_end   = std::min(in_w_end_t, input_dim_w);
-            const int in_h_end   = std::min(in_h_end_t, input_dim_h);
+                // We are computing the valid initial and ending input points by checking the borders
+                const int in_w_start = std::max(in_w_start_t, 0);
+                const int in_h_start = std::max(in_h_start_t, 0);
+                const int in_w_end   = std::min(in_w_end_t, input_dim_w);
+                const int in_h_end   = std::min(in_h_end_t, input_dim_h);
 
-            // We use the input points to select the valid weight points to use
-            const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
-            const int index_h_start  = in_h_start - in_h_start_t;
-            const int index_wc_end   = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
-            const int index_h_end    = kernel_dim_h - (in_h_end_t - in_h_end);
+                // We use the input points to select the valid weight points to use
+                const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
+                const int index_h_start  = in_h_start - in_h_start_t;
+                const int index_wc_end   = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
+                const int index_h_end    = kernel_dim_h - (in_h_end_t - in_h_end);
 
-            execute_window_loop(
-                window_w, [&](const Coordinates & id_w)
-            {
-                /*
+                execute_window_loop(
+                    window_w,
+                    [&](const Coordinates &id_w)
+                    {
+                        /*
                 * This is the loop over the weights, and it goes along N (the batches).
                 * As a reminder, the batches of the weights are translated into the
                 * channels of the output.
                 */
-                const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes())
-                                      + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
-                const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
-                uint8_t *out_ptr         = out.ptr() + id_w[3] * output_stride_c;
+                        const T *in_ptr_row =
+                            reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+                            id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
+                        const T *weights_ptr_row =
+                            reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
+                        uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
 
-                T out_temp = static_cast<T>(0);
-                for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
-                {
-                    const T    *in_ptr_mover = in_ptr_row;
-                    int         index_wc     = index_wc_start;
-                    vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
-                    for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
-                    {
-                        const auto src_vec = wrapper::vloadq(in_ptr_mover);
-                        const auto w_vec   = wrapper::vloadq(weights_ptr_row + index_wc);
-                        out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
-                    }
-                    out_temp += vreduce(out_temp_vec);
-                    for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
-                    {
-                        const auto src_val = *(in_ptr_mover);
-                        const auto w_val   = *(weights_ptr_row + index_wc);
-                        out_temp += src_val * w_val;
-                    }
-                }
-                *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+                        T out_temp = static_cast<T>(0);
+                        for (int index_h = index_h_start; index_h < index_h_end;
+                             ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
+                        {
+                            const T    *in_ptr_mover = in_ptr_row;
+                            int         index_wc     = index_wc_start;
+                            vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
+                            for (; index_wc <= index_wc_end - num_elems_read_per_iteration;
+                                 index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
+                            {
+                                const auto src_vec = wrapper::vloadq(in_ptr_mover);
+                                const auto w_vec   = wrapper::vloadq(weights_ptr_row + index_wc);
+                                out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+                            }
+                            out_temp += vreduce(out_temp_vec);
+                            for (; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
+                            {
+                                const auto src_val = *(in_ptr_mover);
+                                const auto w_val   = *(weights_ptr_row + index_wc);
+                                out_temp += src_val * w_val;
+                            }
+                        }
+                        *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+                    },
+                    wei);
             },
-            wei);
-        },
-        out);
+            out);
     }
     else // nhwc non optimized
     {
         execute_window_loop(
-            window_out, [&](const Coordinates & id)
-        {
-            // We are computing the theoretical starting input starting points
-            const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
-            const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
-            const int in_w_end_t   = in_w_start_t + kernel_dim_w;
-            const int in_h_end_t   = in_h_start_t + kernel_dim_h;
-
-            // We are computing the valid initial and ending input points by checking the borders
-            const int in_w_start = std::max(in_w_start_t, 0);
-            const int in_h_start = std::max(in_h_start_t, 0);
-            const int in_w_end   = std::min(in_w_end_t, input_dim_w);
-            const int in_h_end   = std::min(in_h_end_t, input_dim_h);
-
-            // We use the input points to select the valid weight points to use
-            const int wei_w_start = in_w_start - in_w_start_t;
-            const int wei_h_start = in_h_start - in_h_start_t;
-            const int wei_w_end   = kernel_dim_w - (in_w_end_t - in_w_end);
-            const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);
-
-            const int      index_c_end  = weights->info()->dimension(0);
-            const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
-
-            execute_window_loop(
-                window_w, [&](const Coordinates & id_w)
+            window_out,
+            [&](const Coordinates &id)
             {
-                const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
-                uint8_t       *out_ptr           = out.ptr() + id_w[3] * output_stride_c;
+                // We are computing the theoretical input starting points
+                const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
+                const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
+                const int in_w_end_t   = in_w_start_t + kernel_dim_w;
+                const int in_h_end_t   = in_h_start_t + kernel_dim_h;
 
-                T out_temp = static_cast<T>(0);
-                for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
-                {
-                    const T *const in_ptr_row      = in_ptr_start + index_in_h * input_stride_h;
-                    const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
-                    for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
+                // We are computing the valid initial and ending input points by checking the borders
+                const int in_w_start = std::max(in_w_start_t, 0);
+                const int in_h_start = std::max(in_h_start_t, 0);
+                const int in_w_end   = std::min(in_w_end_t, input_dim_w);
+                const int in_h_end   = std::min(in_h_end_t, input_dim_h);
+
+                // We use the input points to select the valid weight points to use
+                const int wei_w_start = in_w_start - in_w_start_t;
+                const int wei_h_start = in_h_start - in_h_start_t;
+                const int wei_w_end   = kernel_dim_w - (in_w_end_t - in_w_end);
+                const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);
+
+                const int      index_c_end = weights->info()->dimension(0);
+                const T *const in_ptr_start =
+                    reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
+                    id[3] * input_stride_n;
+
+                execute_window_loop(
+                    window_w,
+                    [&](const Coordinates &id_w)
                     {
-                        const T    *in_ptr_mover      = in_ptr_row + index_in_w * input_stride_w;
-                        const T    *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
-                        int         index_c           = 0;
-                        vector_type out_temp_vec      = wrapper::vdup_n(static_cast<T>(0), tag_type());
-                        for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
+                        const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
+                        uint8_t       *out_ptr           = out.ptr() + id_w[3] * output_stride_c;
+
+                        T out_temp = static_cast<T>(0);
+                        for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
+                             ++index_wei_h, ++index_in_h)
                         {
-                            const auto src_vec = wrapper::vloadq(in_ptr_mover);
-                            const auto w_vec   = wrapper::vloadq(weights_ptr_mover);
-                            out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+                            const T *const in_ptr_row      = in_ptr_start + index_in_h * input_stride_h;
+                            const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
+                            for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end;
+                                 ++index_wei_w, ++index_in_w)
+                            {
+                                const T    *in_ptr_mover      = in_ptr_row + index_in_w * input_stride_w;
+                                const T    *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
+                                int         index_c           = 0;
+                                vector_type out_temp_vec      = wrapper::vdup_n(static_cast<T>(0), tag_type());
+                                for (; index_c <= index_c_end - num_elems_read_per_iteration;
+                                     index_c += num_elems_read_per_iteration,
+                                     in_ptr_mover += num_elems_read_per_iteration,
+                                     weights_ptr_mover += num_elems_read_per_iteration)
+                                {
+                                    const auto src_vec = wrapper::vloadq(in_ptr_mover);
+                                    const auto w_vec   = wrapper::vloadq(weights_ptr_mover);
+                                    out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
+                                }
+                                out_temp += vreduce(out_temp_vec);
+                                for (; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
+                                {
+                                    const auto src_val = *(in_ptr_mover);
+                                    const auto w_val   = *(weights_ptr_mover);
+                                    out_temp += src_val * w_val;
+                                }
+                            }
                         }
-                        out_temp += vreduce(out_temp_vec);
-                        for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
-                        {
-                            const auto src_val = *(in_ptr_mover);
-                            const auto w_val   = *(weights_ptr_mover);
-                            out_temp += src_val * w_val;
-                        }
-                    }
-                }
-                *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+                        *(reinterpret_cast<T *>(out_ptr)) = out_temp;
+                    },
+                    wei);
             },
-            wei);
-        },
-        out);
+            out);
     }
 }
 
-template void convolve_nhwc<float>(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+template void convolve_nhwc<float>(
+    const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
 
 } // namespace kernels
 } // namespace cpu
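
Note: the convolution inner loops above all share one vectorise-then-tail shape: accumulate with wrapper::vmla across num_elems_read_per_iteration lanes, reduce with vreduce(), then finish the leftover elements in a scalar loop. A minimal standalone sketch of that pattern, assuming AArch64 and raw NEON intrinsics rather than the library's wrapper:: layer:

    #include <arm_neon.h>

    // Dot product with a 4-lane vector body and a scalar tail,
    // mirroring the vmla/vreduce structure of convolve_nhwc above.
    float dot(const float *a, const float *b, int n)
    {
        float32x4_t acc_v = vdupq_n_f32(0.f);
        int         i     = 0;
        for (; i <= n - 4; i += 4)
        {
            acc_v = vmlaq_f32(acc_v, vld1q_f32(a + i), vld1q_f32(b + i));
        }
        float acc = vaddvq_f32(acc_v); // horizontal reduce (AArch64 only)
        for (; i < n; ++i)             // scalar tail
        {
            acc += a[i] * b[i];
        }
        return acc;
    }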
diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
index 3b26fcd..efb9ce8 100644
--- a/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
+++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.h
@@ -26,6 +26,7 @@
 #define SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H
 
 #include "arm_compute/core/ITensor.h"
+
 #include "src/core/helpers/WindowHelpers.h"
 
 namespace arm_compute
@@ -35,7 +36,8 @@
 namespace kernels
 {
 template <typename T>
-void convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
+void convolve_nhwc(
+    const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);
 } // namespace kernels
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
index 6091ef2..9b4375f 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp16.cpp
@@ -23,6 +23,7 @@
  */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
 
 namespace arm_compute
@@ -35,14 +36,38 @@
     return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float16_t, 8>>(in1, in2, out, window);
 }
 
-template void neon_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+                                                                              const ITensor *in2,
+                                                                              ITensor       *out,
+                                                                              const Window  &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+                                                                       const ITensor *in2,
+                                                                       ITensor       *out,
+                                                                       const Window  &window);
+template void neon_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+                                                                       const ITensor *in2,
+                                                                       ITensor       *out,
+                                                                       const Window  &window);
 
 template <ComparisonOperation op>
 void neon_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
@@ -50,12 +75,30 @@
     return elementwise_comp_op_16<op, float16_t, float16x8_t>(in1, in2, out, window);
 }
 
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-}
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                                  const ITensor *in2,
+                                                                                  ITensor       *out,
+                                                                                  const Window  &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                     const ITensor *in2,
+                                                                                     ITensor       *out,
+                                                                                     const Window  &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                    const ITensor *in2,
+                                                                                    ITensor       *out,
+                                                                                    const Window  &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+                                                                                         const ITensor *in2,
+                                                                                         ITensor       *out,
+                                                                                         const Window  &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                                 const ITensor *in2,
+                                                                                 ITensor       *out,
+                                                                                 const Window  &window);
+template void neon_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                      const ITensor *in2,
+                                                                                      ITensor       *out,
+                                                                                      const Window  &window);
+} // namespace cpu
 } // namespace arm_compute
 #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
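
Note: the long template void ... blocks above are explicit instantiations. The kernel templates are defined in this .cpp rather than the header, so every ArithmeticOperation and ComparisonOperation variant that callers may link against has to be stamped out by name. A tiny sketch of the mechanism, with a hypothetical kernel() standing in for the real functions:

    // definitions live in the .cpp, not the header
    template <int Op>
    void kernel(const float *in, float *out, int n)
    {
        for (int i = 0; i < n; ++i)
        {
            out[i] = (Op == 0) ? in[i] + 1.0f : in[i] - 1.0f;
        }
    }

    // emit exactly these two symbols into this translation unit
    template void kernel<0>(const float *, float *, int);
    template void kernel<1>(const float *, float *, int);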
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
index 2d8fec9..53ccd89 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/fp32.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
 
 namespace arm_compute
@@ -34,25 +35,67 @@
     return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<float, 4>>(in1, in2, out, window);
 }
 
-template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+                                                                              const ITensor *in2,
+                                                                              ITensor       *out,
+                                                                              const Window  &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+                                                                       const ITensor *in2,
+                                                                       ITensor       *out,
+                                                                       const Window  &window);
+template void neon_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+                                                                       const ITensor *in2,
+                                                                       ITensor       *out,
+                                                                       const Window  &window);
 
 template <ComparisonOperation op>
 void neon_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     return elementwise_comp_op_32<op, float, float32x4_t>(in1, in2, out, window);
 }
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-}
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                                  const ITensor *in2,
+                                                                                  ITensor       *out,
+                                                                                  const Window  &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                     const ITensor *in2,
+                                                                                     ITensor       *out,
+                                                                                     const Window  &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                    const ITensor *in2,
+                                                                                    ITensor       *out,
+                                                                                    const Window  &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+                                                                                         const ITensor *in2,
+                                                                                         ITensor       *out,
+                                                                                         const Window  &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                                 const ITensor *in2,
+                                                                                 ITensor       *out,
+                                                                                 const Window  &window);
+template void neon_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                      const ITensor *in2,
+                                                                                      ITensor       *out,
+                                                                                      const Window  &window);
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
index 98b154e..98f7e8b 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/impl.h
@@ -39,7 +39,7 @@
 
     vec_type res = wrapper::vdup_n(static_cast<scalar_type>(0), tag_type{});
 
-    switch(op)
+    switch (op)
     {
         case ArithmeticOperation::MAX:
             res = wrapper::vmax(a, b);
@@ -71,7 +71,9 @@
 }
 
 template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a, const ScalarType &broadcast_value, const bool reorder)
+typename VectorType::type elementwise_arithm_op_broadcast(const typename VectorType::type &a,
+                                                          const ScalarType                &broadcast_value,
+                                                          const bool                       reorder)
 {
     using tag_type = typename VectorType::tag_type;
     using vec_type = typename VectorType::type;
@@ -81,10 +83,15 @@
 }
 
 template <typename InputScalarType, typename OutputScalarType, typename InputVectorType>
-void elementwise_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-                    OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
-                    int (*broadcast_func)(int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
-                    int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
+void elementwise_op(
+    const ITensor *in1,
+    const ITensor *in2,
+    ITensor       *out,
+    const Window  &window,
+    OutputScalarType (*scalar_func)(const InputScalarType &, const InputScalarType &),
+    int (*broadcast_func)(
+        int, int, int, const InputScalarType *, const InputScalarType &, OutputScalarType *, const bool),
+    int (*neon_func)(int, int, int, const InputScalarType *, const InputScalarType *, OutputScalarType *))
 {
     // Create input windows
     Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -99,7 +106,7 @@
     const auto window_end_x          = static_cast<int>(window.x().end());
     const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -114,20 +121,26 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            auto                  output_ptr              = reinterpret_cast<OutputScalarType *>(output.ptr());
-            const auto            non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
-            const InputScalarType broadcast_value         = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-
-            int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_value, output_ptr, !is_broadcast_input_2);
-            for(; x < window_end_x; ++x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto a      = *(non_broadcast_input_ptr + x);
-                *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a, !is_broadcast_input_2 ? a : broadcast_value);
-            }
-        },
-        broadcast_input, non_broadcast_input, output);
+                auto       output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+                const auto non_broadcast_input_ptr =
+                    reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+                const InputScalarType broadcast_value =
+                    *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+
+                int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+                                          broadcast_value, output_ptr, !is_broadcast_input_2);
+                for (; x < window_end_x; ++x)
+                {
+                    const auto a      = *(non_broadcast_input_ptr + x);
+                    *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? broadcast_value : a,
+                                                       !is_broadcast_input_2 ? a : broadcast_value);
+                }
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -139,21 +152,23 @@
         Iterator input2(in2, input2_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            auto       output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
-            const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
-            int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
-            for(; x < window_end_x; ++x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto a      = *(input1_ptr + x);
-                const auto b      = *(input2_ptr + x);
-                *(output_ptr + x) = (*scalar_func)(a, b);
-            }
-        },
-        input1, input2, output);
+                auto       output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+                const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
+
+                int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr);
+                for (; x < window_end_x; ++x)
+                {
+                    const auto a      = *(input1_ptr + x);
+                    const auto b      = *(input2_ptr + x);
+                    *(output_ptr + x) = (*scalar_func)(a, b);
+                }
+            },
+            input1, input2, output);
     }
 }
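
Note: elementwise_op above drives each row through two callbacks: neon_func handles the vector body and returns the first x index it did not process, and scalar_func finishes the tail (broadcast_func plays the same role when one operand is constant along x). Stripped to a skeleton with hypothetical names and no Window or broadcast handling:

    using ScalarFn = float (*)(float, float);
    using VectorFn = int (*)(int start, int end, int step, const float *, const float *, float *);

    // Two-stage dispatch: vector body first, scalar tail from wherever it stopped.
    void row_op(const float *a, const float *b, float *out, int n, int step, ScalarFn s, VectorFn v)
    {
        int x = v(0, n, step, a, b, out);
        for (; x < n; ++x)
        {
            out[x] = s(a[x], b[x]);
        }
    }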
 
@@ -162,7 +177,7 @@
 {
     auto res = ScalarType(0);
 
-    switch(op)
+    switch (op)
     {
         case ArithmeticOperation::MAX:
             res = std::max(a, b);
@@ -183,10 +198,10 @@
         case ArithmeticOperation::DIV:
         {
             res = a / b;
-            if(std::is_integral<ScalarType>::value)
+            if (std::is_integral<ScalarType>::value)
             {
                 res = (b == 0) ? 0 : res;
-                if(static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
+                if (static_cast<int32_t>(a) % static_cast<int32_t>(b) != 0 && ((a < 0) != (b < 0)))
                 {
                     --res;
                 }
@@ -205,43 +220,56 @@
 }
 
 template <>
-inline int32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a, const int32x4_t &b)
+inline int32x4_t
+elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<int32_t, 4>>(const int32x4_t &a,
+                                                                                                   const int32x4_t &b)
 {
     return vcvtq_s32_f32(vfloorq_f32(wrapper::vdiv(vcvtq_f32_s32(a), vcvtq_f32_s32(b))));
 }
 
 template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
+inline float32x4_t
+elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a,
+                                                                                                 const float32x4_t &b)
 {
     return wrapper::vdiv(a, b);
 }
 
 template <>
-inline float32x4_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a, const float32x4_t &b)
+inline float32x4_t
+elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float, 4>>(const float32x4_t &a,
+                                                                                                   const float32x4_t &b)
 {
     return wrapper::vpow(a, b);
 }
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
+inline float16x8_t elementwise_arithm_op<ArithmeticOperation::DIV, typename wrapper::traits::neon_vector<float16_t, 8>>(
+    const float16x8_t &a, const float16x8_t &b)
 {
     return wrapper::vdiv(a, b);
 }
 
 template <>
-inline float16x8_t elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(const float16x8_t &a, const float16x8_t &b)
+inline float16x8_t
+elementwise_arithm_op<ArithmeticOperation::POWER, typename wrapper::traits::neon_vector<float16_t, 8>>(
+    const float16x8_t &a, const float16x8_t &b)
 {
     return wrapper::vpow(a, b);
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_loop(int window_start_x, int window_end_x, int window_step_x,
-                                      const ScalarType *input1_ptr, const ScalarType *input2_ptr, ScalarType *output_ptr)
+inline int elementwise_arithm_op_loop(int               window_start_x,
+                                      int               window_end_x,
+                                      int               window_step_x,
+                                      const ScalarType *input1_ptr,
+                                      const ScalarType *input2_ptr,
+                                      ScalarType       *output_ptr)
 {
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
         const auto a = wrapper::vloadq(input1_ptr + x);
         const auto b = wrapper::vloadq(input2_ptr + x);
@@ -251,14 +279,20 @@
 }
 
 template <ArithmeticOperation op, typename ScalarType, typename VectorType>
-inline int elementwise_arithm_op_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
-                                                const ScalarType *non_broadcast_input_ptr, const ScalarType &broadcast_value, ScalarType *output_ptr, const bool reorder)
+inline int elementwise_arithm_op_broadcast_loop(int               window_start_x,
+                                                int               window_end_x,
+                                                int               window_step_x,
+                                                const ScalarType *non_broadcast_input_ptr,
+                                                const ScalarType &broadcast_value,
+                                                ScalarType       *output_ptr,
+                                                const bool        reorder)
 {
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
         const auto a = wrapper::vloadq((non_broadcast_input_ptr + x));
-        wrapper::vstore(output_ptr + x, elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
+        wrapper::vstore(output_ptr + x,
+                        elementwise_arithm_op_broadcast<op, ScalarType, VectorType>(a, broadcast_value, reorder));
     }
     return x;
 }
@@ -268,10 +302,10 @@
 {
     using scalar_type = typename VectorType::scalar_type;
 
-    elementwise_op<scalar_type, scalar_type, VectorType>(in1, in2, out, window,
-                                                         &elementwise_arithm_op_scalar<op, scalar_type>,
-                                                         &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
-                                                         &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
+    elementwise_op<scalar_type, scalar_type, VectorType>(
+        in1, in2, out, window, &elementwise_arithm_op_scalar<op, scalar_type>,
+        &elementwise_arithm_op_broadcast_loop<op, scalar_type, VectorType>,
+        &elementwise_arithm_op_loop<op, scalar_type, VectorType>);
 }
 
 template <ComparisonOperation op, typename InputScalarType>
@@ -279,7 +313,7 @@
 {
     bool res = false;
 
-    switch(op)
+    switch (op)
     {
         case ComparisonOperation::Equal:
             res = (a == b);
@@ -308,9 +342,9 @@
 template <ComparisonOperation op, typename InputVectorType, typename OutputVectorType>
 inline OutputVectorType elementwise_comp_op(const InputVectorType &a, const InputVectorType &b)
 {
-    OutputVectorType res = { 0, 0, 0, 0 };
+    OutputVectorType res = {0, 0, 0, 0};
 
-    switch(op)
+    switch (op)
     {
         case ComparisonOperation::Equal:
             res = wrapper::vceq(a, b);
@@ -338,53 +372,75 @@
 }
 
 template <ComparisonOperation op, typename InputScalarType, typename InputVectorType, typename OutputVectorType>
-inline OutputVectorType elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
+inline OutputVectorType
+elementwise_comp_op_broadcast(const InputVectorType &a, const InputScalarType &broadcast_value, const bool reorder)
 {
     InputVectorType broadcast_vector = wrapper::vdup_n(broadcast_value, wrapper::traits::vector_128_tag());
-    return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a, reorder ? a : broadcast_vector);
+    return elementwise_comp_op<op, InputVectorType, OutputVectorType>(reorder ? broadcast_vector : a,
+                                                                      reorder ? a : broadcast_vector);
 }
 
 template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_8_loop(int window_start_x, int window_end_x, int window_step_x,
-                                                const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+inline int elementwise_comp_op_broadcast_8_loop(int                    window_start_x,
+                                                int                    window_end_x,
+                                                int                    window_step_x,
+                                                const InputScalarType *non_broadcast_input_ptr,
+                                                const InputScalarType &broadcast_value,
+                                                uint8_t               *output_ptr,
+                                                const bool             reorder)
 {
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
-        const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+        const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint8x16_t>(
+            wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
         wrapper::vstore(output_ptr + x, a);
     }
     return x;
 }
 
 template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_16_loop(int window_start_x, int window_end_x, int window_step_x,
-                                                 const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+inline int elementwise_comp_op_broadcast_16_loop(int                    window_start_x,
+                                                 int                    window_end_x,
+                                                 int                    window_step_x,
+                                                 const InputScalarType *non_broadcast_input_ptr,
+                                                 const InputScalarType &broadcast_value,
+                                                 uint8_t               *output_ptr,
+                                                 const bool             reorder)
 {
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
-        const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+        const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint16x8_t>(
+            wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
         wrapper::vstore(output_ptr + x, wrapper::vmovn(a));
     }
     return x;
 }
 
 template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_broadcast_32_loop(int window_start_x, int window_end_x, int window_step_x,
-                                                 const InputScalarType *non_broadcast_input_ptr, const InputScalarType &broadcast_value, uint8_t *output_ptr, const bool reorder)
+inline int elementwise_comp_op_broadcast_32_loop(int                    window_start_x,
+                                                 int                    window_end_x,
+                                                 int                    window_step_x,
+                                                 const InputScalarType *non_broadcast_input_ptr,
+                                                 const InputScalarType &broadcast_value,
+                                                 uint8_t               *output_ptr,
+                                                 const bool             reorder)
 {
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
-        const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
-        const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
+        const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
+            wrapper::vloadq(non_broadcast_input_ptr + x), broadcast_value, reorder);
+        const auto b = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
+            wrapper::vloadq(non_broadcast_input_ptr + x + 4), broadcast_value, reorder);
         wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(a), wrapper::vmovn(b))));
     }
-    if(x <= window_end_x - 4)
+    if (x <= window_end_x - 4)
     {
-        const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
-        for(int i = 0; i < 4; i++)
+        const auto a = elementwise_comp_op_broadcast<op, InputScalarType, InputVectorType, uint32x4_t>(
+            wrapper::vloadq((non_broadcast_input_ptr + x)), broadcast_value, reorder);
+        for (int i = 0; i < 4; i++)
         {
             *(output_ptr + x + i) = wrapper::vgetlane(a, i);
         }
@@ -394,11 +450,15 @@
 }
 
 template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_8_loop(int window_start_x, int window_end_x, int window_step_x,
-                                      const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+inline int elementwise_comp_op_8_loop(int                    window_start_x,
+                                      int                    window_end_x,
+                                      int                    window_step_x,
+                                      const InputScalarType *input1_ptr,
+                                      const InputScalarType *input2_ptr,
+                                      uint8_t               *output_ptr)
 {
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
         const auto a   = wrapper::vloadq(input1_ptr + x);
         const auto b   = wrapper::vloadq(input2_ptr + x);
@@ -409,11 +469,15 @@
 }
 
 template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_16_loop(int window_start_x, int window_end_x, int window_step_x,
-                                       const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+inline int elementwise_comp_op_16_loop(int                    window_start_x,
+                                       int                    window_end_x,
+                                       int                    window_step_x,
+                                       const InputScalarType *input1_ptr,
+                                       const InputScalarType *input2_ptr,
+                                       uint8_t               *output_ptr)
 {
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
         const auto a   = wrapper::vloadq(input1_ptr + x);
         const auto b   = wrapper::vloadq(input2_ptr + x);
@@ -424,11 +488,15 @@
 }
 
 template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
-inline int elementwise_comp_op_32_loop(int window_start_x, int window_end_x, int window_step_x,
-                                       const InputScalarType *input1_ptr, const InputScalarType *input2_ptr, uint8_t *output_ptr)
+inline int elementwise_comp_op_32_loop(int                    window_start_x,
+                                       int                    window_end_x,
+                                       int                    window_step_x,
+                                       const InputScalarType *input1_ptr,
+                                       const InputScalarType *input2_ptr,
+                                       uint8_t               *output_ptr)
 {
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
         auto       a    = wrapper::vloadq(input1_ptr + x);
         auto       b    = wrapper::vloadq(input2_ptr + x);
@@ -438,12 +506,12 @@
         const auto res2 = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
         wrapper::vstore(output_ptr + x, wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(res), wrapper::vmovn(res2))));
     }
-    if(x <= window_end_x - 4)
+    if (x <= window_end_x - 4)
     {
         const auto a   = wrapper::vloadq(input1_ptr + x);
         const auto b   = wrapper::vloadq(input2_ptr + x);
         const auto res = elementwise_comp_op<op, InputVectorType, uint32x4_t>(a, b);
-        for(int i = 0; i < 4; i++)
+        for (int i = 0; i < 4; i++)
         {
             *(output_ptr + x + i) = wrapper::vgetlane(res, i);
         }
@@ -455,57 +523,59 @@
 template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
 void elementwise_comp_op_8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
-                                                              &elementwise_comp_op_scalar<op, InputScalarType>,
-                                                              &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
-                                                              &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
+    elementwise_op<InputScalarType, uint8_t, InputVectorType>(
+        in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
+        &elementwise_comp_op_broadcast_8_loop<op, InputScalarType, InputVectorType>,
+        &elementwise_comp_op_8_loop<op, InputScalarType, InputVectorType>);
 }
 
 template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
 void elementwise_comp_op_16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
-                                                              &elementwise_comp_op_scalar<op, InputScalarType>,
-                                                              &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
-                                                              &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
+    elementwise_op<InputScalarType, uint8_t, InputVectorType>(
+        in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
+        &elementwise_comp_op_broadcast_16_loop<op, InputScalarType, InputVectorType>,
+        &elementwise_comp_op_16_loop<op, InputScalarType, InputVectorType>);
 }
 
 template <ComparisonOperation op, typename InputScalarType, typename InputVectorType>
 void elementwise_comp_op_32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
-    elementwise_op<InputScalarType, uint8_t, InputVectorType>(in1, in2, out, window,
-                                                              &elementwise_comp_op_scalar<op, InputScalarType>,
-                                                              &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
-                                                              &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
+    elementwise_op<InputScalarType, uint8_t, InputVectorType>(
+        in1, in2, out, window, &elementwise_comp_op_scalar<op, InputScalarType>,
+        &elementwise_comp_op_broadcast_32_loop<op, InputScalarType, InputVectorType>,
+        &elementwise_comp_op_32_loop<op, InputScalarType, InputVectorType>);
 }
 
 inline float32x4x4_t load_quantized(const uint8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
 {
-    qasymm8x16_t        x = vld1q_u8(input1_ptr);
-    const float32x4x4_t out =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)), scale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)), scale),
-        }
-    };
+    qasymm8x16_t        x   = vld1q_u8(input1_ptr);
+    const float32x4x4_t out = {{
+        vmulq_f32(
+            vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(x))))), offset)),
+            scale),
+        vmulq_f32(
+            vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(x))))), offset)),
+            scale),
+        vmulq_f32(
+            vcvtq_f32_s32(vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(x))))), offset)),
+            scale),
+        vmulq_f32(vcvtq_f32_s32(
+                      vsubq_s32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(x))))), offset)),
+                  scale),
+    }};
     return out;
 }
 
 inline float32x4x4_t load_quantized_signed(const int8_t *input1_ptr, const int32x4_t &offset, const float32x4_t &scale)
 {
-    qasymm8x16_signed_t x = vld1q_s8(input1_ptr);
-    const float32x4x4_t out =
-    {
-        {
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
-            vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
-        }
-    };
+    qasymm8x16_signed_t x   = vld1q_s8(input1_ptr);
+    const float32x4x4_t out = {{
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_low_s8(x)))), offset)), scale),
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_low_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
+        vmulq_f32(vcvtq_f32_s32(vsubq_s32(vmovl_s16(vget_high_s16(vmovl_s8(vget_high_s8(x)))), offset)), scale),
+    }};
     return out;
 }
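
The signed variant is simpler because vmovl_s8/vmovl_s16 sign-extend directly, so no reinterpret is needed before the zero-point subtraction. The same 4-lane slice, sketched under the same assumptions:

    #include <arm_neon.h>

    // Dequantize the lowest four int8 lanes: sign-extending widen, then
    // (x - offset) * scale, as load_quantized_signed does per slice.
    static inline float32x4_t dequantize_low4_s8(int8x16_t x, int32x4_t offset, float32x4_t scale)
    {
        const int16x8_t s16 = vmovl_s8(vget_low_s8(x));
        const int32x4_t s32 = vmovl_s16(vget_low_s16(s16));
        return vmulq_f32(vcvtq_f32_s32(vsubq_s32(s32, offset)), scale);
    }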
 
@@ -523,17 +593,15 @@
     vst1q_u8(output_ptr, vcombine_u8(pa, pb));
 }
 
-inline void store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+inline void
+store_quantized(uint8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
 {
-    int32x4x4_t out =
-    {
-        {
-            vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
-            vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
-            vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
-            vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
-        }
-    };
+    int32x4x4_t out = {{
+        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
+        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
+        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
+        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
+    }};
     store_quantized(output_ptr, out);
 }
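
store_quantized inverts that mapping: each float lane becomes v * (1 / scale) + zero_point via a fused multiply-add, with the caller folding a +0.5f rounding bias into the offset vector (see voffseto in elementwise_op_quantized further down) before the results are narrowed back to bytes. One 4-lane slice as a sketch:

    #include <arm_neon.h>

    // Requantize four floats: q = (int32)(offset_plus_bias + v * invscale).
    // The caller pre-biases the offset (zero_point + 0.5f) so the truncating
    // f32 -> s32 conversion rounds to nearest.
    static inline int32x4_t requantize4(float32x4_t v, float32x4_t offset_plus_bias, float32x4_t invscale)
    {
        return vcvtq_s32_f32(vmlaq_f32(offset_plus_bias, v, invscale));
    }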
 
@@ -544,17 +612,17 @@
     vst1q_s8(output_ptr, vcombine_s8(pa, pb));
 }
 
-inline void store_quantized_signed(int8_t *output_ptr, const float32x4x4_t &rf, const float32x4_t &offset, const float32x4_t &invscale)
+inline void store_quantized_signed(int8_t              *output_ptr,
+                                   const float32x4x4_t &rf,
+                                   const float32x4_t   &offset,
+                                   const float32x4_t   &invscale)
 {
-    int32x4x4_t out =
-    {
-        {
-            vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
-            vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
-            vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
-            vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
-        }
-    };
+    int32x4x4_t out = {{
+        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[0], invscale)),
+        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[1], invscale)),
+        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[2], invscale)),
+        vcvtq_s32_f32(vmlaq_f32(offset, rf.val[3], invscale)),
+    }};
     store_quantized_signed(output_ptr, out);
 }
 
@@ -565,7 +633,8 @@
 }
 
 template <ArithmeticOperation op>
-inline int8_t elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
+inline int8_t
+elementwise_arithm_op_quantized_signed_scalar(const float &a, const float &b, UniformQuantizationInfo qinfo)
 {
     return quantize_qasymm8_signed(elementwise_arithm_op_scalar<op>(a, b), qinfo);
 }
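
The scalar fallbacks compute the operation in float and requantize one value at a time; for the signed path, quantize_qasymm8_signed amounts to clamping round(v / scale) + offset into [-128, 127]. A standalone sketch of that mapping (assumed semantics, independent of the library's helpers):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // q = clamp(round(v / scale) + offset, -128, 127)
    static inline int8_t quantize_s8(float v, float scale, int offset)
    {
        const int q = static_cast<int>(std::lround(v / scale)) + offset;
        return static_cast<int8_t>(std::min(127, std::max(-128, q)));
    }
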
@@ -574,15 +643,12 @@
 float32x4x4_t elementwise_arithm_op(const float32x4x4_t &a, const float32x4x4_t &b)
 {
     using neon_vector_float = wrapper::traits::neon_vector<float, 4>;
-    float32x4x4_t out =
-    {
-        {
-            elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]),
-            elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]),
-            elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]),
-            elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]),
-        }
-    };
+    float32x4x4_t out       = {{
+              elementwise_arithm_op<op, neon_vector_float>(a.val[0], b.val[0]),
+              elementwise_arithm_op<op, neon_vector_float>(a.val[1], b.val[1]),
+              elementwise_arithm_op<op, neon_vector_float>(a.val[2], b.val[2]),
+              elementwise_arithm_op<op, neon_vector_float>(a.val[3], b.val[3]),
+    }};
     return out;
 }
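
This overload simply maps the single-vector elementwise_arithm_op over the four sub-vectors, so 16 dequantized values are combined per iteration. For instance, SQUARED_DIFF over one float32x4_t reduces to (a - b) squared per lane; a plain-NEON sketch outside the wrapper layer:

    #include <arm_neon.h>

    // Per-lane squared difference: (a - b) * (a - b).
    static inline float32x4_t squared_diff_f32(float32x4_t a, float32x4_t b)
    {
        const float32x4_t d = vsubq_f32(a, b);
        return vmulq_f32(d, d);
    }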
 
@@ -596,26 +662,29 @@
 template <ComparisonOperation op>
 inline uint32x4x4_t elementwise_comp_op(const float32x4x4_t &a, const float32x4x4_t &b)
 {
-    uint32x4x4_t out =
-    {
-        {
-            elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
-            elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
-            elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
-            elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])
-        }
-    };
+    uint32x4x4_t out = {{elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[0], b.val[0]),
+                         elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[1], b.val[1]),
+                         elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[2], b.val[2]),
+                         elementwise_comp_op<op, float32x4_t, uint32x4_t>(a.val[3], b.val[3])}};
     return out;
 }
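
The comparison counterpart returns full-width lane masks: each uint32 lane is all ones where the predicate holds and zero otherwise, which later code narrows into the uint8 output tensor. As a sketch, Greater over one vector is a single intrinsic:

    #include <arm_neon.h>

    // Lanes become 0xFFFFFFFF where a > b, else 0.
    static inline uint32x4_t greater_mask_f32(float32x4_t a, float32x4_t b)
    {
        return vcgtq_f32(a, b);
    }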
 
 template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
-                                                const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
-                                                int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
-                                                float32x4_t voffseto, float32x4_t invvscaleo)
+inline int elementwise_arithm_op_quantized_loop(int            window_start_x,
+                                                int            window_end_x,
+                                                int            window_step_x,
+                                                const uint8_t *input1_ptr,
+                                                const uint8_t *input2_ptr,
+                                                uint8_t       *output_ptr,
+                                                int32x4_t      voffset1,
+                                                int32x4_t      voffset2,
+                                                float32x4_t    vscale1,
+                                                float32x4_t    vscale2,
+                                                float32x4_t    voffseto,
+                                                float32x4_t    invvscaleo)
 {
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
         // Get inputs and compute output
         const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
@@ -627,13 +696,21 @@
 }
 
 template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_singed_loop(int window_start_x, int window_end_x, int window_step_x,
-                                                       const int8_t *input1_ptr, const int8_t *input2_ptr, int8_t *output_ptr,
-                                                       int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
-                                                       float32x4_t voffseto, float32x4_t invvscaleo)
+inline int elementwise_arithm_op_quantized_singed_loop(int           window_start_x,
+                                                       int           window_end_x,
+                                                       int           window_step_x,
+                                                       const int8_t *input1_ptr,
+                                                       const int8_t *input2_ptr,
+                                                       int8_t       *output_ptr,
+                                                       int32x4_t     voffset1,
+                                                       int32x4_t     voffset2,
+                                                       float32x4_t   vscale1,
+                                                       float32x4_t   vscale2,
+                                                       float32x4_t   voffseto,
+                                                       float32x4_t   invvscaleo)
 {
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
         // Get inputs and compute output
         const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
@@ -645,45 +722,71 @@
 }
 
 template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
-                                                          const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
-                                                          int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
-                                                          float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+inline int elementwise_arithm_op_quantized_broadcast_loop(int            window_start_x,
+                                                          int            window_end_x,
+                                                          int            window_step_x,
+                                                          const uint8_t *non_broadcast_input_ptr,
+                                                          float32x4x4_t  broadcast_vector,
+                                                          uint8_t       *output_ptr,
+                                                          int32x4_t      voffset_non_broadcast,
+                                                          float32x4_t    vscale_non_broadcast,
+                                                          float32x4_t    voffseto,
+                                                          float32x4_t    invvscaleo,
+                                                          bool           reorder)
 {
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
-        const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
-        const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+        const float32x4x4_t af =
+            load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+        const float32x4x4_t rf =
+            elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
         store_quantized(output_ptr + x, rf, voffseto, invvscaleo);
     }
     return x;
 }
 template <ArithmeticOperation op>
-inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
-                                                                 const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, int8_t *output_ptr,
-                                                                 int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
-                                                                 float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+inline int elementwise_arithm_op_quantized_signed_broadcast_loop(int           window_start_x,
+                                                                 int           window_end_x,
+                                                                 int           window_step_x,
+                                                                 const int8_t *non_broadcast_input_ptr,
+                                                                 float32x4x4_t broadcast_vector,
+                                                                 int8_t       *output_ptr,
+                                                                 int32x4_t     voffset_non_broadcast,
+                                                                 float32x4_t   vscale_non_broadcast,
+                                                                 float32x4_t   voffseto,
+                                                                 float32x4_t   invvscaleo,
+                                                                 bool          reorder)
 {
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
-        const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
-        const float32x4x4_t rf = elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+        const float32x4x4_t af =
+            load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+        const float32x4x4_t rf =
+            elementwise_arithm_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
         store_quantized_signed(output_ptr + x, rf, voffseto, invvscaleo);
     }
     return x;
 }
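
In both arithmetic broadcast loops above (and the comparison ones further down), the reorder flag restores operand order for non-commutative operations: the drivers pass !is_broadcast_input_2, so when the broadcast tensor was the first input, the broadcast vector is fed as the left operand. In isolation the selection looks like this (illustrative helper, not library code):

    #include <arm_neon.h>

    // Order-preserving subtract when one operand is a broadcast vector.
    // reorder == true means the broadcast tensor was the *first* input.
    static inline float32x4_t sub_ordered(float32x4_t non_broadcast, float32x4_t broadcast, bool reorder)
    {
        return reorder ? vsubq_f32(broadcast, non_broadcast)
                       : vsubq_f32(non_broadcast, broadcast);
    }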
 
 template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_loop(int window_start_x, int window_end_x, int window_step_x,
-                                              const uint8_t *input1_ptr, const uint8_t *input2_ptr, uint8_t *output_ptr,
-                                              int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
-                                              float32x4_t voffseto, float32x4_t invvscaleo)
+inline int elementwise_comp_op_quantized_loop(int            window_start_x,
+                                              int            window_end_x,
+                                              int            window_step_x,
+                                              const uint8_t *input1_ptr,
+                                              const uint8_t *input2_ptr,
+                                              uint8_t       *output_ptr,
+                                              int32x4_t      voffset1,
+                                              int32x4_t      voffset2,
+                                              float32x4_t    vscale1,
+                                              float32x4_t    vscale2,
+                                              float32x4_t    voffseto,
+                                              float32x4_t    invvscaleo)
 {
     ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
         const float32x4x4_t af = load_quantized(input1_ptr + x, voffset1, vscale1);
         const float32x4x4_t bf = load_quantized(input2_ptr + x, voffset2, vscale2);
@@ -694,14 +797,22 @@
 }
 
 template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_signed_loop(int window_start_x, int window_end_x, int window_step_x,
-                                                     const int8_t *input1_ptr, const int8_t *input2_ptr, uint8_t *output_ptr,
-                                                     int32x4_t voffset1, int32x4_t voffset2, float32x4_t vscale1, float32x4_t vscale2,
-                                                     float32x4_t voffseto, float32x4_t invvscaleo)
+inline int elementwise_comp_op_quantized_signed_loop(int           window_start_x,
+                                                     int           window_end_x,
+                                                     int           window_step_x,
+                                                     const int8_t *input1_ptr,
+                                                     const int8_t *input2_ptr,
+                                                     uint8_t      *output_ptr,
+                                                     int32x4_t     voffset1,
+                                                     int32x4_t     voffset2,
+                                                     float32x4_t   vscale1,
+                                                     float32x4_t   vscale2,
+                                                     float32x4_t   voffseto,
+                                                     float32x4_t   invvscaleo)
 {
     ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
         const float32x4x4_t af = load_quantized_signed(input1_ptr + x, voffset1, vscale1);
         const float32x4x4_t bf = load_quantized_signed(input2_ptr + x, voffset2, vscale2);
@@ -712,46 +823,85 @@
 }
 
 template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
-                                                        const uint8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
-                                                        int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
-                                                        float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+inline int elementwise_comp_op_quantized_broadcast_loop(int            window_start_x,
+                                                        int            window_end_x,
+                                                        int            window_step_x,
+                                                        const uint8_t *non_broadcast_input_ptr,
+                                                        float32x4x4_t  broadcast_vector,
+                                                        uint8_t       *output_ptr,
+                                                        int32x4_t      voffset_non_broadcast,
+                                                        float32x4_t    vscale_non_broadcast,
+                                                        float32x4_t    voffseto,
+                                                        float32x4_t    invvscaleo,
+                                                        bool           reorder)
 {
     ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
-        const float32x4x4_t af = load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
-        const uint32x4x4_t  rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+        const float32x4x4_t af =
+            load_quantized(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+        const uint32x4x4_t rf =
+            elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
         store_quantized(output_ptr + x, rf);
     }
     return x;
 }
 
 template <ComparisonOperation op>
-inline int elementwise_comp_op_quantized_signed_broadcast_loop(int window_start_x, int window_end_x, int window_step_x,
-                                                               const int8_t *non_broadcast_input_ptr, float32x4x4_t broadcast_vector, uint8_t *output_ptr,
-                                                               int32x4_t voffset_non_broadcast, float32x4_t vscale_non_broadcast,
-                                                               float32x4_t voffseto, float32x4_t invvscaleo, bool reorder)
+inline int elementwise_comp_op_quantized_signed_broadcast_loop(int           window_start_x,
+                                                               int           window_end_x,
+                                                               int           window_step_x,
+                                                               const int8_t *non_broadcast_input_ptr,
+                                                               float32x4x4_t broadcast_vector,
+                                                               uint8_t      *output_ptr,
+                                                               int32x4_t     voffset_non_broadcast,
+                                                               float32x4_t   vscale_non_broadcast,
+                                                               float32x4_t   voffseto,
+                                                               float32x4_t   invvscaleo,
+                                                               bool          reorder)
 {
     ARM_COMPUTE_UNUSED(voffseto, invvscaleo);
     int x = window_start_x;
-    for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    for (; x <= (window_end_x - window_step_x); x += window_step_x)
     {
-        const float32x4x4_t af = load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
-        const uint32x4x4_t  rf = elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
+        const float32x4x4_t af =
+            load_quantized_signed(non_broadcast_input_ptr + x, voffset_non_broadcast, vscale_non_broadcast);
+        const uint32x4x4_t rf =
+            elementwise_comp_op<op>(reorder ? broadcast_vector : af, reorder ? af : broadcast_vector);
         store_quantized(output_ptr + x, rf);
     }
     return x;
 }
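
Note that the quantized comparison loops never requantize: voffseto and invvscaleo are marked ARM_COMPUTE_UNUSED, and the uint32 masks are narrowed straight to 0x00/0xFF bytes whichever input type was compared. A minimal sketch of such a narrowing, assuming the masks came from intrinsics like vcgtq_f32:

    #include <arm_neon.h>

    // Narrow eight 32-bit lane masks (two vectors of four) to eight bytes;
    // 0xFFFFFFFF survives the truncating narrow as 0xFF.
    static inline uint8x8_t narrow_masks(uint32x4_t m0, uint32x4_t m1)
    {
        const uint16x8_t m16 = vcombine_u16(vmovn_u32(m0), vmovn_u32(m1));
        return vmovn_u16(m16);
    }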
 
-inline void elementwise_op_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
+inline void elementwise_op_quantized(const ITensor *in1,
+                                     const ITensor *in2,
+                                     ITensor       *out,
+                                     const Window  &window,
                                      uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
-                                     int (*broadcast_func)(int, int, int, const uint8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
-                                                           float32x4_t, float32x4_t, const bool),
-                                     int (*neon_func)(int, int, int, const uint8_t *, const uint8_t *, uint8_t *,
-                                                      int32x4_t, int32x4_t, float32x4_t, float32x4_t,
-                                                      float32x4_t, float32x4_t))
+                                     int (*broadcast_func)(int,
+                                                           int,
+                                                           int,
+                                                           const uint8_t *,
+                                                           float32x4x4_t,
+                                                           uint8_t *,
+                                                           int32x4_t,
+                                                           float32x4_t,
+                                                           float32x4_t,
+                                                           float32x4_t,
+                                                           const bool),
+                                     int (*neon_func)(int,
+                                                      int,
+                                                      int,
+                                                      const uint8_t *,
+                                                      const uint8_t *,
+                                                      uint8_t *,
+                                                      int32x4_t,
+                                                      int32x4_t,
+                                                      float32x4_t,
+                                                      float32x4_t,
+                                                      float32x4_t,
+                                                      float32x4_t))
 {
     // Create input windows
     Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -772,7 +922,7 @@
     const float32x4_t voffseto   = vdupq_n_f32(output_qinfo.offset + 0.5f);
     const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         // Select the broadcast input on the X axis
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
@@ -794,24 +944,28 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());
-
-            const uint8_t       broadcast_value  = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
-            const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);
-
-            int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
-                                      voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
-            for(; x < window_end_x; ++x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const float afs   = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
-                const float bfs   = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
-                *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
-            }
-        },
-        broadcast_input, non_broadcast_input, output);
+                const auto non_broadcast_input_ptr = reinterpret_cast<const uint8_t *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());
+
+                const uint8_t       broadcast_value  = *reinterpret_cast<const uint8_t *>(broadcast_input.ptr());
+                const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_u8(broadcast_value), broadcast_qinfo);
+
+                int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+                                          broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
+                                          voffseto, invvscaleo, !is_broadcast_input_2);
+                for (; x < window_end_x; ++x)
+                {
+                    const float afs   = dequantize_qasymm8(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
+                    const float bfs   = dequantize_qasymm8(broadcast_value, broadcast_qinfo);
+                    *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
+                                                       !is_broadcast_input_2 ? afs : bfs, output_qinfo);
+                }
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -834,32 +988,56 @@
         Iterator input2(in2, input2_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
-            int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
-                                 vscale1, vscale2, voffseto, invvscaleo);
-            for(; x < window_end_x; ++x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const float afs   = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
-                const float bfs   = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
-                *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
-            }
-        },
-        input1, input2, output);
+                const auto input1_ptr = reinterpret_cast<const uint8_t *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const uint8_t *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+                int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
+                                     voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
+                for (; x < window_end_x; ++x)
+                {
+                    const float afs   = dequantize_qasymm8(*(input1_ptr + x), input1_qinfo);
+                    const float bfs   = dequantize_qasymm8(*(input2_ptr + x), input2_qinfo);
+                    *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
+                }
+            },
+            input1, input2, output);
     }
 }
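
All three quantized drivers share the same dispatch shape: the function-pointer loop consumes window_step_x (16) elements per iteration and returns the first index it could not fill, and the driver finishes the ragged tail element-by-element with scalar_func. Stripped of tensors and quantization, the skeleton is (hypothetical names):

    // Skeleton of the vector-body / scalar-tail pattern used above.
    static int vector_body(int start, int end, int step)
    {
        int x = start;
        for (; x <= end - step; x += step)
        {
            // ... process `step` elements with NEON intrinsics ...
        }
        return x; // first unprocessed index
    }

    static void run_row(int start, int end, int step)
    {
        int x = vector_body(start, end, step);
        for (; x < end; ++x)
        {
            // ... process one element with the scalar fallback ...
        }
    }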
 
-inline void elementwise_comp_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-                                              uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
-                                              int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, uint8_t *, int32x4_t, float32x4_t,
-                                                                    float32x4_t, float32x4_t, const bool),
-                                              int (*neon_func)(int, int, int, const int8_t *, const int8_t *, uint8_t *,
-                                                               int32x4_t, int32x4_t, float32x4_t, float32x4_t,
-                                                               float32x4_t, float32x4_t))
+inline void
+elementwise_comp_quantized_signed(const ITensor *in1,
+                                  const ITensor *in2,
+                                  ITensor       *out,
+                                  const Window  &window,
+                                  uint8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+                                  int (*broadcast_func)(int,
+                                                        int,
+                                                        int,
+                                                        const int8_t *,
+                                                        float32x4x4_t,
+                                                        uint8_t *,
+                                                        int32x4_t,
+                                                        float32x4_t,
+                                                        float32x4_t,
+                                                        float32x4_t,
+                                                        const bool),
+                                  int (*neon_func)(int,
+                                                   int,
+                                                   int,
+                                                   const int8_t *,
+                                                   const int8_t *,
+                                                   uint8_t *,
+                                                   int32x4_t,
+                                                   int32x4_t,
+                                                   float32x4_t,
+                                                   float32x4_t,
+                                                   float32x4_t,
+                                                   float32x4_t))
 {
     // Create input windows
     Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -879,7 +1057,7 @@
     const float32x4_t voffseto   = vdupq_n_f32(output_qinfo.offset);
     const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         // Select the broadcast input on the X axis
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
@@ -901,24 +1079,28 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());
-
-            const int8_t        broadcast_value  = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
-            const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
-
-            int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
-                                      voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
-            for(; x < window_end_x; ++x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const float afs   = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
-                const float bfs   = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
-                *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
-            }
-        },
-        broadcast_input, non_broadcast_input, output);
+                const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<uint8_t *>(output.ptr());
+
+                const int8_t        broadcast_value  = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+                const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
+
+                int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+                                          broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
+                                          voffseto, invvscaleo, !is_broadcast_input_2);
+                for (; x < window_end_x; ++x)
+                {
+                    const float afs   = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
+                    const float bfs   = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
+                    *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
+                                                       !is_broadcast_input_2 ? afs : bfs, output_qinfo);
+                }
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -941,32 +1123,56 @@
         Iterator input2(in2, input2_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
-
-            int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
-                                 vscale1, vscale2, voffseto, invvscaleo);
-            for(; x < window_end_x; ++x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const float afs   = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
-                const float bfs   = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
-                *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
-            }
-        },
-        input1, input2, output);
+                const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());
+
+                int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
+                                     voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
+                for (; x < window_end_x; ++x)
+                {
+                    const float afs   = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
+                    const float bfs   = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
+                    *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
+                }
+            },
+            input1, input2, output);
     }
 }
 
-inline void elementwise_op_quantized_signed(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-                                            int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
-                                            int (*broadcast_func)(int, int, int, const int8_t *, float32x4x4_t, int8_t *, int32x4_t, float32x4_t,
-                                                                  float32x4_t, float32x4_t, const bool),
-                                            int (*neon_func)(int, int, int, const int8_t *, const int8_t *, int8_t *,
-                                                             int32x4_t, int32x4_t, float32x4_t, float32x4_t,
-                                                             float32x4_t, float32x4_t))
+inline void
+elementwise_op_quantized_signed(const ITensor *in1,
+                                const ITensor *in2,
+                                ITensor       *out,
+                                const Window  &window,
+                                int8_t (*scalar_func)(const float &, const float &, UniformQuantizationInfo),
+                                int (*broadcast_func)(int,
+                                                      int,
+                                                      int,
+                                                      const int8_t *,
+                                                      float32x4x4_t,
+                                                      int8_t *,
+                                                      int32x4_t,
+                                                      float32x4_t,
+                                                      float32x4_t,
+                                                      float32x4_t,
+                                                      const bool),
+                                int (*neon_func)(int,
+                                                 int,
+                                                 int,
+                                                 const int8_t *,
+                                                 const int8_t *,
+                                                 int8_t *,
+                                                 int32x4_t,
+                                                 int32x4_t,
+                                                 float32x4_t,
+                                                 float32x4_t,
+                                                 float32x4_t,
+                                                 float32x4_t))
 {
     // Create input windows
     Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape());
@@ -986,7 +1192,7 @@
     const float32x4_t voffseto   = vdupq_n_f32(output_qinfo.offset);
     const float32x4_t invvscaleo = vdupq_n_f32(1.f / output_qinfo.scale);
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         // Select the broadcast input on the X axis
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
@@ -1008,24 +1214,28 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<int8_t *>(output.ptr());
-
-            const int8_t        broadcast_value  = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
-            const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
-
-            int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr, broadcast_vector, output_ptr,
-                                      voffset_non_broadcast, vscale_non_broadcast, voffseto, invvscaleo, !is_broadcast_input_2);
-            for(; x < window_end_x; ++x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const float afs   = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
-                const float bfs   = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
-                *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs, !is_broadcast_input_2 ? afs : bfs, output_qinfo);
-            }
-        },
-        broadcast_input, non_broadcast_input, output);
+                const auto non_broadcast_input_ptr = reinterpret_cast<const int8_t *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<int8_t *>(output.ptr());
+
+                const int8_t        broadcast_value  = *reinterpret_cast<const int8_t *>(broadcast_input.ptr());
+                const float32x4x4_t broadcast_vector = vdequantize(vdupq_n_s8(broadcast_value), broadcast_qinfo);
+
+                int x = (*broadcast_func)(window_start_x, window_end_x, window_step_x, non_broadcast_input_ptr,
+                                          broadcast_vector, output_ptr, voffset_non_broadcast, vscale_non_broadcast,
+                                          voffseto, invvscaleo, !is_broadcast_input_2);
+                for (; x < window_end_x; ++x)
+                {
+                    const float afs   = dequantize_qasymm8_signed(*(non_broadcast_input_ptr + x), non_broadcast_qinfo);
+                    const float bfs   = dequantize_qasymm8_signed(broadcast_value, broadcast_qinfo);
+                    *(output_ptr + x) = (*scalar_func)(!is_broadcast_input_2 ? bfs : afs,
+                                                       !is_broadcast_input_2 ? afs : bfs, output_qinfo);
+                }
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -1048,22 +1258,24 @@
         Iterator input2(in2, input2_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
-
-            int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr, voffset1, voffset2,
-                                 vscale1, vscale2, voffseto, invvscaleo);
-            for(; x < window_end_x; ++x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const float afs   = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
-                const float bfs   = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
-                *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
-            }
-        },
-        input1, input2, output);
+                const auto input1_ptr = reinterpret_cast<const int8_t *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const int8_t *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());
+
+                int x = (*neon_func)(window_start_x, window_end_x, window_step_x, input1_ptr, input2_ptr, output_ptr,
+                                     voffset1, voffset2, vscale1, vscale2, voffseto, invvscaleo);
+                for (; x < window_end_x; ++x)
+                {
+                    const float afs   = dequantize_qasymm8_signed(*(input1_ptr + x), input1_qinfo);
+                    const float bfs   = dequantize_qasymm8_signed(*(input2_ptr + x), input2_qinfo);
+                    *(output_ptr + x) = (*scalar_func)(afs, bfs, output_qinfo);
+                }
+            },
+            input1, input2, output);
     }
 }
 
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
index c5c528d..09ad13d 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/integer.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
 namespace arm_compute
 {
@@ -33,63 +34,165 @@
     return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int32_t, 4>>(in1, in2, out, window);
 }
 
-template void neon_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+                                                                             const ITensor *in2,
+                                                                             ITensor       *out,
+                                                                             const Window  &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+                                                                      const ITensor *in2,
+                                                                      ITensor       *out,
+                                                                      const Window  &window);
+template void neon_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+                                                                      const ITensor *in2,
+                                                                      ITensor       *out,
+                                                                      const Window  &window);
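
These long runs of explicit instantiations exist because the template bodies live in the included impl.h rather than a public header; every (operation, type) combination the runtime dispatcher can select must be instantiated in this translation unit, or its symbol would be missing at link time. The mechanism in miniature (generic example, not the library's code):

    // definitions visible only in this .cpp
    template <int N>
    int scaled(int v)
    {
        return v * N;
    }

    // explicit instantiations export the symbols callers link against
    template int scaled<2>(int);
    template int scaled<4>(int);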
 
 template <ArithmeticOperation op>
 void neon_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     return elementwise_arithm_op<op, typename wrapper::traits::neon_vector<int16_t, 8>>(in1, in2, out, window);
 }
-template void neon_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+                                                                             const ITensor *in2,
+                                                                             ITensor       *out,
+                                                                             const Window  &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+                                                                      const ITensor *in2,
+                                                                      ITensor       *out,
+                                                                      const Window  &window);
+template void neon_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+                                                                      const ITensor *in2,
+                                                                      ITensor       *out,
+                                                                      const Window  &window);
 
 template <ComparisonOperation op>
 void neon_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     return elementwise_comp_op_8<op, uint8_t, uint8x16_t>(in1, in2, out, window);
 }
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                                const ITensor *in2,
+                                                                                ITensor       *out,
+                                                                                const Window  &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                   const ITensor *in2,
+                                                                                   ITensor       *out,
+                                                                                   const Window  &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                  const ITensor *in2,
+                                                                                  ITensor       *out,
+                                                                                  const Window  &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+                                                                                       const ITensor *in2,
+                                                                                       ITensor       *out,
+                                                                                       const Window  &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void neon_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                    const ITensor *in2,
+                                                                                    ITensor       *out,
+                                                                                    const Window  &window);
 
 template <ComparisonOperation op>
 void neon_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     return elementwise_comp_op_16<op, int16_t, int16x8_t>(in1, in2, out, window);
 }
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                                 const ITensor *in2,
+                                                                                 ITensor       *out,
+                                                                                 const Window  &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                    const ITensor *in2,
+                                                                                    ITensor       *out,
+                                                                                    const Window  &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                   const ITensor *in2,
+                                                                                   ITensor       *out,
+                                                                                   const Window  &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+                                                                                        const ITensor *in2,
+                                                                                        ITensor       *out,
+                                                                                        const Window  &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                                const ITensor *in2,
+                                                                                ITensor       *out,
+                                                                                const Window  &window);
+template void neon_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                     const ITensor *in2,
+                                                                                     ITensor       *out,
+                                                                                     const Window  &window);
 
 template <ComparisonOperation op>
 void neon_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     return elementwise_comp_op_32<op, int32_t, int32x4_t>(in1, in2, out, window);
 }
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-}
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                                 const ITensor *in2,
+                                                                                 ITensor       *out,
+                                                                                 const Window  &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                    const ITensor *in2,
+                                                                                    ITensor       *out,
+                                                                                    const Window  &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                   const ITensor *in2,
+                                                                                   ITensor       *out,
+                                                                                   const Window  &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+                                                                                        const ITensor *in2,
+                                                                                        ITensor       *out,
+                                                                                        const Window  &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                                const ITensor *in2,
+                                                                                ITensor       *out,
+                                                                                const Window  &window);
+template void neon_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                     const ITensor *in2,
+                                                                                     ITensor       *out,
+                                                                                     const Window  &window);
+} // namespace cpu
 } // namespace arm_compute
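Aside on the pattern being reflowed above: these kernels take the operation as a template parameter and emit one explicit instantiation per enum value, which keeps the template bodies out of the headers and limits each translation unit to the operations it exports. A minimal sketch of the idiom, with hypothetical names (run_kernel and apply_op are illustrative and not part of this change; ITensor and Window are the library's types):

    // kernel.cpp -- the template body lives here, not in the header.
    template <ArithmeticOperation op>
    void run_kernel(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
    {
        apply_op<op>(in1, in2, out, window); // assumed helper doing the element loop
    }
    // Explicit instantiations: these are the only specializations this TU emits,
    // and the only ones other TUs may link against.
    template void run_kernel<ArithmeticOperation::ADD>(const ITensor *, const ITensor *, ITensor *, const Window &);
    template void run_kernel<ArithmeticOperation::SUB>(const ITensor *, const ITensor *, ITensor *, const Window &);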
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
index fa8e087..d891f70 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
 namespace arm_compute
 {
@@ -33,27 +34,72 @@
     return elementwise_arithm_op_quantized<op>(in1, in2, out, window);
 }
 
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+                                                                        const ITensor *in2,
+                                                                        ITensor       *out,
+                                                                        const Window  &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+                                                                        const ITensor *in2,
+                                                                        ITensor       *out,
+                                                                        const Window  &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+                                                                        const ITensor *in2,
+                                                                        ITensor       *out,
+                                                                        const Window  &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+                                                                        const ITensor *in2,
+                                                                        ITensor       *out,
+                                                                        const Window  &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+                                                                        const ITensor *in2,
+                                                                        ITensor       *out,
+                                                                        const Window  &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+                                                                                 const ITensor *in2,
+                                                                                 ITensor       *out,
+                                                                                 const Window  &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+                                                                          const ITensor *in2,
+                                                                          ITensor       *out,
+                                                                          const Window  &window);
+template void neon_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+                                                                          const ITensor *in2,
+                                                                          ITensor       *out,
+                                                                          const Window  &window);
 
 template <ComparisonOperation op>
-void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void neon_qasymm8_comparison_elementwise_binary(const ITensor *in1,
+                                                const ITensor *in2,
+                                                ITensor       *out,
+                                                const Window  &window)
 {
     return elementwise_comp_op_quantized<op>(in1, in2, out, window);
 }
 
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                                     const ITensor *in2,
+                                                                                     ITensor       *out,
+                                                                                     const Window  &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                        const ITensor *in2,
+                                                                                        ITensor       *out,
+                                                                                        const Window  &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                       const ITensor *in2,
+                                                                                       ITensor       *out,
+                                                                                       const Window  &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+                                                                                            const ITensor *in2,
+                                                                                            ITensor       *out,
+                                                                                            const Window  &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                                    const ITensor *in2,
+                                                                                    ITensor       *out,
+                                                                                    const Window  &window);
+template void neon_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                         const ITensor *in2,
+                                                                                         ITensor       *out,
+                                                                                         const Window  &window);
 
 } // namespace cpu
 } // namespace arm_compute
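A note on why the instantiations now wrap one parameter per line with the '*' and '&' aligned into a column: this is what clang-format produces when parameters are not bin-packed, consecutive declarations are aligned, and pointers bind right. The actual configuration file is not part of this delivery, so the option names below are inferred from the output rather than quoted from it:

    // Before: a single line exceeding the column limit.
    template void f<Op::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);

    // After: BinPackParameters: false puts one parameter per line once the call
    // must wrap; AlignConsecutiveDeclarations plus PointerAlignment: Right line
    // up the '*'/'&' column (option names inferred, see above).
    template void f<Op::ADD>(const ITensor *in1,
                             const ITensor *in2,
                             ITensor       *out,
                             const Window  &window);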
diff --git a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
index abfdf93..b1f8e01 100644
--- a/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/neon/qasymm8_signed.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_binary/generic/neon/impl.h"
 
 namespace arm_compute
@@ -34,27 +35,70 @@
     return elementwise_arithm_op_quantized_signed<op>(in1, in2, out, window);
 }
 
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+                                                                                        const ITensor *in2,
+                                                                                        ITensor       *out,
+                                                                                        const Window  &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+                                                                                 const ITensor *in2,
+                                                                                 ITensor       *out,
+                                                                                 const Window  &window);
+template void neon_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+                                                                                 const ITensor *in2,
+                                                                                 ITensor       *out,
+                                                                                 const Window  &window);
 
 template <ComparisonOperation op>
-void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void neon_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1,
+                                                       const ITensor *in2,
+                                                       ITensor       *out,
+                                                       const Window  &window)
 {
     return elementwise_comp_op_quantized_signed<op>(in1, in2, out, window);
 }
 
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                                            const ITensor *in2,
+                                                                                            ITensor       *out,
+                                                                                            const Window  &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                               const ITensor *in2,
+                                                                                               ITensor       *out,
+                                                                                               const Window  &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                              const ITensor *in2,
+                                                                                              ITensor       *out,
+                                                                                              const Window  &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(
+    const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                                           const ITensor *in2,
+                                                                                           ITensor       *out,
+                                                                                           const Window  &window);
+template void neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                                const ITensor *in2,
+                                                                                                ITensor       *out,
+                                                                                                const Window  &window);
 
 } // namespace cpu
 } // namespace arm_compute
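The per-operation instantiations exist because the operation is a compile-time template argument; run-time selection then reduces to picking one of these concrete symbols. (Note also that the GreaterEqual instantiation above breaks after the opening parenthesis instead: the name is long enough that even one parameter per line would overflow the column limit, so clang-format falls back to a plain continuation indent.) A hypothetical sketch of such a lookup, where select_comparison_kernel and its switch are illustrative and not ACL's actual operator registry:

    using KernelFn = void (*)(const ITensor *, const ITensor *, ITensor *, const Window &);

    // Map a run-time enum onto one of the explicitly instantiated kernels above.
    KernelFn select_comparison_kernel(ComparisonOperation op)
    {
        switch (op)
        {
            case ComparisonOperation::Equal:
                return &neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>;
            case ComparisonOperation::Greater:
                return &neon_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>;
            default:
                return nullptr; // remaining cases omitted in this sketch
        }
    }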
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
index 8522435..600c7f1 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp16.cpp
@@ -25,6 +25,7 @@
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
 namespace arm_compute
 {
@@ -36,14 +37,38 @@
     return elementwise_arithmetic_op<float16_t>(in1, in2, out, op, window);
 }
 
-template void sve_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+                                                                             const ITensor *in2,
+                                                                             ITensor       *out,
+                                                                             const Window  &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+                                                                      const ITensor *in2,
+                                                                      ITensor       *out,
+                                                                      const Window  &window);
+template void sve_fp16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+                                                                      const ITensor *in2,
+                                                                      ITensor       *out,
+                                                                      const Window  &window);
 
 template <ComparisonOperation op>
 void sve_fp16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
@@ -51,14 +76,32 @@
     return elementwise_comparison_op<float16_t>(in1, in2, out, op, window);
 }
 
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                                 const ITensor *in2,
+                                                                                 ITensor       *out,
+                                                                                 const Window  &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                    const ITensor *in2,
+                                                                                    ITensor       *out,
+                                                                                    const Window  &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                   const ITensor *in2,
+                                                                                   ITensor       *out,
+                                                                                   const Window  &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+                                                                                        const ITensor *in2,
+                                                                                        ITensor       *out,
+                                                                                        const Window  &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                                const ITensor *in2,
+                                                                                ITensor       *out,
+                                                                                const Window  &window);
+template void sve_fp16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                     const ITensor *in2,
+                                                                                     ITensor       *out,
+                                                                                     const Window  &window);
 
 } // namespace cpu
 } // namespace arm_compute
 
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
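Besides reflowing the signatures, the fp16.cpp hunk above also restores the newline at end of file (note the '\ No newline at end of file' marker on the old side); the reformatted file now ends with one. The guard it preserves is the usual double condition: the compiler must support FP16 vector arithmetic and the build must opt in. A minimal sketch of the same pattern (add_f16 is illustrative; ENABLE_FP16_KERNELS is the library's own build flag):

    #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
    #include <arm_neon.h>

    // Compiled only when the target has half-precision vector arithmetic
    // (e.g. -march=armv8.2-a+fp16) and the build enables FP16 kernels.
    float16x8_t add_f16(float16x8_t a, float16x8_t b)
    {
        return vaddq_f16(a, b);
    }
    #endif // otherwise this translation unit is intentionally empty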
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
index 2b479f7..832a966 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/fp32.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
 namespace arm_compute
 {
@@ -34,26 +35,68 @@
     return elementwise_arithmetic_op<float32_t>(in1, in2, out, op, window);
 }
 
-template void sve_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+                                                                    const ITensor *in2,
+                                                                    ITensor       *out,
+                                                                    const Window  &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+                                                                             const ITensor *in2,
+                                                                             ITensor       *out,
+                                                                             const Window  &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+                                                                      const ITensor *in2,
+                                                                      ITensor       *out,
+                                                                      const Window  &window);
+template void sve_fp32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+                                                                      const ITensor *in2,
+                                                                      ITensor       *out,
+                                                                      const Window  &window);
 
 template <ComparisonOperation op>
 void sve_fp32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     return elementwise_comparison_op<float>(in1, in2, out, op, window);
 }
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                                 const ITensor *in2,
+                                                                                 ITensor       *out,
+                                                                                 const Window  &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                    const ITensor *in2,
+                                                                                    ITensor       *out,
+                                                                                    const Window  &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                   const ITensor *in2,
+                                                                                   ITensor       *out,
+                                                                                   const Window  &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+                                                                                        const ITensor *in2,
+                                                                                        ITensor       *out,
+                                                                                        const Window  &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                                const ITensor *in2,
+                                                                                ITensor       *out,
+                                                                                const Window  &window);
+template void sve_fp32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                     const ITensor *in2,
+                                                                                     ITensor       *out,
+                                                                                     const Window  &window);
 
 } // namespace cpu
 } // namespace arm_compute
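The impl.cpp hunks that follow reflow the SVE loops these fp16/fp32 entry points call into. Their core idiom is the whilelt-governed loop: the predicate pg covers exactly the lanes still in range, so the same body handles full and partial vectors and no scalar tail loop is needed. A standalone sketch of the pattern in plain ACLE intrinsics (add_f32 is illustrative; assumes an SVE-enabled toolchain, e.g. -march=armv8-a+sve):

    #include <arm_sve.h>

    // c[i] = a[i] + b[i] for i in [0, n).
    void add_f32(const float *a, const float *b, float *c, int n)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b32(x, n); // lanes with x + lane < n are active
        do
        {
            const svfloat32_t va = svld1(pg, a + x); // inactive lanes read as zero
            const svfloat32_t vb = svld1(pg, b + x);
            svst1(pg, c + x, svadd_m(pg, va, vb)); // store active lanes only
            x += static_cast<int>(svcntw());       // 32-bit lanes per vector
            pg = svwhilelt_b32(x, n);
        } while (svptest_any(svptrue_b32(), pg)); // loop while any lane active
    }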
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
index c0515f2..fa48407 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.cpp
@@ -23,7 +23,9 @@
  */
 
 #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
+
 #include "src/core/NEON/SVEMath.h"
+
 #include <arm_sve.h>
 
 namespace arm_compute
@@ -33,7 +35,8 @@
 using namespace arm_compute::wrapper;
 
 template <typename ScalarType>
-void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
+void elementwise_arithmetic_op(
+    const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
 {
     using VectorType = typename sve_vector<ScalarType>::type;
 
@@ -51,7 +54,7 @@
     const auto window_end_x          = static_cast<int>(window.x().end());
     const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -66,37 +69,40 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            auto             output_ptr              = reinterpret_cast<ScalarType *>(output.ptr());
-            const auto       non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
-            const ScalarType broadcast_value         = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
-            const auto       broadcast_vector        = svdup_n(broadcast_value);
-
-            int x = window_start_x;
-
-            svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
-                VectorType res{};
+                auto       output_ptr              = reinterpret_cast<ScalarType *>(output.ptr());
+                const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+                const ScalarType broadcast_value   = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+                const auto       broadcast_vector  = svdup_n(broadcast_value);
 
-                if(is_broadcast_input_2)
-                {
-                    res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, non_broadcast_vector, broadcast_vector, op);
-                }
-                else
-                {
-                    res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, broadcast_vector, non_broadcast_vector, op);
-                }
-                svst1(pg, output_ptr + x, res);
+                int x = window_start_x;
 
-                x += svcnt<ScalarType>();
-                pg = svwhilelt<ScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        broadcast_input, non_broadcast_input, output);
+                svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
+                do
+                {
+                    const auto non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
+                    VectorType res{};
+
+                    if (is_broadcast_input_2)
+                    {
+                        res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, non_broadcast_vector,
+                                                                                               broadcast_vector, op);
+                    }
+                    else
+                    {
+                        res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(
+                            pg, broadcast_vector, non_broadcast_vector, op);
+                    }
+                    svst1(pg, output_ptr + x, res);
+
+                    x += svcnt<ScalarType>();
+                    pg = svwhilelt<ScalarType>(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -108,39 +114,46 @@
         Iterator input2(in2, input2_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            auto       output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-            const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
-
-            int x = window_start_x;
-
-            svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto in1 = svld1(pg, input1_ptr + x);
-                const auto in2 = svld1(pg, input2_ptr + x);
-                const auto res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, in1, in2, op);
-                svst1(pg, output_ptr + x, res);
+                auto       output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+                const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
 
-                x += svcnt<ScalarType>();
-                pg = svwhilelt<ScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        input1, input2, output);
+                int x = window_start_x;
+
+                svbool_t pg = svwhilelt<ScalarType>(x, window_end_x);
+                do
+                {
+                    const auto in1 = svld1(pg, input1_ptr + x);
+                    const auto in2 = svld1(pg, input2_ptr + x);
+                    const auto res = elementwise_arithmetic_op<typename sve_vector<ScalarType>::type>(pg, in1, in2, op);
+                    svst1(pg, output_ptr + x, res);
+
+                    x += svcnt<ScalarType>();
+                    pg = svwhilelt<ScalarType>(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            input1, input2, output);
     }
 }
-template void elementwise_arithmetic_op<float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
-template void elementwise_arithmetic_op<float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
-template void elementwise_arithmetic_op<int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
-template void elementwise_arithmetic_op<int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<float32_t>(
+    const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<float16_t>(
+    const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<int16_t>(
+    const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
+template void elementwise_arithmetic_op<int32_t>(
+    const ITensor *in1, const ITensor *in2, ITensor *out, const ArithmeticOperation op, const Window &window);
 
 template <typename InputScalarType, typename OutputScalarType>
-void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
+void elementwise_comparison_op(
+    const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
 {
-    static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
+    static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType),
+                  "input data type's width should be equal to or greater than output data type's width");
 
     using OutputVectorType = typename sve_vector<OutputScalarType>::type;
     const auto all_true_pg = svptrue<InputScalarType>();
@@ -157,7 +170,7 @@
     const auto window_end_x          = static_cast<int>(window.x().end());
     const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -172,37 +185,44 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            auto                  output_ptr              = reinterpret_cast<OutputScalarType *>(output.ptr());
-            const auto            non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
-            const InputScalarType broadcast_value         = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-            const auto            broadcast_vector        = svdup_n(broadcast_value);
-
-            int x = window_start_x;
-
-            svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto       non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
-                const svbool_t   output_pg            = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
-                OutputVectorType res{};
-                if(is_broadcast_input_2)
-                {
-                    res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, non_broadcast_vector, broadcast_vector, op);
-                }
-                else
-                {
-                    res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, broadcast_vector, non_broadcast_vector, op);
-                }
-                svst1(output_pg, output_ptr + x, res);
+                auto       output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+                const auto non_broadcast_input_ptr =
+                    reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+                const InputScalarType broadcast_value =
+                    *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+                const auto broadcast_vector = svdup_n(broadcast_value);
 
-                x += svcnt<InputScalarType>();
-                pg = svwhilelt<InputScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        broadcast_input, non_broadcast_input, output);
+                int x = window_start_x;
+
+                svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
+                do
+                {
+                    const auto       non_broadcast_vector = svld1(pg, non_broadcast_input_ptr + x);
+                    const svbool_t   output_pg            = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
+                    OutputVectorType res{};
+                    if (is_broadcast_input_2)
+                    {
+                        res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type,
+                                                        typename sve_vector<OutputScalarType>::type>(
+                            pg, non_broadcast_vector, broadcast_vector, op);
+                    }
+                    else
+                    {
+                        res = elementwise_comparison_op<typename sve_vector<InputScalarType>::type,
+                                                        typename sve_vector<OutputScalarType>::type>(
+                            pg, broadcast_vector, non_broadcast_vector, op);
+                    }
+                    svst1(output_pg, output_ptr + x, res);
+
+                    x += svcnt<InputScalarType>();
+                    pg = svwhilelt<InputScalarType>(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -214,37 +234,45 @@
         Iterator input2(in2, input2_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            auto       output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
-            const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
-            int x = window_start_x;
-
-            svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto     in1       = svld1(pg, input1_ptr + x);
-                const auto     in2       = svld1(pg, input2_ptr + x);
-                const auto     res       = elementwise_comparison_op<typename sve_vector<InputScalarType>::type, typename sve_vector<OutputScalarType>::type>(pg, in1, in2, op);
-                const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
-                svst1(output_pg, output_ptr + x, res);
+                auto       output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+                const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
 
-                x += svcnt<InputScalarType>();
-                pg = svwhilelt<InputScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        input1, input2, output);
+                int x = window_start_x;
+
+                svbool_t pg = svwhilelt<InputScalarType>(x, window_end_x);
+                do
+                {
+                    const auto in1 = svld1(pg, input1_ptr + x);
+                    const auto in2 = svld1(pg, input2_ptr + x);
+                    const auto res =
+                        elementwise_comparison_op<typename sve_vector<InputScalarType>::type,
+                                                  typename sve_vector<OutputScalarType>::type>(pg, in1, in2, op);
+                    const svbool_t output_pg = narrow_to_byte_predicate<sizeof(InputScalarType)>(pg);
+                    svst1(output_pg, output_ptr + x, res);
+
+                    x += svcnt<InputScalarType>();
+                    pg = svwhilelt<InputScalarType>(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            input1, input2, output);
     }
 }
 
-template void elementwise_comparison_op<float32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
-template void elementwise_comparison_op<float16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
-template void elementwise_comparison_op<uint8_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
-template void elementwise_comparison_op<int16_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
-template void elementwise_comparison_op<int32_t>(const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<float32_t>(
+    const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<float16_t>(
+    const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<uint8_t>(
+    const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<int16_t>(
+    const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
+template void elementwise_comparison_op<int32_t>(
+    const ITensor *in1, const ITensor *in2, ITensor *out, const ComparisonOperation op, const Window &window);
 
 template <>
 svint32_t elementwise_pow<svint32_t>(svbool_t &pg, const svint32_t &a, const svint32_t &b)
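The lambdas reflowed above all share one SVE tail-handling idiom: build a while-lt predicate, process one vector of lanes, advance by the runtime vector length, and loop while any lane is still active. A standalone sketch of that idiom, simplified to float32 addition with hypothetical names (assumes an SVE-enabled toolchain and <arm_sve.h>; it is an illustration, not code from this patch):

    #include <arm_sve.h>

    // Predicated do/while loop over n floats, mirroring the kernels above.
    void add_arrays(const float *a, const float *b, float *dst, int n)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b32(x, n); // lanes [x, n) are active this iteration
        do
        {
            const svfloat32_t va = svld1(pg, a + x); // predicated loads never touch
            const svfloat32_t vb = svld1(pg, b + x); // lanes at or beyond n
            svst1(pg, dst + x, svadd_z(pg, va, vb));

            x += static_cast<int>(svcntw()); // advance by the hardware vector length
            pg = svwhilelt_b32(x, n);        // recompute the predicate for the tail
        } while (svptest_any(svptrue_b32(), pg));
    }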
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
index 860c50a..4c61b9f 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/impl.h
@@ -25,6 +25,7 @@
 #define SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 #include "src/core/NEON/wrapper/svtraits.h"
 
@@ -51,7 +52,7 @@
 {
     const auto all_false = svpfalse();
 
-    switch(bytewidth)
+    switch (bytewidth)
     {
         case 8:
             pg = svuzp1_b32(pg, all_false);
@@ -74,7 +75,7 @@
     using ScalarType = typename wrapper::sve_scalar<VectorType>::type;
     VectorType res{};
 
-    switch(op)
+    switch (op)
     {
         case ArithmeticOperation::MAX:
             res = svmax_z(pg, a, b);
@@ -114,11 +115,12 @@
 }
 
 template <typename InputVectorType, typename OutputVectorType>
-OutputVectorType elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op)
+OutputVectorType
+elementwise_comparison_op(svbool_t &pg, const InputVectorType &a, const InputVectorType &b, ComparisonOperation op)
 {
     svbool_t selection_vector{};
 
-    switch(op)
+    switch (op)
     {
         case ComparisonOperation::Equal:
             selection_vector = svcmpeq(pg, a, b);
@@ -154,10 +156,12 @@
 }
 
 template <typename ScalarType>
-void elementwise_arithmetic_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window);
+void elementwise_arithmetic_op(
+    const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window);
 
 template <typename ScalarType, typename OutputScalarType = uint8_t>
-void elementwise_comparison_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window);
+void elementwise_comparison_op(
+    const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window);
 } // namespace cpu
 } // namespace arm_compute
 #endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_LIST_H */
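The configuration file that produced these changes is not included here, but the hunks imply its key options. The fragment below is an inferred sketch only, reconstructed from the observed output (120-column limit, Allman braces, a space after control-flow keywords, right-aligned pointers, aligned consecutive declarations and assignments, braced lists without inner padding, and one parameter per line when a declaration wraps); the actual option values may differ.

    # Inferred sketch, not the shipped configuration
    Language:                     Cpp
    ColumnLimit:                  120
    BreakBeforeBraces:            Allman
    SpaceBeforeParens:            ControlStatements
    PointerAlignment:             Right
    AlignConsecutiveDeclarations: true
    AlignConsecutiveAssignments:  true
    Cpp11BracedListStyle:         true
    BinPackArguments:             true
    BinPackParameters:            false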
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
index c313fc6..f7714ff 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve/integer.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_binary/generic/sve/impl.h"
 namespace arm_compute
 {
@@ -33,64 +34,166 @@
 {
     return elementwise_arithmetic_op<int32_t>(in1, in2, out, op, window);
 }
-template void sve_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+                                                                   const ITensor *in2,
+                                                                   ITensor       *out,
+                                                                   const Window  &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+                                                                   const ITensor *in2,
+                                                                   ITensor       *out,
+                                                                   const Window  &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+                                                                   const ITensor *in2,
+                                                                   ITensor       *out,
+                                                                   const Window  &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+                                                                   const ITensor *in2,
+                                                                   ITensor       *out,
+                                                                   const Window  &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+                                                                   const ITensor *in2,
+                                                                   ITensor       *out,
+                                                                   const Window  &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+                                                                            const ITensor *in2,
+                                                                            ITensor       *out,
+                                                                            const Window  &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
+template void sve_s32_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
 
 template <ArithmeticOperation op>
 void sve_s16_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     return elementwise_arithmetic_op<int16_t>(in1, in2, out, op, window);
 }
-template void sve_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+                                                                   const ITensor *in2,
+                                                                   ITensor       *out,
+                                                                   const Window  &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+                                                                   const ITensor *in2,
+                                                                   ITensor       *out,
+                                                                   const Window  &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+                                                                   const ITensor *in2,
+                                                                   ITensor       *out,
+                                                                   const Window  &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+                                                                   const ITensor *in2,
+                                                                   ITensor       *out,
+                                                                   const Window  &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+                                                                   const ITensor *in2,
+                                                                   ITensor       *out,
+                                                                   const Window  &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+                                                                            const ITensor *in2,
+                                                                            ITensor       *out,
+                                                                            const Window  &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
+template void sve_s16_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+                                                                     const ITensor *in2,
+                                                                     ITensor       *out,
+                                                                     const Window  &window);
 
 template <ComparisonOperation op>
 void sve_u8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     return elementwise_comparison_op<uint8_t>(in1, in2, out, op, window);
 }
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                  const ITensor *in2,
+                                                                                  ITensor       *out,
+                                                                                  const Window  &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                 const ITensor *in2,
+                                                                                 ITensor       *out,
+                                                                                 const Window  &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+                                                                                      const ITensor *in2,
+                                                                                      ITensor       *out,
+                                                                                      const Window  &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                              const ITensor *in2,
+                                                                              ITensor       *out,
+                                                                              const Window  &window);
+template void sve_u8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                   const ITensor *in2,
+                                                                                   ITensor       *out,
+                                                                                   const Window  &window);
 
 template <ComparisonOperation op>
 void sve_s16_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     return elementwise_comparison_op<int16_t>(in1, in2, out, op, window);
 }
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                                const ITensor *in2,
+                                                                                ITensor       *out,
+                                                                                const Window  &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                   const ITensor *in2,
+                                                                                   ITensor       *out,
+                                                                                   const Window  &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                  const ITensor *in2,
+                                                                                  ITensor       *out,
+                                                                                  const Window  &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+                                                                                       const ITensor *in2,
+                                                                                       ITensor       *out,
+                                                                                       const Window  &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void sve_s16_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                    const ITensor *in2,
+                                                                                    ITensor       *out,
+                                                                                    const Window  &window);
 
 template <ComparisonOperation op>
 void sve_s32_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     return elementwise_comparison_op<int32_t>(in1, in2, out, op, window);
 }
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                                const ITensor *in2,
+                                                                                ITensor       *out,
+                                                                                const Window  &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                   const ITensor *in2,
+                                                                                   ITensor       *out,
+                                                                                   const Window  &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                  const ITensor *in2,
+                                                                                  ITensor       *out,
+                                                                                  const Window  &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+                                                                                       const ITensor *in2,
+                                                                                       ITensor       *out,
+                                                                                       const Window  &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void sve_s32_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                    const ITensor *in2,
+                                                                                    ITensor       *out,
+                                                                                    const Window  &window);
 
 } // namespace cpu
 } // namespace arm_compute
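Each reflowed line above is an explicit instantiation definition: it forces the compiler to emit a concrete copy of the kernel template for every supported operation in this translation unit, so other files only need the declaration. The pattern in miniature, with hypothetical names:

    template <int Op>
    void binary_kernel(const int *in1, const int *in2, int *out, int n)
    {
        for (int i = 0; i < n; ++i)
            out[i] = (Op == 0) ? in1[i] + in2[i] : in1[i] - in2[i];
    }

    // Emit the concrete symbols other translation units will link against.
    template void binary_kernel<0>(const int *in1, const int *in2, int *out, int n);
    template void binary_kernel<1>(const int *in1, const int *in2, int *out, int n);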
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
index 41e0ac7..7c6015d 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h
@@ -35,19 +35,14 @@
 {
     auto x = svld1(pg, ptr);
 
-    const auto widened = svcreate4(
-                             svmovlb(svmovlb(x)),
-                             svmovlt(svmovlb(x)),
-                             svmovlb(svmovlt(x)),
-                             svmovlt(svmovlt(x)));
+    const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x)));
 
     pg = svptrue_b8();
 
-    return svcreate4(
-               svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale),
-               svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale),
-               svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale),
-               svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale));
+    return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 0), offset)), scale),
+                     svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 1), offset)), scale),
+                     svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 2), offset)), scale),
+                     svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svget4(widened, 3), offset)), scale));
 }
 
 inline svfloat32x4_t load_quantized(const uint8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale)
@@ -56,28 +51,24 @@
 
     //vprint(x);
 
-    const auto widened = svcreate4(
-                             svmovlb(svmovlb(x)),
-                             svmovlt(svmovlb(x)),
-                             svmovlb(svmovlt(x)),
-                             svmovlt(svmovlt(x)));
+    const auto widened = svcreate4(svmovlb(svmovlb(x)), svmovlt(svmovlb(x)), svmovlb(svmovlt(x)), svmovlt(svmovlt(x)));
 
     pg = svptrue_b8();
 
-    return svcreate4(
-               svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale),
-               svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale),
-               svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale),
-               svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale));
+    return svcreate4(svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 0)), offset)), scale),
+                     svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 1)), offset)), scale),
+                     svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 2)), offset)), scale),
+                     svmul_z(pg, svcvt_f32_z(pg, svsub_z(pg, svreinterpret_s32(svget4(widened, 3)), offset)), scale));
 }
 
-inline void store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
+inline void
+store_quantized(uint8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
 {
-    const auto quantized = svcreate4(
-                               svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
-                               svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
-                               svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
-                               svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
+    const auto quantized =
+        svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
+                  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
+                  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
+                  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
 
     const auto narrowed_bottom = svqxtunt(svqxtunb(svget4(quantized, 0)), svget4(quantized, 1));
     const auto narrowed_top    = svqxtunt(svqxtunb(svget4(quantized, 2)), svget4(quantized, 3));
@@ -85,13 +76,14 @@
     svst1(pg, ptr, narrowed);
 }
 
-inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
+inline void
+store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svint32_t &offset, const svfloat32_t &inv_scale)
 {
-    const auto quantized = svcreate4(
-                               svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
-                               svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
-                               svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
-                               svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
+    const auto quantized =
+        svcreate4(svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 0), inv_scale))), offset),
+                  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 1), inv_scale))), offset),
+                  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 2), inv_scale))), offset),
+                  svadd_z(pg, svcvt_s32_z(pg, svrinta_z(pg, svmul_z(pg, svget4(data, 3), inv_scale))), offset));
 
     const auto narrowed_bottom = svqxtnt(svqxtnb(svget4(quantized, 0)), svget4(quantized, 1));
     const auto narrowed_top    = svqxtnt(svqxtnb(svget4(quantized, 2)), svget4(quantized, 3));
@@ -101,7 +93,8 @@
 }
 
 template <typename ScalarType>
-void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
+void elementwise_arithmetic_quantized_op(
+    const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window)
 {
     const auto all_true_pg = wrapper::svptrue<ScalarType>();
 
@@ -120,7 +113,7 @@
     const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset);
     const auto output_vscale  = svdup_n(1.f / out->info()->quantization_info().uniform().scale);
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -128,8 +121,10 @@
         const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
         const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
 
-        const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
-        const auto broadcast_qinfo     = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
+        const auto non_broadcast_qinfo =
+            is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
+        const auto broadcast_qinfo =
+            is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
 
         const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset);
         const auto non_broadcast_vscale  = svdup_n(non_broadcast_qinfo.uniform().scale);
@@ -141,48 +136,52 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            auto             output_ptr              = reinterpret_cast<ScalarType *>(output.ptr());
-            const auto       non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
-            const ScalarType broadcast_value         = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
-            const float      broadcast_value_f       = Qasymm8QuantizationHelper<ScalarType>::dequantize(broadcast_value, broadcast_qinfo);
-            const auto       in2                     = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
-
-            int x = window_start_x;
-
-            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
+                auto       output_ptr              = reinterpret_cast<ScalarType *>(output.ptr());
+                const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr());
+                const ScalarType broadcast_value   = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr());
+                const float      broadcast_value_f =
+                    Qasymm8QuantizationHelper<ScalarType>::dequantize(broadcast_value, broadcast_qinfo);
+                const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f),
+                                           svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
 
-                svfloat32x4_t result{};
+                int x = window_start_x;
 
-                if(!is_broadcast_input_2)
+                svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+                do
                 {
-                    result = svcreate4(
-                                 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 0), svget4(in1, 0), op),
-                                 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 1), svget4(in1, 1), op),
-                                 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 2), svget4(in1, 2), op),
-                                 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 3), svget4(in1, 3), op));
-                }
-                else
-                {
-                    result = svcreate4(
-                                 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
-                                 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
-                                 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
-                                 elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
-                }
+                    const auto in1 =
+                        load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
 
-                store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
+                    svfloat32x4_t result{};
 
-                x += wrapper::svcnt<ScalarType>();
-                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        broadcast_input, non_broadcast_input, output);
+                    if (!is_broadcast_input_2)
+                    {
+                        result =
+                            svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 0), svget4(in1, 0), op),
+                                      elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 1), svget4(in1, 1), op),
+                                      elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 2), svget4(in1, 2), op),
+                                      elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 3), svget4(in1, 3), op));
+                    }
+                    else
+                    {
+                        result =
+                            svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
+                                      elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
+                                      elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
+                                      elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
+                    }
+
+                    store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
+
+                    x += wrapper::svcnt<ScalarType>();
+                    pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -200,41 +199,44 @@
         const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset);
         const auto in2_vscale  = svdup_n(in2->info()->quantization_info().uniform().scale);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            auto       output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-            const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
-
-            int x = window_start_x;
-
-            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
-                const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
+                auto       output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+                const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr());
 
-                const auto result = svcreate4(
-                                        elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
-                                        elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
-                                        elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
-                                        elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
+                int x = window_start_x;
 
-                store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
+                svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+                do
+                {
+                    const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
+                    const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
 
-                x += wrapper::svcnt<ScalarType>();
-                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        input1, input2, output);
+                    const auto result =
+                        svcreate4(elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op),
+                                  elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op),
+                                  elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op),
+                                  elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op));
+
+                    store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale);
+
+                    x += wrapper::svcnt<ScalarType>();
+                    pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            input1, input2, output);
     }
 }
 
 template <typename InputScalarType, typename OutputScalarType = uint8_t>
-void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
+void elementwise_comparison_quantized_op(
+    const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window)
 {
-    static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width");
+    static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType),
+                  "input data type's width should be equal to or greater than output data type's width");
 
     using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type;
     const auto all_true_pg = wrapper::svptrue<InputScalarType>();
@@ -251,7 +253,7 @@
     const auto window_end_x          = static_cast<int>(window.x().end());
     const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x();
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -259,8 +261,10 @@
         const ITensor *broadcast_tensor     = is_broadcast_input_2 ? in2 : in1;
         const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
 
-        const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
-        const auto broadcast_qinfo     = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
+        const auto non_broadcast_qinfo =
+            is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info();
+        const auto broadcast_qinfo =
+            is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info();
 
         const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset);
         const auto non_broadcast_vscale  = svdup_n(non_broadcast_qinfo.uniform().scale);
@@ -272,51 +276,63 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(out, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            auto                  output_ptr              = reinterpret_cast<OutputScalarType *>(output.ptr());
-            const auto            non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
-            const InputScalarType broadcast_value         = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
-            const float           broadcast_value_f       = Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo);
-            const auto            in2                     = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
-
-            int x = window_start_x;
-
-            svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
+                auto       output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+                const auto non_broadcast_input_ptr =
+                    reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr());
+                const InputScalarType broadcast_value =
+                    *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr());
+                const float broadcast_value_f =
+                    Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo);
+                const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f),
+                                           svdup_n(broadcast_value_f), svdup_n(broadcast_value_f));
 
-                svuint8x4_t result{};
+                int x = window_start_x;
 
-                if(!is_broadcast_input_2)
+                svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+                do
                 {
-                    result = svcreate4(
-                                 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 0), svget4(in1, 0), op),
-                                 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 1), svget4(in1, 1), op),
-                                 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 2), svget4(in1, 2), op),
-                                 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 3), svget4(in1, 3), op));
-                }
-                else
-                {
-                    result = svcreate4(
-                                 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), op),
-                                 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), op),
-                                 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), op),
-                                 elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), op));
-                }
+                    const auto in1 =
+                        load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale);
 
-                const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
-                const auto zipped_top    = svzip1(svget4(result, 2), svget4(result, 3));
-                const auto zipped        = svzip1(zipped_bottom, zipped_top);
-                svst1(pg, output_ptr + x, zipped);
+                    svuint8x4_t result{};
 
-                x += wrapper::svcnt<InputScalarType>();
-                pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        broadcast_input, non_broadcast_input, output);
+                    if (!is_broadcast_input_2)
+                    {
+                        result = svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 0),
+                                                                                                    svget4(in1, 0), op),
+                                           elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 1),
+                                                                                                    svget4(in1, 1), op),
+                                           elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 2),
+                                                                                                    svget4(in1, 2), op),
+                                           elementwise_comparison_op<svfloat32_t, OutputVectorType>(
+                                               pg, svget4(in2, 3), svget4(in1, 3), op));
+                    }
+                    else
+                    {
+                        result = svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0),
+                                                                                                    svget4(in2, 0), op),
+                                           elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1),
+                                                                                                    svget4(in2, 1), op),
+                                           elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2),
+                                                                                                    svget4(in2, 2), op),
+                                           elementwise_comparison_op<svfloat32_t, OutputVectorType>(
+                                               pg, svget4(in1, 3), svget4(in2, 3), op));
+                    }
+
+                    const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
+                    const auto zipped_top    = svzip1(svget4(result, 2), svget4(result, 3));
+                    const auto zipped        = svzip1(zipped_bottom, zipped_top);
+                    svst1(pg, output_ptr + x, zipped);
+
+                    x += wrapper::svcnt<InputScalarType>();
+                    pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -334,39 +350,44 @@
         const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset);
         const auto in2_vscale  = svdup_n(in2->info()->quantization_info().uniform().scale);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            auto       output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
-            const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
-
-            int x = window_start_x;
-
-            svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
-            do
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const auto in1    = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
-                const auto in2    = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
-                const auto result = svcreate4(
-                                        elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), op),
-                                        elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), op),
-                                        elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), op),
-                                        elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), op));
+                auto       output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr());
+                const auto input1_ptr = reinterpret_cast<const InputScalarType *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const InputScalarType *>(input2.ptr());
 
-                const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
-                const auto zipped_top    = svzip1(svget4(result, 2), svget4(result, 3));
-                const auto zipped        = svzip1(zipped_bottom, zipped_top);
-                svst1(pg, output_ptr + x, zipped);
+                int x = window_start_x;
 
-                x += wrapper::svcnt<InputScalarType>();
-                pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
-            }
-            while(svptest_any(all_true_pg, pg));
-        },
-        input1, input2, output);
+                svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+                do
+                {
+                    const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale);
+                    const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale);
+                    const auto result =
+                        svcreate4(elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0),
+                                                                                           svget4(in2, 0), op),
+                                  elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1),
+                                                                                           svget4(in2, 1), op),
+                                  elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2),
+                                                                                           svget4(in2, 2), op),
+                                  elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3),
+                                                                                           svget4(in2, 3), op));
+
+                    const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1));
+                    const auto zipped_top    = svzip1(svget4(result, 2), svget4(result, 3));
+                    const auto zipped        = svzip1(zipped_bottom, zipped_top);
+                    svst1(pg, output_ptr + x, zipped);
+
+                    x += wrapper::svcnt<InputScalarType>();
+                    pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x);
+                } while (svptest_any(all_true_pg, pg));
+            },
+            input1, input2, output);
     }
 }
 } // namespace cpu
 } // namespace arm_compute
 
-#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
\ No newline at end of file
+#endif /* SRC_CORE_SVE_KERNELS_ELEMENTWISE_QUANTIZED_LIST_H */
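Note: the quantized SVE comparison kernel above dequantizes each input into four f32
quadrants, compares quadrant-wise, and zips the four 32-bit mask vectors back into a
single 8-bit mask before storing. A minimal scalar model of that logic (illustrative
names, not library API):

    #include <cstddef>
    #include <cstdint>

    struct QuantInfo
    {
        float   scale;
        int32_t offset;
    };

    static inline float dequantize(uint8_t v, const QuantInfo &qi)
    {
        return (static_cast<int32_t>(v) - qi.offset) * qi.scale;
    }

    // Emits 255 for true and 0 for false, one byte per element; the vector
    // kernel produces the same result 16 lanes at a time.
    void compare_greater(const uint8_t *in1, const uint8_t *in2, uint8_t *out,
                         size_t len, const QuantInfo &qi1, const QuantInfo &qi2)
    {
        for (size_t i = 0; i < len; ++i)
        {
            out[i] = (dequantize(in1[i], qi1) > dequantize(in2[i], qi2)) ? 255 : 0;
        }
    }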
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
index 7435bb4..5cc6664 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h"
 namespace arm_compute
 {
@@ -34,27 +35,72 @@
     return elementwise_arithmetic_quantized_op<uint8_t>(in1, in2, out, op, window);
 }
 
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+                                                                        const ITensor *in2,
+                                                                        ITensor       *out,
+                                                                        const Window  &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+                                                                        const ITensor *in2,
+                                                                        ITensor       *out,
+                                                                        const Window  &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+                                                                        const ITensor *in2,
+                                                                        ITensor       *out,
+                                                                        const Window  &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+                                                                        const ITensor *in2,
+                                                                        ITensor       *out,
+                                                                        const Window  &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+                                                                        const ITensor *in2,
+                                                                        ITensor       *out,
+                                                                        const Window  &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+                                                                                 const ITensor *in2,
+                                                                                 ITensor       *out,
+                                                                                 const Window  &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+                                                                          const ITensor *in2,
+                                                                          ITensor       *out,
+                                                                          const Window  &window);
+template void sve2_qasymm8_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+                                                                          const ITensor *in2,
+                                                                          ITensor       *out,
+                                                                          const Window  &window);
 
 template <ComparisonOperation op>
-void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void sve2_qasymm8_comparison_elementwise_binary(const ITensor *in1,
+                                                const ITensor *in2,
+                                                ITensor       *out,
+                                                const Window  &window)
 {
     return elementwise_comparison_quantized_op<uint8_t>(in1, in2, out, op, window);
 }
 
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                                     const ITensor *in2,
+                                                                                     ITensor       *out,
+                                                                                     const Window  &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                        const ITensor *in2,
+                                                                                        ITensor       *out,
+                                                                                        const Window  &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                       const ITensor *in2,
+                                                                                       ITensor       *out,
+                                                                                       const Window  &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1,
+                                                                                            const ITensor *in2,
+                                                                                            ITensor       *out,
+                                                                                            const Window  &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                                    const ITensor *in2,
+                                                                                    ITensor       *out,
+                                                                                    const Window  &window);
+template void sve2_qasymm8_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                         const ITensor *in2,
+                                                                                         ITensor       *out,
+                                                                                         const Window  &window);
 
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
index 1027a1e..165e0c0 100644
--- a/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/elementwise_binary/generic/sve2/qasymm8_signed.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_binary/generic/sve2/impl.h"
 namespace arm_compute
 {
@@ -34,27 +35,70 @@
     return elementwise_arithmetic_quantized_op<int8_t>(in1, in2, out, op, window);
 }
 
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::ADD>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SUB>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::DIV>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MIN>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::MAX>(const ITensor *in1,
+                                                                               const ITensor *in2,
+                                                                               ITensor       *out,
+                                                                               const Window  &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::SQUARED_DIFF>(const ITensor *in1,
+                                                                                        const ITensor *in2,
+                                                                                        ITensor       *out,
+                                                                                        const Window  &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::POWER>(const ITensor *in1,
+                                                                                 const ITensor *in2,
+                                                                                 ITensor       *out,
+                                                                                 const Window  &window);
+template void sve2_qasymm8_signed_elementwise_binary<ArithmeticOperation::PRELU>(const ITensor *in1,
+                                                                                 const ITensor *in2,
+                                                                                 ITensor       *out,
+                                                                                 const Window  &window);
 
 template <ComparisonOperation op>
-void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void sve2_qasymm8_signed_comparison_elementwise_binary(const ITensor *in1,
+                                                       const ITensor *in2,
+                                                       ITensor       *out,
+                                                       const Window  &window)
 {
     return elementwise_comparison_quantized_op<int8_t>(in1, in2, out, op, window);
 }
 
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
-template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Equal>(const ITensor *in1,
+                                                                                            const ITensor *in2,
+                                                                                            ITensor       *out,
+                                                                                            const Window  &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::NotEqual>(const ITensor *in1,
+                                                                                               const ITensor *in2,
+                                                                                               ITensor       *out,
+                                                                                               const Window  &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Greater>(const ITensor *in1,
+                                                                                              const ITensor *in2,
+                                                                                              ITensor       *out,
+                                                                                              const Window  &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::GreaterEqual>(
+    const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::Less>(const ITensor *in1,
+                                                                                           const ITensor *in2,
+                                                                                           ITensor       *out,
+                                                                                           const Window  &window);
+template void sve2_qasymm8_signed_comparison_elementwise_binary<ComparisonOperation::LessEqual>(const ITensor *in1,
+                                                                                                const ITensor *in2,
+                                                                                                ITensor       *out,
+                                                                                                const Window  &window);
 
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
index b2833c2..2588db0 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp16.cpp
@@ -23,17 +23,19 @@
  */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void neon_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_fp16_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(lut);
     return elementwise_op<__fp16>(in, out, window, op);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
 #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
index 6566821..936a2e5 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/fp32.cpp
@@ -22,16 +22,18 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void neon_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_fp32_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(lut);
     return elementwise_op<float>(in, out, window, op);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
index dbc1dde..d54d398 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/impl.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 
@@ -36,7 +37,7 @@
 template <typename ScalarType>
 inline ScalarType elementwise_op_scalar_imp(ElementWiseUnary op, const ScalarType &a)
 {
-    switch(op)
+    switch (op)
     {
         case ElementWiseUnary::RSQRT:
             return 1 / sqrt(a);
@@ -60,7 +61,7 @@
 template <typename ScalarType, typename VectorType>
 inline VectorType elementwise_op_imp(ElementWiseUnary op, const VectorType &a)
 {
-    switch(op)
+    switch (op)
     {
         case ElementWiseUnary::RSQRT:
             return wrapper::vinvsqrt(a);
@@ -94,22 +95,24 @@
     Iterator input(in, win);
     Iterator output(out, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        auto       output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-        const auto input_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            auto       output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+            const auto input_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
 
-        int x = window_start_x;
-        for(; x <= window_end_x - window_step_x; x += window_step_x)
-        {
-            wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
-        }
-        for(; x < window_end_x; ++x)
-        {
-            *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
-        }
-    },
-    input, output);
+            int x = window_start_x;
+            for (; x <= window_end_x - window_step_x; x += window_step_x)
+            {
+                wrapper::vstore(output_ptr + x, elementwise_op_imp<ScalarType>(op, wrapper::vloadq(input_ptr + x)));
+            }
+            for (; x < window_end_x; ++x)
+            {
+                *(output_ptr + x) = elementwise_op_scalar_imp(op, *(input_ptr + x));
+            }
+        },
+        input, output);
 }
 
 template <>
@@ -128,75 +131,81 @@
     Iterator input(in, win);
     Iterator output(out, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int8x16_t  vout;
-        auto       output_ptr    = reinterpret_cast<int8_t *>(output.ptr());
-        const auto input_ptr     = reinterpret_cast<const int8_t *>(input.ptr());
-        const auto vconst_0_f32  = vdupq_n_f32(0);
-        auto       clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
-
-        int x = window_start_x;
-        for(; x <= window_end_x - window_step_x; x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            const auto vin = wrapper::vloadq(input_ptr + x);
+            int8x16_t  vout;
+            auto       output_ptr    = reinterpret_cast<int8_t *>(output.ptr());
+            const auto input_ptr     = reinterpret_cast<const int8_t *>(input.ptr());
+            const auto vconst_0_f32  = vdupq_n_f32(0);
+            auto       clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
 
-            // De-quantize
-            const auto vin_deq = vdequantize(vin, qi_in);
-
-            // Perform activation
-            float32x4x4_t vtmp_deq =
+            int x = window_start_x;
+            for (; x <= window_end_x - window_step_x; x += window_step_x)
             {
-                {
+                const auto vin = wrapper::vloadq(input_ptr + x);
+
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+
+                // Perform activation
+                float32x4x4_t vtmp_deq = {{
                     elementwise_op_imp<float>(op, vin_deq.val[0]),
                     elementwise_op_imp<float>(op, vin_deq.val[1]),
                     elementwise_op_imp<float>(op, vin_deq.val[2]),
                     elementwise_op_imp<float>(op, vin_deq.val[3]),
-                }
-            };
+                }};
 
-            if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
-            {
-                vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
-                vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
-                vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
-                vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
+                if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+                {
+                    vtmp_deq.val[0] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+                    vtmp_deq.val[1] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+                    vtmp_deq.val[2] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+                    vtmp_deq.val[3] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
+                }
+
+                // Re-quantize to new output space
+                vout = vquantize_signed(vtmp_deq, qi_out);
+                wrapper::vstore(output_ptr + x, vout);
             }
-
-            // Re-quantize to new output space
-            vout = vquantize_signed(vtmp_deq, qi_out);
-            wrapper::vstore(output_ptr + x, vout);
-        }
-        for(; x < window_end_x; ++x)
-        {
-            qasymm8_signed_t in    = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
-            qasymm8_signed_t tmp   = 0;
-            float            tmp_f = dequantize_qasymm8_signed(in, qi_in);
-            if(tmp_f <= 0.0)
+            for (; x < window_end_x; ++x)
             {
-                if(op == ElementWiseUnary::LOG)
+                qasymm8_signed_t in    = *(reinterpret_cast<const qasymm8_signed_t *>(input_ptr + x));
+                qasymm8_signed_t tmp   = 0;
+                float            tmp_f = dequantize_qasymm8_signed(in, qi_in);
+                if (tmp_f <= 0.0)
                 {
-                    tmp_f = (-128 - qi_out.offset) * qi_out.scale;
-                }
-                else if(op == ElementWiseUnary::RSQRT)
-                {
-                    tmp_f = (127 - qi_out.offset) * qi_out.scale;
+                    if (op == ElementWiseUnary::LOG)
+                    {
+                        tmp_f = (-128 - qi_out.offset) * qi_out.scale;
+                    }
+                    else if (op == ElementWiseUnary::RSQRT)
+                    {
+                        tmp_f = (127 - qi_out.offset) * qi_out.scale;
+                    }
+                    else
+                    {
+                        tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+                    }
                 }
                 else
                 {
                     tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
                 }
+                tmp = quantize_qasymm8_signed(
+                    tmp_f, qi_out,
+                    RoundingPolicy::
+                        TO_ZERO); // Set rounding policy to TO_ZERO to be compatible with vquantize_signed() used above, which follows the same policy for armv7a.
+                // For aarch64, a LUT is used with rounding to nearest.
+                *(output_ptr + x) = tmp;
             }
-            else
-            {
-                tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
-            }
-            tmp = quantize_qasymm8_signed(tmp_f, qi_out, RoundingPolicy::TO_ZERO); // Set rounding policy TO_ZERO to be compatible with vquantize_signed() used above that follow same policy for armv7a.
-            // For aarch64 LUT is used and rounding to nearest is used
-            *(output_ptr + x) = tmp;
-        }
-    },
-    input, output);
+        },
+        input, output);
 }
 template <>
 inline void elementwise_op<uint8_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op)
@@ -215,71 +224,74 @@
     Iterator input(in, win);
     Iterator output(out, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        uint8x16_t vout;
-        auto       clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
-        auto       output_ptr    = reinterpret_cast<uint8_t *>(output.ptr());
-        const auto input_ptr     = reinterpret_cast<const uint8_t *>(input.ptr());
-        int        x             = window_start_x;
-        for(; x <= window_end_x - window_step_x; x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            const auto vin = wrapper::vloadq(input_ptr + x);
-
-            // De-quantize
-            const auto vin_deq = vdequantize(vin, qi_in);
-
-            // Perform activation
-            float32x4x4_t vtmp_deq =
+            uint8x16_t vout;
+            auto       clamped_value = (op == ElementWiseUnary::LOG) ? min_clamped_value : max_clamped_value;
+            auto       output_ptr    = reinterpret_cast<uint8_t *>(output.ptr());
+            const auto input_ptr     = reinterpret_cast<const uint8_t *>(input.ptr());
+            int        x             = window_start_x;
+            for (; x <= window_end_x - window_step_x; x += window_step_x)
             {
-                {
+                const auto vin = wrapper::vloadq(input_ptr + x);
+
+                // De-quantize
+                const auto vin_deq = vdequantize(vin, qi_in);
+
+                // Perform activation
+                float32x4x4_t vtmp_deq = {{
                     elementwise_op_imp<float>(op, vin_deq.val[0]),
                     elementwise_op_imp<float>(op, vin_deq.val[1]),
                     elementwise_op_imp<float>(op, vin_deq.val[2]),
                     elementwise_op_imp<float>(op, vin_deq.val[3]),
+                }};
+                if ((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
+                {
+                    vtmp_deq.val[0] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
+                    vtmp_deq.val[1] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
+                    vtmp_deq.val[2] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
+                    vtmp_deq.val[3] =
+                        vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
                 }
-            };
-            if((op == ElementWiseUnary::LOG) || (op == ElementWiseUnary::RSQRT))
-            {
-                vtmp_deq.val[0] = vbslq_f32(vcleq_f32(vin_deq.val[0], vconst_0_f32), clamped_value, vtmp_deq.val[0]);
-                vtmp_deq.val[1] = vbslq_f32(vcleq_f32(vin_deq.val[1], vconst_0_f32), clamped_value, vtmp_deq.val[1]);
-                vtmp_deq.val[2] = vbslq_f32(vcleq_f32(vin_deq.val[2], vconst_0_f32), clamped_value, vtmp_deq.val[2]);
-                vtmp_deq.val[3] = vbslq_f32(vcleq_f32(vin_deq.val[3], vconst_0_f32), clamped_value, vtmp_deq.val[3]);
-            }
 
-            // Re-quantize to new output space
-            vout = vquantize(vtmp_deq, qi_out);
-            wrapper::vstore(output_ptr + x, vout);
-        }
-        for(; x < window_end_x; ++x)
-        {
-            qasymm8_t in    = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
-            qasymm8_t tmp   = 0;
-            float     tmp_f = dequantize_qasymm8(in, qi_in);
-            if(tmp_f <= 0.0)
+                // Re-quantize to new output space
+                vout = vquantize(vtmp_deq, qi_out);
+                wrapper::vstore(output_ptr + x, vout);
+            }
+            for (; x < window_end_x; ++x)
             {
-                if(op == ElementWiseUnary::LOG)
+                qasymm8_t in    = *(reinterpret_cast<const qasymm8_t *>(input_ptr + x));
+                qasymm8_t tmp   = 0;
+                float     tmp_f = dequantize_qasymm8(in, qi_in);
+                if (tmp_f <= 0.0)
                 {
-                    tmp_f = (0 - qi_out.offset) * qi_out.scale;
-                }
-                else if(op == ElementWiseUnary::RSQRT)
-                {
-                    tmp_f = (255 - qi_out.offset) * qi_out.scale;
+                    if (op == ElementWiseUnary::LOG)
+                    {
+                        tmp_f = (0 - qi_out.offset) * qi_out.scale;
+                    }
+                    else if (op == ElementWiseUnary::RSQRT)
+                    {
+                        tmp_f = (255 - qi_out.offset) * qi_out.scale;
+                    }
+                    else
+                    {
+                        tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
+                    }
                 }
                 else
                 {
                     tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
                 }
+                tmp               = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO);
+                *(output_ptr + x) = tmp;
             }
-            else
-            {
-                tmp_f = elementwise_op_scalar_imp<float>(op, tmp_f);
-            }
-            tmp               = quantize_qasymm8(tmp_f, qi_out, RoundingPolicy::TO_ZERO);
-            *(output_ptr + x) = tmp;
-        }
-    },
-    input, output);
+        },
+        input, output);
 }
 
 } // namespace cpu
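Note: in the quantized unary paths above, LOG and RSQRT are undefined for non-positive
dequantized inputs, so the kernel substitutes a quantization extreme before
re-quantizing with truncation (TO_ZERO). A scalar sketch of the uint8 RSQRT case
(illustrative names; the real kernel also handles LOG and the generic fallback):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    struct QInfo
    {
        float   scale;
        int32_t offset;
    };

    uint8_t rsqrt_qasymm8(uint8_t in, const QInfo &qi_in, const QInfo &qi_out)
    {
        const float x = (static_cast<int32_t>(in) - qi_in.offset) * qi_in.scale;
        // x <= 0 is clamped to the largest representable output, as above.
        const float y = (x <= 0.0f) ? (255 - qi_out.offset) * qi_out.scale
                                    : 1.0f / std::sqrt(x);
        // Truncation approximates RoundingPolicy::TO_ZERO.
        const int32_t q = static_cast<int32_t>(y / qi_out.scale) + qi_out.offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }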
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
index dfe5e30..d4daad4 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/integer.cpp
@@ -22,16 +22,18 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void neon_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_s32_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(lut);
     return elementwise_op<int32_t>(in, out, window, op);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
index 08bb7f2..38cb61d 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/q8.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/lut/list.h"
 
 namespace arm_compute
@@ -32,24 +33,28 @@
 
 #ifdef __aarch64__
 
-void neon_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_q8_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(op);
 
-    auto win = window;
+    auto       win          = window;
     const auto window_end_x = window.x().end();
     win.set(0, Window::Dimension(0, 1, 1));
 
     Iterator src_it(in, win);
     Iterator dst_it(out, win);
 
-    execute_window_loop(win, [&](const Coordinates &) {
-        const auto src_ptr = src_it.ptr();
-        auto dst_ptr = dst_it.ptr();
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto src_ptr = src_it.ptr();
+            auto       dst_ptr = dst_it.ptr();
 
-        lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr);
-    },
-    src_it, dst_it);
+            lut_u8_neon(lut, 1, window_end_x, &src_ptr, &dst_ptr);
+        },
+        src_it, dst_it);
 }
 
 #endif // __aarch64__
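Note: on aarch64 the 8-bit unary ops above reduce to a 256-entry table lookup;
lut_u8_neon (and lut_u8_sve2 below) only stream bytes through the table. A sketch of
how such a table could be built and applied (illustrative names, assuming
round-to-nearest as the aarch64 comment above states):

    #include <array>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>

    std::array<uint8_t, 256> build_rsqrt_lut(float in_scale, int in_offset,
                                             float out_scale, int out_offset)
    {
        std::array<uint8_t, 256> lut{};
        for (int v = 0; v < 256; ++v)
        {
            const float x = (v - in_offset) * in_scale;
            const float y = (x > 0.0f) ? 1.0f / std::sqrt(x)
                                       : (255 - out_offset) * out_scale;
            const long q = std::lround(y / out_scale) + out_offset;
            lut[static_cast<size_t>(v)] =
                static_cast<uint8_t>(q < 0 ? 0 : (q > 255 ? 255 : q));
        }
        return lut;
    }

    void apply_lut(const uint8_t *src, uint8_t *dst, size_t len,
                   const std::array<uint8_t, 256> &lut)
    {
        for (size_t i = 0; i < len; ++i)
            dst[i] = lut[src[i]];
    }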
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
index d987f77..3e4b88e 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/core/Window.h"
+
 #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
 
 namespace arm_compute
@@ -31,7 +32,8 @@
 {
 #ifndef __aarch64__
 // Fallback function used for armv7a; for aarch64, a LUT is used instead
-void neon_qasymm8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_qasymm8_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(lut);
     return elementwise_op<uint8_t>(in, out, window, op);
diff --git a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
index e00970a..a5f4b05 100644
--- a/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/neon/qasymm8_signed.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/core/Window.h"
+
 #include "src/cpu/kernels/elementwise_unary/generic/neon/impl.h"
 
 namespace arm_compute
@@ -31,7 +32,8 @@
 {
 #ifndef __aarch64__
 // Fallback function used for armv7a; for aarch64, a LUT is used instead
-void neon_qasymm8_signed_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void neon_qasymm8_signed_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(lut);
     return elementwise_op<int8_t>(in, out, window, op);
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
index a883309..22ff43c 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp16.cpp
@@ -23,6 +23,7 @@
  */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/CpuTypes.h"
 #include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
 
@@ -30,11 +31,12 @@
 {
 namespace cpu
 {
-void sve_fp16_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve_fp16_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(lut);
     return elementwise_sve_op<float16_t>(in, out, window, op);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
index b21ed8d..394bd47 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/fp32.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/CpuTypes.h"
 #include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
 
@@ -30,10 +31,11 @@
 {
 namespace cpu
 {
-void sve_fp32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve_fp32_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(lut);
     return elementwise_sve_op<float32_t>(in, out, window, op);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
index a948862..5af534d 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/impl.cpp
@@ -24,6 +24,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 
 namespace arm_compute
@@ -31,9 +32,10 @@
 namespace cpu
 {
 template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
+inline typename std::enable_if<utils::traits::is_floating_point<ScalarType>::value, VectorType>::type
+elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
 {
-    switch(op)
+    switch (op)
     {
         case ElementWiseUnary::RSQRT:
             return svinvsqrt(pg, a);
@@ -55,9 +57,10 @@
 }
 
 template <typename ScalarType, typename VectorType>
-inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
+inline typename std::enable_if<std::is_integral<ScalarType>::value, VectorType>::type
+elementwise_op_sve_imp(svbool_t pg, ElementWiseUnary op, const VectorType &a)
 {
-    switch(op)
+    switch (op)
     {
         case ElementWiseUnary::NEG:
             return svneg_z(pg, a);
@@ -81,23 +84,24 @@
     Iterator input(in, win);
     Iterator output(out, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        auto       output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-        const auto input_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
-        int        x          = window_start_x;
-
-        svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-        do
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            const auto vin = svld1(pg, input_ptr + x);
-            svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin));
-            x += wrapper::svcnt<ScalarType>();
-            pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-        }
-        while(svptest_any(all_true_pg, pg));
-    },
-    input, output);
+            auto       output_ptr = reinterpret_cast<ScalarType *>(output.ptr());
+            const auto input_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
+            int        x          = window_start_x;
+
+            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+            do
+            {
+                const auto vin = svld1(pg, input_ptr + x);
+                svst1(pg, output_ptr + x, elementwise_op_sve_imp<ScalarType, decltype(vin)>(pg, op, vin));
+                x += wrapper::svcnt<ScalarType>();
+                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+            } while (svptest_any(all_true_pg, pg));
+        },
+        input, output);
 }
 
 template void elementwise_sve_op<float16_t>(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op);
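Note: elementwise_sve_op above uses the SVE predicated-loop idiom: whilelt builds a
lane mask covering only the elements still in range, so the final partial vector is
handled by the predicate rather than a scalar tail loop. A scalar model of the control
flow (vl stands in for wrapper::svcnt<T>(), the vector length in elements):

    template <typename T, typename Op>
    void predicated_loop(const T *in, T *out, int len, Op op, int vl)
    {
        int x = 0;
        // pg = whilelt(x, len); loop while svptest_any(all_true, pg)
        while (x < len)
        {
            for (int lane = 0; lane < vl; ++lane)
            {
                if (x + lane < len) // lane is active in the predicate
                    out[x + lane] = op(in[x + lane]);
            }
            x += vl; // x += svcnt<T>()
        }
    }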
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
index 068c3f7..e27fe5a 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve/integer.cpp
@@ -23,16 +23,18 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/elementwise_unary/generic/sve/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void sve_s32_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve_s32_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(lut);
     return elementwise_sve_op<int32_t>(in, out, window, op);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
index 7e32f50..4e4582d 100644
--- a/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
+++ b/src/cpu/kernels/elementwise_unary/generic/sve2/q8.cpp
@@ -23,13 +23,15 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/lut/list.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void sve2_q8_elementwise_unary(const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
+void sve2_q8_elementwise_unary(
+    const ITensor *in, ITensor *out, const Window &window, ElementWiseUnary op, const uint8_t *lut)
 {
     ARM_COMPUTE_UNUSED(op);
 
@@ -40,14 +42,16 @@
     Iterator src_it(in, win);
     Iterator dst_it(out, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto src_ptr = src_it.ptr();
-        auto       dst_ptr = dst_it.ptr();
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            const auto src_ptr = src_it.ptr();
+            auto       dst_ptr = dst_it.ptr();
 
-        lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr);
-    },
-    src_it, dst_it);
+            lut_u8_sve2(lut, 1, window_end_x, &src_ptr, &dst_ptr);
+        },
+        src_it, dst_it);
 }
 
 } // namespace cpu
diff --git a/src/cpu/kernels/floor/list.h b/src/cpu/kernels/floor/list.h
index 4367e0f..5ac78df 100644
--- a/src/cpu/kernels/floor/list.h
+++ b/src/cpu/kernels/floor/list.h
@@ -28,8 +28,7 @@
 {
 namespace cpu
 {
-#define DECLARE_FLOOR_KERNEL(func_name) \
-    void func_name(const void *src, void *dst, int len)
+#define DECLARE_FLOOR_KERNEL(func_name) void func_name(const void *src, void *dst, int len)
 
 DECLARE_FLOOR_KERNEL(fp16_neon_floor);
 DECLARE_FLOOR_KERNEL(fp32_neon_floor);
diff --git a/src/cpu/kernels/floor/neon/fp16.cpp b/src/cpu/kernels/floor/neon/fp16.cpp
index f362676..f476902 100644
--- a/src/cpu/kernels/floor/neon/fp16.cpp
+++ b/src/cpu/kernels/floor/neon/fp16.cpp
@@ -45,14 +45,14 @@
     auto psrc = static_cast<const __fp16 *>(src);
     auto pdst = static_cast<__fp16 *>(dst);
 
-    for(; len >= step; len -= step)
+    for (; len >= step; len -= step)
     {
         vst1q_f16(pdst, vfloorq_f16(vld1q_f16(psrc)));
         psrc += step;
         pdst += step;
     }
 
-    for(; len > 0; --len)
+    for (; len > 0; --len)
     {
         *pdst = std::floor(*psrc);
         ++psrc;
diff --git a/src/cpu/kernels/floor/neon/fp32.cpp b/src/cpu/kernels/floor/neon/fp32.cpp
index f5efb2e..a86e24d 100644
--- a/src/cpu/kernels/floor/neon/fp32.cpp
+++ b/src/cpu/kernels/floor/neon/fp32.cpp
@@ -43,14 +43,14 @@
     auto psrc = static_cast<const float *>(src);
     auto pdst = static_cast<float *>(dst);
 
-    for(; len >= step; len -= step)
+    for (; len >= step; len -= step)
     {
         vst1q_f32(pdst, vfloorq_f32(vld1q_f32(psrc)));
         psrc += step;
         pdst += step;
     }
 
-    for(; len > 0; --len)
+    for (; len > 0; --len)
     {
         *pdst = std::floor(*psrc);
         ++pdst;
diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp
index a29ee76..2821af3 100644
--- a/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp16.cpp
@@ -29,11 +29,19 @@
 {
 namespace cpu
 {
-void fused_batch_normalization_conv_f16(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias,
-                                        const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_conv_f16(const ITensor *conv_weights,
+                                        const ITensor *conv_bias,
+                                        ITensor       *fused_weights,
+                                        ITensor       *fused_bias,
+                                        const ITensor *bn_mean,
+                                        const ITensor *bn_var,
+                                        const ITensor *bn_beta,
+                                        const ITensor *bn_gamma,
+                                        float          epsilon,
+                                        const Window  &window)
 {
-    return fused_batch_normalization_conv<float16_t>(conv_weights, conv_bias, fused_weights, fused_bias,
-                                                     bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+    return fused_batch_normalization_conv<float16_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean,
+                                                     bn_var, bn_beta, bn_gamma, epsilon, window);
 }
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp
index 076e976..3ca5b69 100644
--- a/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/generic/fp32.cpp
@@ -28,11 +28,19 @@
 {
 namespace cpu
 {
-void fused_batch_normalization_conv_f32(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias,
-                                        const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_conv_f32(const ITensor *conv_weights,
+                                        const ITensor *conv_bias,
+                                        ITensor       *fused_weights,
+                                        ITensor       *fused_bias,
+                                        const ITensor *bn_mean,
+                                        const ITensor *bn_var,
+                                        const ITensor *bn_beta,
+                                        const ITensor *bn_gamma,
+                                        float          epsilon,
+                                        const Window  &window)
 {
-    return fused_batch_normalization_conv<float32_t>(conv_weights, conv_bias, fused_weights, fused_bias,
-                                                     bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+    return fused_batch_normalization_conv<float32_t>(conv_weights, conv_bias, fused_weights, fused_bias, bn_mean,
+                                                     bn_var, bn_beta, bn_gamma, epsilon, window);
 }
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h
index b901760..6fa8432 100644
--- a/src/cpu/kernels/fuse_batch_normalization/generic/impl.h
+++ b/src/cpu/kernels/fuse_batch_normalization/generic/impl.h
@@ -25,6 +25,7 @@
 #define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
@@ -32,8 +33,16 @@
 namespace cpu
 {
 template <typename T>
-void fused_batch_normalization_conv(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias,
-                                    const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_conv(const ITensor *conv_weights,
+                                    const ITensor *conv_bias,
+                                    ITensor       *fused_weights,
+                                    ITensor       *fused_bias,
+                                    const ITensor *bn_mean,
+                                    const ITensor *bn_var,
+                                    const ITensor *bn_beta,
+                                    const ITensor *bn_gamma,
+                                    float          epsilon,
+                                    const Window  &window)
 {
     using ScalarType   = T;
     const int size     = 16 / conv_weights->info()->element_size();
@@ -53,13 +62,20 @@
     Iterator conv_w_in(conv_weights, win);
     Iterator conv_w_out(run_in_place_weights ? conv_weights : fused_weights, win);
 
-    const auto conv_bias_in  = (conv_bias != nullptr ? reinterpret_cast<ScalarType *>(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
-    auto       conv_bias_out = (run_in_place_bias ? conv_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+    const auto conv_bias_in =
+        (conv_bias != nullptr ? reinterpret_cast<ScalarType *>(conv_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+    auto conv_bias_out =
+        (run_in_place_bias ? conv_bias_in
+                           : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
 
     const auto input_mean  = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
     const auto input_var   = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_gamma = (bn_gamma != nullptr)
+                                 ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0)))
+                                 : nullptr;
+    const auto input_beta  = (bn_beta != nullptr)
+                                 ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0)))
+                                 : nullptr;
 
     auto       mean_vec    = wrapper::vdup_n(ScalarType(0), ExactTagType{});
     auto       var_vec     = wrapper::vdup_n(ScalarType(0), ExactTagType{});
@@ -73,59 +89,61 @@
     auto gamma               = ScalarType(1.0);
     auto beta                = ScalarType(0.0);
     auto conv_bias_in_scalar = ScalarType(0.0);
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        var = input_var[id[3]];
-        if(input_gamma != nullptr)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            gamma = input_gamma[id[3]];
-        }
-
-        if((id[0] == 0) && (id[1] == 0) && (id[2] == 0))
-        {
-            if(input_beta != nullptr)
+            var = input_var[id[3]];
+            if (input_gamma != nullptr)
             {
-                beta     = input_beta[id[3]];
-                beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+                gamma = input_gamma[id[3]];
             }
 
-            // Construct vectors
-            mean     = input_mean[id[3]];
-            mean_vec = wrapper::vdup_n(mean, ExactTagType{});
-
-            if(conv_bias_in != nullptr)
+            if ((id[0] == 0) && (id[1] == 0) && (id[2] == 0))
             {
-                conv_bias_in_scalar = conv_bias_in[id[3]];
+                if (input_beta != nullptr)
+                {
+                    beta     = input_beta[id[3]];
+                    beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+                }
+
+                // Construct vectors
+                mean     = input_mean[id[3]];
+                mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+
+                if (conv_bias_in != nullptr)
+                {
+                    conv_bias_in_scalar = conv_bias_in[id[3]];
+                }
+                auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
+                conv_bias_out[id[3]]      = (conv_bias_tmp_scalar * gamma) + beta;
             }
-            auto conv_bias_tmp_scalar = (conv_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
-            conv_bias_out[id[3]]      = (conv_bias_tmp_scalar * gamma) + beta;
-        }
 
-        int  x              = window_start_x;
-        auto conv_w_in_ptr  = reinterpret_cast<const ScalarType *>(conv_w_in.ptr());
-        auto conv_w_out_ptr = reinterpret_cast<ScalarType *>(conv_w_out.ptr());
-        var_vec             = wrapper::vdup_n(var, ExactTagType{});
-        gamma_vec           = wrapper::vdup_n(gamma, ExactTagType{});
-        rvar_vec            = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+            int  x              = window_start_x;
+            auto conv_w_in_ptr  = reinterpret_cast<const ScalarType *>(conv_w_in.ptr());
+            auto conv_w_out_ptr = reinterpret_cast<ScalarType *>(conv_w_out.ptr());
+            var_vec             = wrapper::vdup_n(var, ExactTagType{});
+            gamma_vec           = wrapper::vdup_n(gamma, ExactTagType{});
+            rvar_vec            = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
 
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            auto wn = wrapper::vloadq(conv_w_in_ptr + x);
-            wn      = wrapper::vmul(wn, rvar_vec);
-            wn      = wrapper::vmul(wn, gamma_vec);
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                auto wn = wrapper::vloadq(conv_w_in_ptr + x);
+                wn      = wrapper::vmul(wn, rvar_vec);
+                wn      = wrapper::vmul(wn, gamma_vec);
 
-            // Store results
-            wrapper::vstore(conv_w_out_ptr + x, wn);
-        }
+                // Store results
+                wrapper::vstore(conv_w_out_ptr + x, wn);
+            }
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
-        }
-    },
-    conv_w_in, conv_w_out);
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                *(conv_w_out_ptr + x) = *(conv_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
+            }
+        },
+        conv_w_in, conv_w_out);
 }
-}
-}
+} // namespace cpu
+} // namespace arm_compute
 #endif //SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_GENERIC_IMPL_H
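
Aside: all of the fuse-batch-normalization kernels touched above compute the same folding of a batch-norm layer into convolution weights and bias. A minimal scalar sketch of that math (illustrative only — plain arrays and hypothetical names; the kernels' null-pointer defaults of gamma = 1, beta = 0, bias = 0 are omitted):

#include <cmath>
#include <cstddef>

// Scalar reference of the fusion the NEON kernels above vectorize:
//   w_fused[c][k] = w[c][k] * gamma[c] / sqrt(var[c] + epsilon)
//   b_fused[c]    = ((b[c] - mean[c]) / sqrt(var[c] + epsilon)) * gamma[c] + beta[c]
void fuse_bn_reference(const float *w, const float *b, float *w_fused, float *b_fused,
                       const float *mean, const float *var, const float *beta, const float *gamma,
                       float epsilon, std::size_t channels, std::size_t weights_per_channel)
{
    for (std::size_t c = 0; c < channels; ++c)
    {
        // Reciprocal standard deviation; wrapper::vinvsqrt(var + eps) plays this role in the kernels
        const float rstd = 1.0f / std::sqrt(var[c] + epsilon);
        for (std::size_t k = 0; k < weights_per_channel; ++k)
        {
            w_fused[c * weights_per_channel + k] = w[c * weights_per_channel + k] * rstd * gamma[c];
        }
        b_fused[c] = (b[c] - mean[c]) * rstd * gamma[c] + beta[c];
    }
}
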
diff --git a/src/cpu/kernels/fuse_batch_normalization/list.h b/src/cpu/kernels/fuse_batch_normalization/list.h
index e25b1e5..a03dd74 100644
--- a/src/cpu/kernels/fuse_batch_normalization/list.h
+++ b/src/cpu/kernels/fuse_batch_normalization/list.h
@@ -30,15 +30,18 @@
 {
 #define DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(func_name)                                                            \
     void func_name(const ITensor *conv_weights, const ITensor *conv_bias, ITensor *fused_weights, ITensor *fused_bias, \
-                   const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+                   const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma,     \
+                   float epsilon, const Window &window)
 
 #define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL(func_name)                                                 \
     void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \
-                   const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+                   const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma,   \
+                   float epsilon, const Window &window)
 
 #define DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL(func_name)                                                 \
     void func_name(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias, \
-                   const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+                   const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma,   \
+                   float epsilon, const Window &window)
 
 DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f16);
 DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f32);
@@ -50,7 +53,7 @@
 #undef DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL
 #undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NCHW_CONV_KERNEL
 #undef DECLARE_FUSE_BATCH_NORMALIZE_DWC_NHWC_CONV_KERNEL
-}
-}
+} // namespace cpu
+} // namespace arm_compute
 
-#endif //
\ No newline at end of file
+#endif //
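
Aside: each DECLARE_* macro above is purely a declaration helper; for instance, DECLARE_FUSE_BATCH_NORMALIZE_CONV_KERNEL(fused_batch_normalization_conv_f32) expands to:

void fused_batch_normalization_conv_f32(const ITensor *conv_weights, const ITensor *conv_bias,
                                        ITensor *fused_weights, ITensor *fused_bias, const ITensor *bn_mean,
                                        const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma,
                                        float epsilon, const Window &window);
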
diff --git a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp
index 1e3be87..c0b0dfd 100644
--- a/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/nchw/all.cpp
@@ -29,8 +29,16 @@
 namespace cpu
 {
 template <typename T>
-void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
-                                        const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nchw(const ITensor *dwc_weights,
+                                        const ITensor *dwc_bias,
+                                        ITensor       *fused_weights,
+                                        ITensor       *fused_bias,
+                                        const ITensor *bn_mean,
+                                        const ITensor *bn_var,
+                                        const ITensor *bn_beta,
+                                        const ITensor *bn_gamma,
+                                        float          epsilon,
+                                        const Window  &window)
 {
     using ScalarType   = T;
     const int size     = 16 / dwc_weights->info()->element_size();
@@ -50,13 +58,20 @@
     Iterator dwc_w_in(dwc_weights, win);
     Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win);
 
-    const auto dwc_bias_in  = (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
-    auto       dwc_bias_out = (run_in_place_bias ? dwc_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+    const auto dwc_bias_in =
+        (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+    auto dwc_bias_out =
+        (run_in_place_bias ? dwc_bias_in
+                           : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
 
     const auto input_mean  = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
     const auto input_var   = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_gamma = (bn_gamma != nullptr)
+                                 ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0)))
+                                 : nullptr;
+    const auto input_beta  = (bn_beta != nullptr)
+                                 ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0)))
+                                 : nullptr;
 
     auto       mean_vec    = wrapper::vdup_n(ScalarType(0), ExactTagType{});
     auto       var_vec     = wrapper::vdup_n(ScalarType(0), ExactTagType{});
@@ -70,74 +85,92 @@
     auto gamma              = ScalarType(1.0);
     auto beta               = ScalarType(0.0);
     auto dwc_bias_in_scalar = ScalarType(0.0);
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        var = input_var[id[2]];
-        if(input_gamma != nullptr)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            gamma = input_gamma[id[2]];
-        }
-
-        if(id[1] == 0)
-        {
-            mean = input_mean[id[2]];
-
-            // Construct vectors
-            mean_vec = wrapper::vdup_n(mean, ExactTagType{});
-            if(input_beta != nullptr)
+            var = input_var[id[2]];
+            if (input_gamma != nullptr)
             {
-                beta     = input_beta[id[2]];
-                beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+                gamma = input_gamma[id[2]];
             }
 
-            if(dwc_bias_in != nullptr)
+            if (id[1] == 0)
             {
-                dwc_bias_in_scalar = dwc_bias_in[id[2]];
+                mean = input_mean[id[2]];
+
+                // Construct vectors
+                mean_vec = wrapper::vdup_n(mean, ExactTagType{});
+                if (input_beta != nullptr)
+                {
+                    beta     = input_beta[id[2]];
+                    beta_vec = wrapper::vdup_n(beta, ExactTagType{});
+                }
+
+                if (dwc_bias_in != nullptr)
+                {
+                    dwc_bias_in_scalar = dwc_bias_in[id[2]];
+                }
+
+                auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
+                dwc_bias_out[id[2]]      = (dwc_bias_tmp_scalar * gamma) + beta;
             }
 
-            auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
-            dwc_bias_out[id[2]]      = (dwc_bias_tmp_scalar * gamma) + beta;
-        }
+            int  x             = window_start_x;
+            auto dwc_w_in_ptr  = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
+            auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
+            var_vec            = wrapper::vdup_n(var, ExactTagType{});
+            gamma_vec          = wrapper::vdup_n(gamma, ExactTagType{});
+            rvar_vec           = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
 
-        int  x             = window_start_x;
-        auto dwc_w_in_ptr  = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
-        auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
-        var_vec            = wrapper::vdup_n(var, ExactTagType{});
-        gamma_vec          = wrapper::vdup_n(gamma, ExactTagType{});
-        rvar_vec           = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
+                wn      = wrapper::vmul(wn, rvar_vec);
+                wn      = wrapper::vmul(wn, gamma_vec);
 
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            auto wn = wrapper::vloadq(dwc_w_in_ptr + x);
-            wn      = wrapper::vmul(wn, rvar_vec);
-            wn      = wrapper::vmul(wn, gamma_vec);
+                // Store results
+                wrapper::vstore(dwc_w_out_ptr + x, wn);
+            }
 
-            // Store results
-            wrapper::vstore(dwc_w_out_ptr + x, wn);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
-        }
-    },
-    dwc_w_in, dwc_w_out);
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
+            }
+        },
+        dwc_w_in, dwc_w_out);
 }
 
-void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
-                                            const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nchw_f32(const ITensor *dwc_weights,
+                                            const ITensor *dwc_bias,
+                                            ITensor       *fused_weights,
+                                            ITensor       *fused_bias,
+                                            const ITensor *bn_mean,
+                                            const ITensor *bn_var,
+                                            const ITensor *bn_beta,
+                                            const ITensor *bn_gamma,
+                                            float          epsilon,
+                                            const Window  &window)
 {
-    return fused_batch_normalization_dwc_nchw<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias,
-                                                         bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+    return fused_batch_normalization_dwc_nchw<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+                                                         bn_var, bn_beta, bn_gamma, epsilon, window);
 }
 
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
-void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
-                                            const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nchw_f16(const ITensor *dwc_weights,
+                                            const ITensor *dwc_bias,
+                                            ITensor       *fused_weights,
+                                            ITensor       *fused_bias,
+                                            const ITensor *bn_mean,
+                                            const ITensor *bn_var,
+                                            const ITensor *bn_beta,
+                                            const ITensor *bn_gamma,
+                                            float          epsilon,
+                                            const Window  &window)
 {
-    return fused_batch_normalization_dwc_nchw<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias,
-                                                         bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+    return fused_batch_normalization_dwc_nchw<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+                                                         bn_var, bn_beta, bn_gamma, epsilon, window);
 }
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
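
Aside: this file shows the instantiation pattern used across these kernels — a single templated implementation plus thin, explicitly typed entry points, with the FP16 one compiled only when the target supports half-precision vector arithmetic. A stripped-down sketch (hypothetical names, trivial body, assumes an Arm target):

#include <arm_neon.h> // float16_t is available on FP16-capable Arm targets

template <typename T>
void kernel_impl(const T *in, T *out, int n)
{
    for (int i = 0; i < n; ++i)
    {
        out[i] = in[i]; // placeholder; the real kernels perform the BN fusion
    }
}

void kernel_f32(const float *in, float *out, int n)
{
    kernel_impl<float>(in, out, n);
}

#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
void kernel_f16(const float16_t *in, float16_t *out, int n)
{
    kernel_impl<float16_t>(in, out, n);
}
#endif
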
 
diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp
index 275211f..1d88d3b 100644
--- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp16.cpp
@@ -30,11 +30,19 @@
 {
 namespace cpu
 {
-void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
-                                            const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nhwc_f16(const ITensor *dwc_weights,
+                                            const ITensor *dwc_bias,
+                                            ITensor       *fused_weights,
+                                            ITensor       *fused_bias,
+                                            const ITensor *bn_mean,
+                                            const ITensor *bn_var,
+                                            const ITensor *bn_beta,
+                                            const ITensor *bn_gamma,
+                                            float          epsilon,
+                                            const Window  &window)
 {
-    return fused_batch_normalization_dwc_nhwc<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias,
-                                                         bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+    return fused_batch_normalization_dwc_nhwc<float16_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+                                                         bn_var, bn_beta, bn_gamma, epsilon, window);
 }
 
 } // namespace cpu
diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp
index 67169c5..1f336bb 100644
--- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp
+++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/fp32.cpp
@@ -29,11 +29,19 @@
 {
 namespace cpu
 {
-void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
-                                            const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nhwc_f32(const ITensor *dwc_weights,
+                                            const ITensor *dwc_bias,
+                                            ITensor       *fused_weights,
+                                            ITensor       *fused_bias,
+                                            const ITensor *bn_mean,
+                                            const ITensor *bn_var,
+                                            const ITensor *bn_beta,
+                                            const ITensor *bn_gamma,
+                                            float          epsilon,
+                                            const Window  &window)
 {
-    return fused_batch_normalization_dwc_nhwc<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias,
-                                                         bn_mean, bn_var, bn_beta, bn_gamma, epsilon, window);
+    return fused_batch_normalization_dwc_nhwc<float32_t>(dwc_weights, dwc_bias, fused_weights, fused_bias, bn_mean,
+                                                         bn_var, bn_beta, bn_gamma, epsilon, window);
 }
 
 } // namespace cpu
diff --git a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h
index 6f03862..5b74a7a 100644
--- a/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h
+++ b/src/cpu/kernels/fuse_batch_normalization/nhwc/neon/impl.h
@@ -25,6 +25,7 @@
 #define SRC_CORE_NEON_KERNELS_FUSE_BATCH_NORMALIZATION_IMPL_H
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
@@ -32,8 +33,16 @@
 namespace cpu
 {
 template <typename T>
-void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights, const ITensor *dwc_bias, ITensor *fused_weights, ITensor *fused_bias,
-                                        const ITensor *bn_mean, const ITensor *bn_var, const ITensor *bn_beta, const ITensor *bn_gamma, float epsilon, const Window &window)
+void fused_batch_normalization_dwc_nhwc(const ITensor *dwc_weights,
+                                        const ITensor *dwc_bias,
+                                        ITensor       *fused_weights,
+                                        ITensor       *fused_bias,
+                                        const ITensor *bn_mean,
+                                        const ITensor *bn_var,
+                                        const ITensor *bn_beta,
+                                        const ITensor *bn_gamma,
+                                        float          epsilon,
+                                        const Window  &window)
 {
     using ScalarType   = T;
     const int size     = 16 / dwc_weights->info()->element_size();
@@ -53,13 +62,20 @@
     Iterator dwc_w_in(dwc_weights, win);
     Iterator dwc_w_out(run_in_place_weights ? dwc_weights : fused_weights, win);
 
-    const auto dwc_bias_in  = (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
-    auto       dwc_bias_out = (run_in_place_bias ? dwc_bias_in : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
+    const auto dwc_bias_in =
+        (dwc_bias != nullptr ? reinterpret_cast<ScalarType *>(dwc_bias->ptr_to_element(Coordinates(0, 0))) : nullptr);
+    auto dwc_bias_out =
+        (run_in_place_bias ? dwc_bias_in
+                           : reinterpret_cast<ScalarType *>(fused_bias->ptr_to_element(Coordinates(0, 0))));
 
     const auto input_mean  = reinterpret_cast<const ScalarType *>(bn_mean->ptr_to_element(Coordinates(0, 0)));
     const auto input_var   = reinterpret_cast<const ScalarType *>(bn_var->ptr_to_element(Coordinates(0, 0)));
-    const auto input_gamma = (bn_gamma != nullptr) ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
-    const auto input_beta  = (bn_beta != nullptr) ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;
+    const auto input_gamma = (bn_gamma != nullptr)
+                                 ? reinterpret_cast<const ScalarType *>(bn_gamma->ptr_to_element(Coordinates(0, 0)))
+                                 : nullptr;
+    const auto input_beta  = (bn_beta != nullptr)
+                                 ? reinterpret_cast<const ScalarType *>(bn_beta->ptr_to_element(Coordinates(0, 0)))
+                                 : nullptr;
 
     auto       mean_vec     = wrapper::vdup_n(ScalarType(0), ExactTagType{});
     auto       var_vec      = wrapper::vdup_n(ScalarType(0), ExactTagType{});
@@ -73,81 +89,84 @@
     auto beta               = ScalarType(0.0);
     auto dwc_bias_in_scalar = ScalarType(0);
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            var_vec = wrapper::vloadq(input_var + x);
-            if(input_gamma != nullptr)
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                gamma_vec = wrapper::vloadq(input_gamma + x);
-            }
-
-            if((id[2] == 0) && (id[1] == 0))
-            {
-                mean_vec = wrapper::vloadq(input_mean + x);
-
-                // Construct vectors
-                if(input_beta != nullptr)
+                var_vec = wrapper::vloadq(input_var + x);
+                if (input_gamma != nullptr)
                 {
-                    beta_vec = wrapper::vloadq(input_beta + x);
+                    gamma_vec = wrapper::vloadq(input_gamma + x);
                 }
 
-                if(dwc_bias_in != nullptr)
+                if ((id[2] == 0) && (id[1] == 0))
                 {
-                    dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x);
+                    mean_vec = wrapper::vloadq(input_mean + x);
+
+                    // Construct vectors
+                    if (input_beta != nullptr)
+                    {
+                        beta_vec = wrapper::vloadq(input_beta + x);
+                    }
+
+                    if (dwc_bias_in != nullptr)
+                    {
+                        dwc_bias_vec = wrapper::vloadq(dwc_bias_in + x);
+                    }
+
+                    auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec),
+                                                          wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)));
+                    dwc_bias_tmp_vec      = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec);
+                    wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec);
                 }
 
-                auto dwc_bias_tmp_vec = wrapper::vmul(wrapper::vsub(dwc_bias_vec, mean_vec), wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec)));
-                dwc_bias_tmp_vec      = wrapper::vadd(wrapper::vmul(dwc_bias_tmp_vec, gamma_vec), beta_vec);
-                wrapper::vstore(dwc_bias_out + x, dwc_bias_tmp_vec);
+                auto dwc_w_in_ptr  = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
+                auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
+
+                auto wn  = wrapper::vloadq(dwc_w_in_ptr + x);
+                rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
+                wn       = wrapper::vmul(wn, rvar_vec);
+                wn       = wrapper::vmul(wn, gamma_vec);
+
+                // Store results
+                wrapper::vstore(dwc_w_out_ptr + x, wn);
             }
 
-            auto dwc_w_in_ptr  = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
-            auto dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
-
-            auto wn  = wrapper::vloadq(dwc_w_in_ptr + x);
-            rvar_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
-            wn       = wrapper::vmul(wn, rvar_vec);
-            wn       = wrapper::vmul(wn, gamma_vec);
-
-            // Store results
-            wrapper::vstore(dwc_w_out_ptr + x, wn);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            auto var = input_var[x];
-            if(input_gamma != nullptr)
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
             {
-                gamma = input_gamma[x];
-            }
-
-            if(id[2] == 0 && id[1] == 0)
-            {
-                auto mean = input_mean[x];
-                if(input_beta != nullptr)
+                auto var = input_var[x];
+                if (input_gamma != nullptr)
                 {
-                    beta = input_beta[x];
-                }
-                if(dwc_bias_in != nullptr)
-                {
-                    dwc_bias_in_scalar = dwc_bias_in[x];
+                    gamma = input_gamma[x];
                 }
 
-                auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
-                dwc_bias_out[x]          = (dwc_bias_tmp_scalar * gamma) + beta;
+                if (id[2] == 0 && id[1] == 0)
+                {
+                    auto mean = input_mean[x];
+                    if (input_beta != nullptr)
+                    {
+                        beta = input_beta[x];
+                    }
+                    if (dwc_bias_in != nullptr)
+                    {
+                        dwc_bias_in_scalar = dwc_bias_in[x];
+                    }
+
+                    auto dwc_bias_tmp_scalar = (dwc_bias_in_scalar - mean) / std::sqrt(var + ScalarType(epsilon));
+                    dwc_bias_out[x]          = (dwc_bias_tmp_scalar * gamma) + beta;
+                }
+
+                const auto dwc_w_in_ptr  = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
+                auto       dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
+
+                *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
             }
-
-            const auto dwc_w_in_ptr  = reinterpret_cast<const ScalarType *>(dwc_w_in.ptr());
-            auto       dwc_w_out_ptr = reinterpret_cast<ScalarType *>(dwc_w_out.ptr());
-
-            *(dwc_w_out_ptr + x) = *(dwc_w_in_ptr + x) / std::sqrt(var + ScalarType(epsilon)) * gamma;
-        }
-    },
-    dwc_w_in, dwc_w_out);
+        },
+        dwc_w_in, dwc_w_out);
 }
 } // namespace cpu
 } // namespace arm_compute
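
Aside: the NHWC variant differs from the NCHW one above in that channels are innermost, so the per-channel batch-norm parameters are themselves loaded as vectors (wrapper::vloadq(input_var + x), ...) instead of being broadcast from a single scalar per iteration. A reduced sketch of the weight-scaling step over plain arrays (illustrative names; vrsqrteq_f32 here is the raw NEON estimate, standing in for the library's wrapper::vinvsqrt):

#include <arm_neon.h> // assumes an AArch64/NEON build
#include <cmath>

void scale_weights_nhwc_f32(const float *w, float *w_out, const float *var, const float *gamma,
                            int channels, float epsilon)
{
    const float32x4_t eps_v = vdupq_n_f32(epsilon);
    int c = 0;
    for (; c <= channels - 4; c += 4)
    {
        // Per-channel parameters are contiguous in NHWC, so they vector-load directly
        const float32x4_t rstd = vrsqrteq_f32(vaddq_f32(vld1q_f32(var + c), eps_v));
        const float32x4_t wn   = vmulq_f32(vmulq_f32(vld1q_f32(w + c), rstd), vld1q_f32(gamma + c));
        vst1q_f32(w_out + c, wn);
    }
    for (; c < channels; ++c) // left-over channels
    {
        w_out[c] = w[c] / std::sqrt(var[c] + epsilon) * gamma[c];
    }
}
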
diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp
index 505a371..4d7507a 100644
--- a/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/fp16.cpp
@@ -48,30 +48,32 @@
     Iterator in(src, win);
     Iterator out(dst, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const float16_t *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
-
-        int x = window_start_x;
-        for(; x < (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            float16x8x2_t       alpha_ab = vld2q_f16(out_ptr + x);
-            const float16x8x2_t c        = vld2q_f16(in_ptr + x);
-            // Multiply matrix C by its weight and accumulate
-            alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
-            alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
+            const auto in_ptr  = reinterpret_cast<const float16_t *>(in.ptr());
+            const auto out_ptr = reinterpret_cast<float16_t *>(out.ptr());
 
-            vst2q_f16(out_ptr + x, alpha_ab);
-        }
+            int x = window_start_x;
+            for (; x < (window_end_x - window_step_x); x += window_step_x)
+            {
+                float16x8x2_t       alpha_ab = vld2q_f16(out_ptr + x);
+                const float16x8x2_t c        = vld2q_f16(in_ptr + x);
+                // Multiply matrix C by its weight and accumulate
+                alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16));
+                alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16));
 
-        // Left-over loop
-        for(; x < window_end_x; ++x)
-        {
-            *(out_ptr + x) += *(in_ptr + x) * static_cast<float16_t>(beta);
-        }
-    },
-    in, out);
+                vst2q_f16(out_ptr + x, alpha_ab);
+            }
+
+            // Left-over loop
+            for (; x < window_end_x; ++x)
+            {
+                *(out_ptr + x) += *(in_ptr + x) * static_cast<float16_t>(beta);
+            }
+        },
+        in, out);
 }
 } // namespace
 void neon_fp16_gemm_matrix_add(const ITensor *src, ITensor *dst, const Window &window, float beta)
diff --git a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp
index dd0384c..47de0f3 100644
--- a/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp
+++ b/src/cpu/kernels/gemm_matrix_add/generic/neon/impl.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "src/cpu/kernels/gemm_matrix_add/generic/neon/impl.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
@@ -44,33 +45,35 @@
     Iterator in(src, win);
     Iterator out(dst, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const float *>(in.ptr());
-        const auto out_ptr = reinterpret_cast<float *>(out.ptr());
-
-        int x = window_start_x;
-        for(; x < (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            float32x4x4_t       alpha_ab = vld4q_f32(out_ptr + x);
-            const float32x4x4_t c        = vld4q_f32(in_ptr + x);
+            const auto in_ptr  = reinterpret_cast<const float *>(in.ptr());
+            const auto out_ptr = reinterpret_cast<float *>(out.ptr());
 
-            // Multiply matrix C by its weight and accumulate
-            alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
-            alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
-            alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
-            alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
+            int x = window_start_x;
+            for (; x < (window_end_x - window_step_x); x += window_step_x)
+            {
+                float32x4x4_t       alpha_ab = vld4q_f32(out_ptr + x);
+                const float32x4x4_t c        = vld4q_f32(in_ptr + x);
 
-            vst4q_f32(out_ptr + x, alpha_ab);
-        }
+                // Multiply matrix C by its weight and accumulate
+                alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32);
+                alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32);
+                alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32);
+                alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32);
 
-        // Left-over loop
-        for(; x < window_end_x; ++x)
-        {
-            *(out_ptr + x) += *(in_ptr + x) * beta;
-        }
-    },
-    in, out);
+                vst4q_f32(out_ptr + x, alpha_ab);
+            }
+
+            // Left-over loop
+            for (; x < window_end_x; ++x)
+            {
+                *(out_ptr + x) += *(in_ptr + x) * beta;
+            }
+        },
+        in, out);
 }
 } // namespace cpu
 } // namespace arm_compute
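
Aside: both gemm_matrix_add kernels follow the same structure — a vectorized main loop stepping by window_step_x elements and a scalar left-over loop — to compute dst += beta * src. The shape of that pattern, reduced to a flat array (hypothetical function, f32 only):

#include <arm_neon.h> // assumes a NEON-capable target

void matrix_add_beta_f32(const float *src, float *dst, int n, float beta)
{
    const float32x4_t beta_v = vdupq_n_f32(beta);
    int x = 0;
    for (; x <= n - 4; x += 4)
    {
        // dst = dst + src * beta, four lanes at a time (vmlaq_f32 is a multiply-accumulate)
        vst1q_f32(dst + x, vmlaq_f32(vld1q_f32(dst + x), vld1q_f32(src + x), beta_v));
    }
    for (; x < n; ++x) // left-over elements
    {
        dst[x] += src[x] * beta;
    }
}
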
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
index 8fd79f9..60fda51 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp16.cpp
@@ -32,7 +32,8 @@
 {
 namespace cpu
 {
-void vector_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+void vector_matrix_multiply_f16(
+    const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
 {
     const auto width_matrix_b  = static_cast<int>(dst->info()->dimension(0));
     const auto in_b_stride     = static_cast<int>(rhs->info()->strides_in_bytes()[1] / rhs->info()->element_size());
@@ -42,7 +43,8 @@
     const int window_start_x = 32 * info.thread_id;
     const int window_step_x  = 32 * info.num_threads;
     const int window_end_x   = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
-    ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x, " (window_end_x - window_start_x) must be a multiple of window_step_x");
+    ARM_COMPUTE_ERROR_ON_MSG((window_end_x - window_start_x) % window_step_x,
+                             " (window_end_x - window_start_x) must be a multiple of window_step_x");
 
     Window win_out(window);
     win_out.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -55,7 +57,7 @@
     Window win_b;
     // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
     // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-    if(rhs->info()->num_dimensions() >= 3)
+    if (rhs->info()->num_dimensions() >= 3)
     {
         win_b = window;
     }
@@ -70,169 +72,172 @@
 
     const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
 
-    execute_window_loop(win_out, [&](const Coordinates &)
-    {
-        int x = window_start_x;
-        // Here we don't check for x lower than or equal to (window_end_x - window_step_x) because
-        // window_end_x, computed above, may cause out-of-bound writes to the dst.
-        for(; x < (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win_out,
+        [&](const Coordinates &)
         {
-            if(x > width_matrix_b)
+            int x = window_start_x;
+            // Here we don't check for x lower than or equal to (window_end_x - window_step_x) because
+            // window_end_x, computed above, may cause out-of-bound writes to the dst.
+            for (; x < (window_end_x - window_step_x); x += window_step_x)
             {
-                return;
-            }
-
-            auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
-
-            float16x8_t acc0 = vdupq_n_f16(0.f);
-            float16x8_t acc1 = vdupq_n_f16(0.f);
-            float16x8_t acc2 = vdupq_n_f16(0.f);
-            float16x8_t acc3 = vdupq_n_f16(0.f);
-
-            auto             vec_a          = reinterpret_cast<const float16_t *>(ina.ptr());
-            const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
-            for(; vec_a <= (vec_a_end_addr - 4);)
-            {
-                const float16x4_t a0l = vld1_f16(vec_a);
-
-                float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
-                float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
-                float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
-                float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
-                float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
-                float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
-                float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
-                float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
-
-                acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
-                acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
-                acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
-                acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
-                acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
-                acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
-                acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
-                acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));
-
-                matrix_b += 2 * in_b_stride;
-
-                b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
-                b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
-                b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
-                b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
-                b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
-                b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
-                b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
-                b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
-
-                acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
-                acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
-                acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
-                acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
-                acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
-                acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
-                acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
-                acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));
-
-                vec_a += 4;
-                matrix_b += 2 * in_b_stride;
-            }
-
-            for(; vec_a < vec_a_end_addr; ++vec_a)
-            {
-                const float16_t   a0  = *vec_a;
-                const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
-                const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
-                const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
-                const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
-
-                acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
-                acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
-                acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
-                acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));
-
-                matrix_b += in_b_stride;
-            }
-
-            // Multiply by the weight of the matrix product (alpha)
-            if(multiply_alpha)
-            {
-                acc0 = vmulq_f16(acc0, alpha_f16);
-                acc1 = vmulq_f16(acc1, alpha_f16);
-                acc2 = vmulq_f16(acc2, alpha_f16);
-                acc3 = vmulq_f16(acc3, alpha_f16);
-            }
-
-            auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
-
-            vst1q_f16(vec_out + 0, acc0);
-            vst1q_f16(vec_out + 8, acc1);
-            vst1q_f16(vec_out + 16, acc2);
-            vst1q_f16(vec_out + 24, acc3);
-        }
-
-        for(; x < window_end_x; ++x)
-        {
-            if(x > width_matrix_b)
-            {
-                return;
-            }
-
-            auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
-
-            float16x4_t vacc = vdup_n_f16(0.f);
-
-            auto             vec_a          = reinterpret_cast<const float16_t *>(ina.ptr());
-            const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
-            for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
-            {
-                const float16x4_t a0l = vld1_f16(vec_a);
-
-                const float16x4_t b_col =
+                if (x > width_matrix_b)
                 {
-                    *(matrix_b + 0 * in_b_stride),
-                    *(matrix_b + 1 * in_b_stride),
-                    *(matrix_b + 2 * in_b_stride),
-                    *(matrix_b + 3 * in_b_stride),
-                };
+                    return;
+                }
 
-                vacc = vadd_f16(vacc, vmul_f16(a0l, b_col));
+                auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
 
-                matrix_b += 4 * in_b_stride;
+                float16x8_t acc0 = vdupq_n_f16(0.f);
+                float16x8_t acc1 = vdupq_n_f16(0.f);
+                float16x8_t acc2 = vdupq_n_f16(0.f);
+                float16x8_t acc3 = vdupq_n_f16(0.f);
+
+                auto             vec_a          = reinterpret_cast<const float16_t *>(ina.ptr());
+                const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
+                for (; vec_a <= (vec_a_end_addr - 4);)
+                {
+                    const float16x4_t a0l = vld1_f16(vec_a);
+
+                    float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+                    float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+                    float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+                    float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+                    float16x8_t b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+                    float16x8_t b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+                    float16x8_t b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+                    float16x8_t b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+                    acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 0));
+                    acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 0));
+                    acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 0));
+                    acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 0));
+                    acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 1));
+                    acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 1));
+                    acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 1));
+                    acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 1));
+
+                    matrix_b += 2 * in_b_stride;
+
+                    b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+                    b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+                    b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+                    b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+                    b10 = vld1q_f16(matrix_b + 0 + 1 * in_b_stride);
+                    b11 = vld1q_f16(matrix_b + 8 + 1 * in_b_stride);
+                    b12 = vld1q_f16(matrix_b + 16 + 1 * in_b_stride);
+                    b13 = vld1q_f16(matrix_b + 24 + 1 * in_b_stride);
+
+                    acc0 = vaddq_f16(acc0, vmulq_lane_f16(b00, a0l, 2));
+                    acc1 = vaddq_f16(acc1, vmulq_lane_f16(b01, a0l, 2));
+                    acc2 = vaddq_f16(acc2, vmulq_lane_f16(b02, a0l, 2));
+                    acc3 = vaddq_f16(acc3, vmulq_lane_f16(b03, a0l, 2));
+                    acc0 = vaddq_f16(acc0, vmulq_lane_f16(b10, a0l, 3));
+                    acc1 = vaddq_f16(acc1, vmulq_lane_f16(b11, a0l, 3));
+                    acc2 = vaddq_f16(acc2, vmulq_lane_f16(b12, a0l, 3));
+                    acc3 = vaddq_f16(acc3, vmulq_lane_f16(b13, a0l, 3));
+
+                    vec_a += 4;
+                    matrix_b += 2 * in_b_stride;
+                }
+
+                for (; vec_a < vec_a_end_addr; ++vec_a)
+                {
+                    const float16_t   a0  = *vec_a;
+                    const float16x8_t b00 = vld1q_f16(matrix_b + 0 + 0 * in_b_stride);
+                    const float16x8_t b01 = vld1q_f16(matrix_b + 8 + 0 * in_b_stride);
+                    const float16x8_t b02 = vld1q_f16(matrix_b + 16 + 0 * in_b_stride);
+                    const float16x8_t b03 = vld1q_f16(matrix_b + 24 + 0 * in_b_stride);
+
+                    acc0 = vaddq_f16(acc0, vmulq_n_f16(b00, a0));
+                    acc1 = vaddq_f16(acc1, vmulq_n_f16(b01, a0));
+                    acc2 = vaddq_f16(acc2, vmulq_n_f16(b02, a0));
+                    acc3 = vaddq_f16(acc3, vmulq_n_f16(b03, a0));
+
+                    matrix_b += in_b_stride;
+                }
+
+                // Multiply by the weight of the matrix product (alpha)
+                if (multiply_alpha)
+                {
+                    acc0 = vmulq_f16(acc0, alpha_f16);
+                    acc1 = vmulq_f16(acc1, alpha_f16);
+                    acc2 = vmulq_f16(acc2, alpha_f16);
+                    acc3 = vmulq_f16(acc3, alpha_f16);
+                }
+
+                auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
+
+                vst1q_f16(vec_out + 0, acc0);
+                vst1q_f16(vec_out + 8, acc1);
+                vst1q_f16(vec_out + 16, acc2);
+                vst1q_f16(vec_out + 24, acc3);
             }
 
-            float16_t acc = vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3);
-
-            for(; vec_a < vec_a_end_addr; ++vec_a)
+            for (; x < window_end_x; ++x)
             {
-                const float16_t a0  = *vec_a;
-                const float16_t b00 = *matrix_b;
+                if (x > width_matrix_b)
+                {
+                    return;
+                }
 
-                acc += b00 * a0;
+                auto matrix_b = reinterpret_cast<const float16_t *>(inb.ptr()) + x;
 
-                matrix_b += in_b_stride;
+                float16x4_t vacc = vdup_n_f16(0.f);
+
+                auto             vec_a          = reinterpret_cast<const float16_t *>(ina.ptr());
+                const float16_t *vec_a_end_addr = vec_a + num_elems_vec_a;
+                for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
+                {
+                    const float16x4_t a0l = vld1_f16(vec_a);
+
+                    const float16x4_t b_col = {
+                        *(matrix_b + 0 * in_b_stride),
+                        *(matrix_b + 1 * in_b_stride),
+                        *(matrix_b + 2 * in_b_stride),
+                        *(matrix_b + 3 * in_b_stride),
+                    };
+
+                    vacc = vadd_f16(vacc, vmul_f16(a0l, b_col));
+
+                    matrix_b += 4 * in_b_stride;
+                }
+
+                float16_t acc =
+                    vget_lane_f16(vacc, 0) + vget_lane_f16(vacc, 1) + vget_lane_f16(vacc, 2) + vget_lane_f16(vacc, 3);
+
+                for (; vec_a < vec_a_end_addr; ++vec_a)
+                {
+                    const float16_t a0  = *vec_a;
+                    const float16_t b00 = *matrix_b;
+
+                    acc += b00 * a0;
+
+                    matrix_b += in_b_stride;
+                }
+
+                // Multiply by the weight of the matrix product (alpha)
+                if (multiply_alpha)
+                {
+                    acc *= static_cast<float16_t>(alpha);
+                }
+
+                auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
+
+                *(vec_out) = acc;
             }
-
-            // Multiply by the weight of the matrix product (alpha)
-            if(multiply_alpha)
-            {
-                acc *= static_cast<float16_t>(alpha);
-            }
-
-            auto vec_out = reinterpret_cast<float16_t *>(out.ptr()) + x;
-
-            *(vec_out) = acc;
-        }
-    },
-    ina, inb, out);
+        },
+        ina, inb, out);
 }
 
-void matrix_matrix_multiply_f16(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+void matrix_matrix_multiply_f16(
+    const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
 {
     ARM_COMPUTE_UNUSED(info);
-    const int    out_width            = static_cast<int>(dst->info()->dimension(0));
-    const int    out_height           = static_cast<int>(dst->info()->dimension(1));
-    const size_t in_b_stride          = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
-    const size_t out_stride           = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
+    const int    out_width   = static_cast<int>(dst->info()->dimension(0));
+    const int    out_height  = static_cast<int>(dst->info()->dimension(1));
+    const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
+    const size_t out_stride  = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
     const int    num_elems_matrix_b_x = rhs->info()->dimension(0);
 
     // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the dst matrix
@@ -243,7 +248,7 @@
     Window win_b;
     // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
     // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-    if(rhs->info()->num_dimensions() >= 3)
+    if (rhs->info()->num_dimensions() >= 3)
     {
         win_b = window;
     }
@@ -259,22 +264,16 @@
 
     const float16x8_t alpha_f16 = vdupq_n_f16(alpha);
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto   *mtx_a0  = reinterpret_cast<const float16_t *>(ina.ptr());
-        const auto   *mtx_b0  = reinterpret_cast<const float16_t *>(inb.ptr());
-        auto         *mtx_out = reinterpret_cast<float16_t *>(out.ptr());
-        float16x8x4_t c =
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            {
-                vdupq_n_f16(0.f),
-                vdupq_n_f16(0.f),
-                vdupq_n_f16(0.f),
-                vdupq_n_f16(0.f)
-            }
-        };
+            const auto   *mtx_a0  = reinterpret_cast<const float16_t *>(ina.ptr());
+            const auto   *mtx_b0  = reinterpret_cast<const float16_t *>(inb.ptr());
+            auto         *mtx_out = reinterpret_cast<float16_t *>(out.ptr());
+            float16x8x4_t c       = {{vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f), vdupq_n_f16(0.f)}};
 
-        /*
+            /*
         This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values)
              |a00 a01 a02 a03 | a04 a05 a06 a07|
              |a10 a11 a12 a13 | a14 a15 a16 a17|
@@ -302,111 +301,118 @@
 
         The size of the dst tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size.
         */
-        const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
+            const float16_t *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
 
-        for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
+            for (; mtx_b0 <= (mtx_b0_end_addr - 32);)
 
-        {
-            const float16x8_t p00 = vld1q_f16(mtx_a0);
-            const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);
-
-            const float16x8_t q00 = vld1q_f16(mtx_b0);
-            const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
-            const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
-            const float16x8_t q06 = vld1q_f16(mtx_b0 + 24);
-
-            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0)));
-            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1)));
-            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2)));
-            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3)));
-
-            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4)));
-            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5)));
-            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6)));
-            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7)));
-
-            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0)));
-            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1)));
-            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2)));
-            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3)));
-
-            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4)));
-            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
-            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
-            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));
-
-            mtx_a0 += 16;
-            mtx_b0 += 32;
-        }
-
-        for(; mtx_b0 < mtx_b0_end_addr;)
-
-        {
-            const float16x4_t p00 = vld1_f16(mtx_a0);
-            const float16x8_t q00 = vld1q_f16(mtx_b0);
-
-            c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0)));
-            c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1)));
-            c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2)));
-            c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3)));
-
-            mtx_a0 += 4;
-            mtx_b0 += 8;
-        }
-
-        if(multiply_alpha)
-        {
-            c.val[0] = vmulq_f16(c.val[0], alpha_f16);
-            c.val[1] = vmulq_f16(c.val[1], alpha_f16);
-            c.val[2] = vmulq_f16(c.val[2], alpha_f16);
-            c.val[3] = vmulq_f16(c.val[3], alpha_f16);
-        }
-
-        if(id.x() < (out_width - 8))
-        {
-            vst1q_f16(mtx_out, c.val[0]);
-            if(id.y() + 1 < out_height)
             {
-                vst1q_f16(mtx_out + 1 * out_stride, c.val[1]);
-                if(id.y() + 2 < out_height)
-                {
-                    vst1q_f16(mtx_out + 2 * out_stride, c.val[2]);
-                    if(id.y() + 3 < out_height)
-                    {
-                        vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
-                    }
-                }
+                const float16x8_t p00 = vld1q_f16(mtx_a0);
+                const float16x8_t p02 = vld1q_f16(mtx_a0 + 8);
+
+                const float16x8_t q00 = vld1q_f16(mtx_b0);
+                const float16x8_t q02 = vld1q_f16(mtx_b0 + 8);
+                const float16x8_t q04 = vld1q_f16(mtx_b0 + 16);
+                const float16x8_t q06 = vld1q_f16(mtx_b0 + 24);
+
+                c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0)));
+                c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1)));
+                c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2)));
+                c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3)));
+
+                c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4)));
+                c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5)));
+                c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6)));
+                c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7)));
+
+                c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0)));
+                c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1)));
+                c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2)));
+                c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3)));
+
+                c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4)));
+                c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5)));
+                c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6)));
+                c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7)));
+
+                mtx_a0 += 16;
+                mtx_b0 += 32;
             }
-        }
-        else
-        {
-            // Left-over columns
-            const int columns_left = out_width - id.x();
-            for(int x = 0; x < columns_left; ++x)
+
+            for (; mtx_b0 < mtx_b0_end_addr;)
+
             {
-                *(mtx_out + x) = c.val[0][x];
-                if(id.y() + 1 < out_height)
+                const float16x4_t p00 = vld1_f16(mtx_a0);
+                const float16x8_t q00 = vld1q_f16(mtx_b0);
+
+                c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vget_lane_f16(p00, 0)));
+                c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vget_lane_f16(p00, 1)));
+                c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vget_lane_f16(p00, 2)));
+                c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vget_lane_f16(p00, 3)));
+
+                mtx_a0 += 4;
+                mtx_b0 += 8;
+            }
+
+            if (multiply_alpha)
+            {
+                c.val[0] = vmulq_f16(c.val[0], alpha_f16);
+                c.val[1] = vmulq_f16(c.val[1], alpha_f16);
+                c.val[2] = vmulq_f16(c.val[2], alpha_f16);
+                c.val[3] = vmulq_f16(c.val[3], alpha_f16);
+            }
+
+            if (id.x() < (out_width - 8))
+            {
+                vst1q_f16(mtx_out, c.val[0]);
+                if (id.y() + 1 < out_height)
                 {
-                    *(mtx_out + x + 1 * out_stride) = c.val[1][x];
-                    if(id.y() + 2 < out_height)
+                    vst1q_f16(mtx_out + 1 * out_stride, c.val[1]);
+                    if (id.y() + 2 < out_height)
                     {
-                        *(mtx_out + x + 2 * out_stride) = c.val[2][x];
-                        if(id.y() + 3 < out_height)
+                        vst1q_f16(mtx_out + 2 * out_stride, c.val[2]);
+                        if (id.y() + 3 < out_height)
                         {
-                            *(mtx_out + x + 3 * out_stride) = c.val[3][x];
+                            vst1q_f16(mtx_out + 3 * out_stride, c.val[3]);
                         }
                     }
                 }
             }
-        }
-    },
-    ina, inb, out);
+            else
+            {
+                // Left-over columns
+                const int columns_left = out_width - id.x();
+                for (int x = 0; x < columns_left; ++x)
+                {
+                    *(mtx_out + x) = c.val[0][x];
+                    if (id.y() + 1 < out_height)
+                    {
+                        *(mtx_out + x + 1 * out_stride) = c.val[1][x];
+                        if (id.y() + 2 < out_height)
+                        {
+                            *(mtx_out + x + 2 * out_stride) = c.val[2][x];
+                            if (id.y() + 3 < out_height)
+                            {
+                                *(mtx_out + x + 3 * out_stride) = c.val[3][x];
+                            }
+                        }
+                    }
+                }
+            }
+        },
+        ina, inb, out);
 }
 
-void neon_fp16_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector)
+void neon_fp16_gemm_matrix_mul(const ITensor    *lhs,
+                               const ITensor    *rhs,
+                               ITensor          *dst,
+                               const Window     &window,
+                               const ThreadInfo &info,
+                               float             alpha,
+                               const bool        is_dst_vector)
 {
-    return (is_dst_vector) ? vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha) : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha);
+    return (is_dst_vector) ? vector_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha)
+                           : matrix_matrix_multiply_f16(lhs, rhs, dst, window, info, alpha);
 }
-} // namespce cpu
+} // namespace cpu
 } // namespace arm_compute
 #endif //__ARM_FEATURE_FP16_VECTOR_ARITHMETIC
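
The FP16 kernel above ends with a guarded 4x8 block store: nested id.y() checks trim rows, and an element-wise path handles leftover columns. A minimal scalar sketch of that store, assuming a plain float accumulator for brevity (the function name and signature here are illustrative, not library API):

    #include <algorithm>
    #include <cstddef>

    // Scalar model of the guarded 4x8 block store: write only the rows and
    // columns of the accumulator 'c' that fall inside the dst bounds.
    inline void store_block_guarded(float *mtx_out, std::size_t out_stride,
                                    const float c[4][8], int x0, int y0,
                                    int out_width, int out_height)
    {
        const int cols = std::min(8, out_width - x0);
        const int rows = std::min(4, out_height - y0);
        for (int r = 0; r < rows; ++r)
        {
            for (int col = 0; col < cols; ++col)
            {
                mtx_out[r * out_stride + col] = c[r][col];
            }
        }
    }
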
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp
index 9c1f6f3..e12a312 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/fp32.cpp
@@ -28,9 +28,16 @@
 {
 namespace cpu
 {
-void neon_fp32_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector)
+void neon_fp32_gemm_matrix_mul(const ITensor    *lhs,
+                               const ITensor    *rhs,
+                               ITensor          *dst,
+                               const Window     &window,
+                               const ThreadInfo &info,
+                               float             alpha,
+                               const bool        is_dst_vector)
 {
-    return (is_dst_vector) ? vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha) : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha);
+    return (is_dst_vector) ? vector_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha)
+                           : matrix_matrix_multiply_f32(lhs, rhs, dst, window, info, alpha);
 }
-} // namespce cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace cpu
+} // namespace arm_compute
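
The fp32 and fp16 entry points above are thin dispatchers on is_dst_vector. A hedged usage sketch, where run_gemm_f32 and the shape test are hypothetical rather than library API, could derive the flag from the dst shape:

    // Hypothetical caller: take the vector (GEMV) path when dst has a single row.
    void run_gemm_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst,
                      const Window &window, const ThreadInfo &info, float alpha)
    {
        const bool is_dst_vector = (dst->info()->dimension(1) == 1);
        neon_fp32_gemm_matrix_mul(lhs, rhs, dst, window, info, alpha, is_dst_vector);
    }
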
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
index 0051d3d..404d070 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h"
+
 #include "src/core/utils/helpers/float_ops.h"
 
 #include <arm_neon.h>
@@ -31,10 +32,12 @@
 {
 namespace cpu
 {
-void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+void vector_matrix_multiply_f32(
+    const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
 {
-    const auto width_matrix_b  = static_cast<int>(dst->info()->dimension(0));
-    const auto in_b_stride     = static_cast<int>(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()));
+    const auto width_matrix_b = static_cast<int>(dst->info()->dimension(0));
+    const auto in_b_stride =
+        static_cast<int>(rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type()));
     const auto num_elems_vec_a = static_cast<int>(lhs->info()->dimension(0));
 
     // The implementation computes 16 elements per iteration
@@ -54,7 +57,7 @@
     Window win_b;
     // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
     // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-    if(rhs->info()->num_dimensions() >= 3)
+    if (rhs->info()->num_dimensions() >= 3)
     {
         win_b = window;
     }
@@ -69,209 +72,220 @@
 
     const float32x4_t alpha_f32 = vdupq_n_f32(alpha);
 
-    execute_window_loop(win_out, [&](const Coordinates &)
-    {
-        int x = window_start_x;
-        // Here we don't check for x lower equal than (window_end_x - window_step_x) because of
-        // window_end_x is computed above which may cause out-of-bound writes to the dst.
-        for(; x < (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win_out,
+        [&](const Coordinates &)
         {
-            if(x > width_matrix_b)
+            int x = window_start_x;
+            // Here we don't check for x lower than or equal to (window_end_x - window_step_x) because
+            // window_end_x, computed above, may cause out-of-bound writes to the dst.
+            for (; x < (window_end_x - window_step_x); x += window_step_x)
             {
-                return;
-            }
-
-            float32x4_t acc0 = vdupq_n_f32(0.f);
-            float32x4_t acc1 = vdupq_n_f32(0.f);
-            float32x4_t acc2 = vdupq_n_f32(0.f);
-            float32x4_t acc3 = vdupq_n_f32(0.f);
-
-            auto vec_a    = reinterpret_cast<const float *>(ina.ptr());
-            auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
-
-#if __arm__
-            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
-            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
-            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
-#endif /* __arm__ */
-
-            auto vec_a_end_addr = vec_a + num_elems_vec_a;
-            for(; vec_a <= (vec_a_end_addr - 4);)
-            {
-                float32x2_t a0l = vld1_f32(vec_a);
-
-                float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
-                float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
-                float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
-                float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
-
-                float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
-                float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
-                float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
-                float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
-
-#if __arm__
-                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
-                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
-                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
-                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
-                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
-#endif /* __arm__ */
-
-                acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
-                acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
-                acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
-                acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
-
-                acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
-                acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
-                acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
-                acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
-
-                vec_a += 2;
-                matrix_b += 2 * in_b_stride;
-
-                a0l = vld1_f32(vec_a);
-
-                b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
-                b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
-                b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
-                b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
-
-                b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
-                b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
-                b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
-                b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
-
-                acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
-                acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
-                acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
-                acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
-
-                acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
-                acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
-                acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
-                acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
-
-                vec_a += 2;
-                matrix_b += 2 * in_b_stride;
-            }
-
-            for(; vec_a < vec_a_end_addr; ++vec_a)
-            {
-                const float a0 = *vec_a;
-
-                const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
-                const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
-                const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
-                const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
-
-                acc0 = vmlaq_n_f32(acc0, b00, a0);
-                acc1 = vmlaq_n_f32(acc1, b01, a0);
-                acc2 = vmlaq_n_f32(acc2, b02, a0);
-                acc3 = vmlaq_n_f32(acc3, b03, a0);
-
-                matrix_b += in_b_stride;
-            }
-
-            // Multiply by the weight of matrix product (alpha)
-            if(multiply_alpha)
-            {
-                acc0 = vmulq_f32(acc0, alpha_f32);
-                acc1 = vmulq_f32(acc1, alpha_f32);
-                acc2 = vmulq_f32(acc2, alpha_f32);
-                acc3 = vmulq_f32(acc3, alpha_f32);
-            }
-
-            const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
-
-            vst1q_f32(vec_out + 0, acc0);
-            vst1q_f32(vec_out + 4, acc1);
-            vst1q_f32(vec_out + 8, acc2);
-            vst1q_f32(vec_out + 12, acc3);
-        }
-
-        // Left-over loop
-        for(; x < window_end_x; ++x)
-        {
-            if(x > width_matrix_b)
-            {
-                return;
-            }
-
-            float32x4_t vacc = vdupq_n_f32(0.f);
-
-            auto vec_a    = reinterpret_cast<const float *>(ina.ptr());
-            auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
-
-#if __arm__
-            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
-            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
-            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
-#endif /* __arm__ */
-
-            auto vec_a_end_addr = vec_a + num_elems_vec_a;
-            for(; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
-            {
-                const float32x4_t a0l = vld1q_f32(vec_a);
-
-                const float32x4_t b_col =
+                if (x > width_matrix_b)
                 {
-                    *(matrix_b + 0 * in_b_stride),
-                    *(matrix_b + 1 * in_b_stride),
-                    *(matrix_b + 2 * in_b_stride),
-                    *(matrix_b + 3 * in_b_stride),
-                };
+                    return;
+                }
+
+                float32x4_t acc0 = vdupq_n_f32(0.f);
+                float32x4_t acc1 = vdupq_n_f32(0.f);
+                float32x4_t acc2 = vdupq_n_f32(0.f);
+                float32x4_t acc3 = vdupq_n_f32(0.f);
+
+                auto vec_a    = reinterpret_cast<const float *>(ina.ptr());
+                auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
 
 #if __arm__
                 asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
-                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
-                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
-                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
-                asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
 #endif /* __arm__ */
 
-                vacc = vmlaq_f32(vacc, b_col, a0l);
+                auto vec_a_end_addr = vec_a + num_elems_vec_a;
+                for (; vec_a <= (vec_a_end_addr - 4);)
+                {
+                    float32x2_t a0l = vld1_f32(vec_a);
 
-                matrix_b += 4 * in_b_stride;
+                    float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+                    float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+                    float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+                    float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+                    float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+                    float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+                    float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+                    float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+#if __arm__
+                    asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+                    asm volatile(
+                        "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+                    asm volatile(
+                        "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+                    asm volatile(
+                        "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+                    asm volatile(
+                        "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+#endif /* __arm__ */
+
+                    acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+                    acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+                    acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+                    acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+                    acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+                    acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+                    acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+                    acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+                    vec_a += 2;
+                    matrix_b += 2 * in_b_stride;
+
+                    a0l = vld1_f32(vec_a);
+
+                    b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+                    b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+                    b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+                    b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+                    b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride);
+                    b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride);
+                    b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride);
+                    b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride);
+
+                    acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0);
+                    acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0);
+                    acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0);
+                    acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0);
+
+                    acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1);
+                    acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1);
+                    acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1);
+                    acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1);
+
+                    vec_a += 2;
+                    matrix_b += 2 * in_b_stride;
+                }
+
+                for (; vec_a < vec_a_end_addr; ++vec_a)
+                {
+                    const float a0 = *vec_a;
+
+                    const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride);
+                    const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride);
+                    const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride);
+                    const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride);
+
+                    acc0 = vmlaq_n_f32(acc0, b00, a0);
+                    acc1 = vmlaq_n_f32(acc1, b01, a0);
+                    acc2 = vmlaq_n_f32(acc2, b02, a0);
+                    acc3 = vmlaq_n_f32(acc3, b03, a0);
+
+                    matrix_b += in_b_stride;
+                }
+
+                // Multiply by the weight of matrix product (alpha)
+                if (multiply_alpha)
+                {
+                    acc0 = vmulq_f32(acc0, alpha_f32);
+                    acc1 = vmulq_f32(acc1, alpha_f32);
+                    acc2 = vmulq_f32(acc2, alpha_f32);
+                    acc3 = vmulq_f32(acc3, alpha_f32);
+                }
+
+                const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
+
+                vst1q_f32(vec_out + 0, acc0);
+                vst1q_f32(vec_out + 4, acc1);
+                vst1q_f32(vec_out + 8, acc2);
+                vst1q_f32(vec_out + 12, acc3);
             }
 
-            float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) + vgetq_lane_f32(vacc, 3);
-
-            for(; vec_a < vec_a_end_addr; ++vec_a)
+            // Left-over loop
+            for (; x < window_end_x; ++x)
             {
-                const float a0 = *vec_a;
+                if (x > width_matrix_b)
+                {
+                    return;
+                }
 
-                const float b00 = *matrix_b;
+                float32x4_t vacc = vdupq_n_f32(0.f);
 
-                acc += b00 * a0;
+                auto vec_a    = reinterpret_cast<const float *>(ina.ptr());
+                auto matrix_b = reinterpret_cast<const float *>(inb.ptr()) + x;
 
-                matrix_b += in_b_stride;
+#if __arm__
+                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b)));
+                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + in_b_stride)));
+#endif /* __arm__ */
+
+                auto vec_a_end_addr = vec_a + num_elems_vec_a;
+                for (; vec_a <= (vec_a_end_addr - 4); vec_a += 4)
+                {
+                    const float32x4_t a0l = vld1q_f32(vec_a);
+
+                    const float32x4_t b_col = {
+                        *(matrix_b + 0 * in_b_stride),
+                        *(matrix_b + 1 * in_b_stride),
+                        *(matrix_b + 2 * in_b_stride),
+                        *(matrix_b + 3 * in_b_stride),
+                    };
+
+#if __arm__
+                    asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(vec_a)));
+                    asm volatile(
+                        "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 1 * in_b_stride)));
+                    asm volatile(
+                        "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 2 * in_b_stride)));
+                    asm volatile(
+                        "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 3 * in_b_stride)));
+                    asm volatile(
+                        "PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(matrix_b + 4 * in_b_stride)));
+#endif /* __arm__ */
+
+                    vacc = vmlaq_f32(vacc, b_col, a0l);
+
+                    matrix_b += 4 * in_b_stride;
+                }
+
+                float acc = vgetq_lane_f32(vacc, 0) + vgetq_lane_f32(vacc, 1) + vgetq_lane_f32(vacc, 2) +
+                            vgetq_lane_f32(vacc, 3);
+
+                for (; vec_a < vec_a_end_addr; ++vec_a)
+                {
+                    const float a0 = *vec_a;
+
+                    const float b00 = *matrix_b;
+
+                    acc += b00 * a0;
+
+                    matrix_b += in_b_stride;
+                }
+
+                // Multiply by the weight of matrix product (alpha)
+                if (multiply_alpha)
+                {
+                    acc *= alpha;
+                }
+
+                const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
+
+                *vec_out = acc;
             }
-
-            // Multiply by the weight of matrix product (alpha)
-            if(multiply_alpha)
-            {
-                acc *= alpha;
-            }
-
-            const auto vec_out = reinterpret_cast<float *>(out.ptr()) + x;
-
-            *vec_out = acc;
-        }
-    },
-    ina, inb, out);
+        },
+        ina, inb, out);
 }
 
-void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
+void matrix_matrix_multiply_f32(
+    const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha)
 {
     ARM_COMPUTE_UNUSED(info);
-    const int    out_width            = static_cast<int>(dst->info()->dimension(0));
-    const int    out_height           = static_cast<int>(dst->info()->dimension(1));
-    const size_t in_b_stride          = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
-    const size_t out_stride1          = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
-    const size_t out_stride2          = out_stride1 * 2;
-    const size_t out_stride3          = out_stride1 * 3;
+    const int    out_width   = static_cast<int>(dst->info()->dimension(0));
+    const int    out_height  = static_cast<int>(dst->info()->dimension(1));
+    const size_t in_b_stride = rhs->info()->strides_in_bytes()[1] / data_size_from_type(rhs->info()->data_type());
+    const size_t out_stride1 = dst->info()->strides_in_bytes()[1] / data_size_from_type(dst->info()->data_type());
+    const size_t out_stride2 = out_stride1 * 2;
+    const size_t out_stride3 = out_stride1 * 3;
     const int    num_elems_matrix_b_x = rhs->info()->dimension(0);
 
     // Set step_x and step_y for matrix A. Scale the Y range by a factor of 4 as the interleaved input matrix A has a quarter of the rows of the dst matrix
@@ -282,7 +296,7 @@
     Window win_b;
     // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
     // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-    if(rhs->info()->num_dimensions() >= 3)
+    if (rhs->info()->num_dimensions() >= 3)
     {
         win_b = window;
     }
@@ -302,338 +316,340 @@
     // The implementation assumes that matrix A and matrix B have been reshaped with CpuGemmInterleave4x4 and CpuGemmTranspose1xW respectively
     // The reshaping of the matrices yields a cache-friendly implementation and avoids the data re-arrangements needed for computing 16x4 elements per iteration
     // All the values needed for computing a single 4x4 block will be read from consecutive memory positions
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
-        auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
-        auto mtx_b1 = mtx_b0 + in_b_stride;
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
+            auto mtx_a0 = reinterpret_cast<const float *>(ina.ptr());
+            auto mtx_b0 = reinterpret_cast<const float *>(inb.ptr());
+            auto mtx_b1 = mtx_b0 + in_b_stride;
 
-        float32x4_t acc00 = vdupq_n_f32(0.f);
-        float32x4_t acc10 = vdupq_n_f32(0.f);
-        float32x4_t acc20 = vdupq_n_f32(0.f);
-        float32x4_t acc30 = vdupq_n_f32(0.f);
+            float32x4_t acc00 = vdupq_n_f32(0.f);
+            float32x4_t acc10 = vdupq_n_f32(0.f);
+            float32x4_t acc20 = vdupq_n_f32(0.f);
+            float32x4_t acc30 = vdupq_n_f32(0.f);
 
-        float32x4_t acc01 = vdupq_n_f32(0.f);
-        float32x4_t acc11 = vdupq_n_f32(0.f);
-        float32x4_t acc21 = vdupq_n_f32(0.f);
-        float32x4_t acc31 = vdupq_n_f32(0.f);
+            float32x4_t acc01 = vdupq_n_f32(0.f);
+            float32x4_t acc11 = vdupq_n_f32(0.f);
+            float32x4_t acc21 = vdupq_n_f32(0.f);
+            float32x4_t acc31 = vdupq_n_f32(0.f);
 
 #if __arm__
-        asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
-        asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
-        asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+            asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
 #endif /* __arm__ */
 
-        auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
-        for(; mtx_b0 <= (mtx_b0_end_addr - 32);)
-        {
-            float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
-            float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
-            float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
-            float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
-
-            float32x4_t b00 = vld1q_f32(mtx_b0);
-            float32x4_t b10 = vld1q_f32(mtx_b1);
-            float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
-            float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
-
-#if __arm__
-            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
-            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
-            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif /* __arm__ */
-
-            // 4x4 block 0
-            acc00 = vmlaq_f32(acc00, b00, a0);
-            acc10 = vmlaq_f32(acc10, b00, a1);
-            acc20 = vmlaq_f32(acc20, b00, a2);
-            acc30 = vmlaq_f32(acc30, b00, a3);
-
-            float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
-            float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
-            float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
-            float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
-
-            // 4x4 block 1
-            acc01 = vmlaq_f32(acc01, b10, a0);
-            acc11 = vmlaq_f32(acc11, b10, a1);
-            acc21 = vmlaq_f32(acc21, b10, a2);
-            acc31 = vmlaq_f32(acc31, b10, a3);
-
-            // 4x4 block 0
-            acc00 = vmlaq_f32(acc00, b01, a4);
-            acc10 = vmlaq_f32(acc10, b01, a5);
-            acc20 = vmlaq_f32(acc20, b01, a6);
-            acc30 = vmlaq_f32(acc30, b01, a7);
-
-            // 4x4 block 1
-            acc01 = vmlaq_f32(acc01, b11, a4);
-            acc11 = vmlaq_f32(acc11, b11, a5);
-            acc21 = vmlaq_f32(acc21, b11, a6);
-            acc31 = vmlaq_f32(acc31, b11, a7);
-
-            mtx_a0 += 8;
-            mtx_b0 += 8;
-            mtx_b1 += 8;
-
-            a0 = vld1q_dup_f32(mtx_a0 + 0);
-            a1 = vld1q_dup_f32(mtx_a0 + 1);
-            a2 = vld1q_dup_f32(mtx_a0 + 2);
-            a3 = vld1q_dup_f32(mtx_a0 + 3);
-
-            b00 = vld1q_f32(mtx_b0);
-            b10 = vld1q_f32(mtx_b1);
-            b01 = vld1q_f32(mtx_b0 + 4);
-            b11 = vld1q_f32(mtx_b1 + 4);
-
-            // 4x4 block 0
-            acc00 = vmlaq_f32(acc00, b00, a0);
-            acc10 = vmlaq_f32(acc10, b00, a1);
-            acc20 = vmlaq_f32(acc20, b00, a2);
-            acc30 = vmlaq_f32(acc30, b00, a3);
-
-            a4 = vld1q_dup_f32(mtx_a0 + 4);
-            a5 = vld1q_dup_f32(mtx_a0 + 5);
-            a6 = vld1q_dup_f32(mtx_a0 + 6);
-            a7 = vld1q_dup_f32(mtx_a0 + 7);
-
-            // 4x4 block 1
-            acc01 = vmlaq_f32(acc01, b10, a0);
-            acc11 = vmlaq_f32(acc11, b10, a1);
-            acc21 = vmlaq_f32(acc21, b10, a2);
-            acc31 = vmlaq_f32(acc31, b10, a3);
-
-            // 4x4 block 0
-            acc00 = vmlaq_f32(acc00, b01, a4);
-            acc10 = vmlaq_f32(acc10, b01, a5);
-            acc20 = vmlaq_f32(acc20, b01, a6);
-            acc30 = vmlaq_f32(acc30, b01, a7);
-
-            // 4x4 block 1
-            acc01 = vmlaq_f32(acc01, b11, a4);
-            acc11 = vmlaq_f32(acc11, b11, a5);
-            acc21 = vmlaq_f32(acc21, b11, a6);
-            acc31 = vmlaq_f32(acc31, b11, a7);
-
-            mtx_a0 += 8;
-            mtx_b0 += 8;
-            mtx_b1 += 8;
-
-            a0  = vld1q_dup_f32(mtx_a0 + 0);
-            a1  = vld1q_dup_f32(mtx_a0 + 1);
-            a2  = vld1q_dup_f32(mtx_a0 + 2);
-            a3  = vld1q_dup_f32(mtx_a0 + 3);
-            b00 = vld1q_f32(mtx_b0);
-            b10 = vld1q_f32(mtx_b1);
-            b01 = vld1q_f32(mtx_b0 + 4);
-            b11 = vld1q_f32(mtx_b1 + 4);
-
-#if __arm__
-            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
-            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
-            asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif /* __arm__ */
-
-            // 4x4 block 0
-            acc00 = vmlaq_f32(acc00, b00, a0);
-            acc10 = vmlaq_f32(acc10, b00, a1);
-            acc20 = vmlaq_f32(acc20, b00, a2);
-            acc30 = vmlaq_f32(acc30, b00, a3);
-
-            a4 = vld1q_dup_f32(mtx_a0 + 4);
-            a5 = vld1q_dup_f32(mtx_a0 + 5);
-            a6 = vld1q_dup_f32(mtx_a0 + 6);
-            a7 = vld1q_dup_f32(mtx_a0 + 7);
-
-            // 4x4 block 1
-            acc01 = vmlaq_f32(acc01, b10, a0);
-            acc11 = vmlaq_f32(acc11, b10, a1);
-            acc21 = vmlaq_f32(acc21, b10, a2);
-            acc31 = vmlaq_f32(acc31, b10, a3);
-
-            // 4x4 block 0
-            acc00 = vmlaq_f32(acc00, b01, a4);
-            acc10 = vmlaq_f32(acc10, b01, a5);
-            acc20 = vmlaq_f32(acc20, b01, a6);
-            acc30 = vmlaq_f32(acc30, b01, a7);
-
-            // 4x4 block 1
-            acc01 = vmlaq_f32(acc01, b11, a4);
-            acc11 = vmlaq_f32(acc11, b11, a5);
-            acc21 = vmlaq_f32(acc21, b11, a6);
-            acc31 = vmlaq_f32(acc31, b11, a7);
-
-            mtx_a0 += 8;
-            mtx_b0 += 8;
-            mtx_b1 += 8;
-
-            a0  = vld1q_dup_f32(mtx_a0 + 0);
-            a1  = vld1q_dup_f32(mtx_a0 + 1);
-            a2  = vld1q_dup_f32(mtx_a0 + 2);
-            a3  = vld1q_dup_f32(mtx_a0 + 3);
-            b00 = vld1q_f32(mtx_b0);
-            b10 = vld1q_f32(mtx_b1);
-            b01 = vld1q_f32(mtx_b0 + 4);
-            b11 = vld1q_f32(mtx_b1 + 4);
-
-            // 4x4 block 0
-            acc00 = vmlaq_f32(acc00, b00, a0);
-            acc10 = vmlaq_f32(acc10, b00, a1);
-            acc20 = vmlaq_f32(acc20, b00, a2);
-            acc30 = vmlaq_f32(acc30, b00, a3);
-
-            a4 = vld1q_dup_f32(mtx_a0 + 4);
-            a5 = vld1q_dup_f32(mtx_a0 + 5);
-            a6 = vld1q_dup_f32(mtx_a0 + 6);
-            a7 = vld1q_dup_f32(mtx_a0 + 7);
-
-            // 4x4 block 1
-            acc01 = vmlaq_f32(acc01, b10, a0);
-            acc11 = vmlaq_f32(acc11, b10, a1);
-            acc21 = vmlaq_f32(acc21, b10, a2);
-            acc31 = vmlaq_f32(acc31, b10, a3);
-
-            // 4x4 block 0
-            acc00 = vmlaq_f32(acc00, b01, a4);
-            acc10 = vmlaq_f32(acc10, b01, a5);
-            acc20 = vmlaq_f32(acc20, b01, a6);
-            acc30 = vmlaq_f32(acc30, b01, a7);
-
-            // 4x4 block 1
-            acc01 = vmlaq_f32(acc01, b11, a4);
-            acc11 = vmlaq_f32(acc11, b11, a5);
-            acc21 = vmlaq_f32(acc21, b11, a6);
-            acc31 = vmlaq_f32(acc31, b11, a7);
-
-            mtx_a0 += 8;
-            mtx_b0 += 8;
-            mtx_b1 += 8;
-        }
-
-        for(; mtx_b0 < mtx_b0_end_addr;)
-        {
-            float32x4_t a0  = vld1q_dup_f32(mtx_a0 + 0);
-            float32x4_t a1  = vld1q_dup_f32(mtx_a0 + 1);
-            float32x4_t a2  = vld1q_dup_f32(mtx_a0 + 2);
-            float32x4_t a3  = vld1q_dup_f32(mtx_a0 + 3);
-            float32x4_t b00 = vld1q_f32(mtx_b0);
-            float32x4_t b10 = vld1q_f32(mtx_b1);
-
-#if __arm__
-            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
-            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
-            asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
-#endif /* __arm__ */
-            // 4x4 block 0
-            acc00 = vmlaq_f32(acc00, b00, a0);
-            acc10 = vmlaq_f32(acc10, b00, a1);
-            acc20 = vmlaq_f32(acc20, b00, a2);
-            acc30 = vmlaq_f32(acc30, b00, a3);
-
-            // 4x4 block 1
-            acc01 = vmlaq_f32(acc01, b10, a0);
-            acc11 = vmlaq_f32(acc11, b10, a1);
-            acc21 = vmlaq_f32(acc21, b10, a2);
-            acc31 = vmlaq_f32(acc31, b10, a3);
-
-            mtx_a0 += 4;
-            mtx_b0 += 4;
-            mtx_b1 += 4;
-        }
-
-        // Multiply by the weight of matrix product (alpha)
-        if(multiply_alpha)
-        {
-            acc00 = vmulq_f32(acc00, alpha_f32);
-            acc10 = vmulq_f32(acc10, alpha_f32);
-            acc20 = vmulq_f32(acc20, alpha_f32);
-            acc30 = vmulq_f32(acc30, alpha_f32);
-            acc01 = vmulq_f32(acc01, alpha_f32);
-            acc11 = vmulq_f32(acc11, alpha_f32);
-            acc21 = vmulq_f32(acc21, alpha_f32);
-            acc31 = vmulq_f32(acc31, alpha_f32);
-        }
-
-        const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
-        const auto mtx_out1 = mtx_out0 + 4;
-
-        if(id.x() < (out_width - 8))
-        {
-            vst1q_f32(mtx_out0, acc00);
-            vst1q_f32(mtx_out1, acc01);
-            if(id.y() + 1 < out_height)
+            auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
+            for (; mtx_b0 <= (mtx_b0_end_addr - 32);)
             {
-                vst1q_f32(mtx_out0 + out_stride1, acc10);
-                vst1q_f32(mtx_out1 + out_stride1, acc11);
-                if(id.y() + 2 < out_height)
-                {
-                    vst1q_f32(mtx_out0 + out_stride2, acc20);
-                    vst1q_f32(mtx_out1 + out_stride2, acc21);
-                    if(id.y() + 3 < out_height)
-                    {
-                        vst1q_f32(mtx_out0 + out_stride3, acc30);
-                        vst1q_f32(mtx_out1 + out_stride3, acc31);
-                    }
-                }
+                float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0);
+                float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1);
+                float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2);
+                float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+                float32x4_t b00 = vld1q_f32(mtx_b0);
+                float32x4_t b10 = vld1q_f32(mtx_b1);
+                float32x4_t b01 = vld1q_f32(mtx_b0 + 4);
+                float32x4_t b11 = vld1q_f32(mtx_b1 + 4);
+
+#if __arm__
+                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif /* __arm__ */
+
+                // 4x4 block 0
+                acc00 = vmlaq_f32(acc00, b00, a0);
+                acc10 = vmlaq_f32(acc10, b00, a1);
+                acc20 = vmlaq_f32(acc20, b00, a2);
+                acc30 = vmlaq_f32(acc30, b00, a3);
+
+                float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4);
+                float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5);
+                float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6);
+                float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+                // 4x4 block 1
+                acc01 = vmlaq_f32(acc01, b10, a0);
+                acc11 = vmlaq_f32(acc11, b10, a1);
+                acc21 = vmlaq_f32(acc21, b10, a2);
+                acc31 = vmlaq_f32(acc31, b10, a3);
+
+                // 4x4 block 0
+                acc00 = vmlaq_f32(acc00, b01, a4);
+                acc10 = vmlaq_f32(acc10, b01, a5);
+                acc20 = vmlaq_f32(acc20, b01, a6);
+                acc30 = vmlaq_f32(acc30, b01, a7);
+
+                // 4x4 block 1
+                acc01 = vmlaq_f32(acc01, b11, a4);
+                acc11 = vmlaq_f32(acc11, b11, a5);
+                acc21 = vmlaq_f32(acc21, b11, a6);
+                acc31 = vmlaq_f32(acc31, b11, a7);
+
+                mtx_a0 += 8;
+                mtx_b0 += 8;
+                mtx_b1 += 8;
+
+                a0 = vld1q_dup_f32(mtx_a0 + 0);
+                a1 = vld1q_dup_f32(mtx_a0 + 1);
+                a2 = vld1q_dup_f32(mtx_a0 + 2);
+                a3 = vld1q_dup_f32(mtx_a0 + 3);
+
+                b00 = vld1q_f32(mtx_b0);
+                b10 = vld1q_f32(mtx_b1);
+                b01 = vld1q_f32(mtx_b0 + 4);
+                b11 = vld1q_f32(mtx_b1 + 4);
+
+                // 4x4 block 0
+                acc00 = vmlaq_f32(acc00, b00, a0);
+                acc10 = vmlaq_f32(acc10, b00, a1);
+                acc20 = vmlaq_f32(acc20, b00, a2);
+                acc30 = vmlaq_f32(acc30, b00, a3);
+
+                a4 = vld1q_dup_f32(mtx_a0 + 4);
+                a5 = vld1q_dup_f32(mtx_a0 + 5);
+                a6 = vld1q_dup_f32(mtx_a0 + 6);
+                a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+                // 4x4 block 1
+                acc01 = vmlaq_f32(acc01, b10, a0);
+                acc11 = vmlaq_f32(acc11, b10, a1);
+                acc21 = vmlaq_f32(acc21, b10, a2);
+                acc31 = vmlaq_f32(acc31, b10, a3);
+
+                // 4x4 block 0
+                acc00 = vmlaq_f32(acc00, b01, a4);
+                acc10 = vmlaq_f32(acc10, b01, a5);
+                acc20 = vmlaq_f32(acc20, b01, a6);
+                acc30 = vmlaq_f32(acc30, b01, a7);
+
+                // 4x4 block 1
+                acc01 = vmlaq_f32(acc01, b11, a4);
+                acc11 = vmlaq_f32(acc11, b11, a5);
+                acc21 = vmlaq_f32(acc21, b11, a6);
+                acc31 = vmlaq_f32(acc31, b11, a7);
+
+                mtx_a0 += 8;
+                mtx_b0 += 8;
+                mtx_b1 += 8;
+
+                a0  = vld1q_dup_f32(mtx_a0 + 0);
+                a1  = vld1q_dup_f32(mtx_a0 + 1);
+                a2  = vld1q_dup_f32(mtx_a0 + 2);
+                a3  = vld1q_dup_f32(mtx_a0 + 3);
+                b00 = vld1q_f32(mtx_b0);
+                b10 = vld1q_f32(mtx_b1);
+                b01 = vld1q_f32(mtx_b0 + 4);
+                b11 = vld1q_f32(mtx_b1 + 4);
+
+#if __arm__
+                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+                asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif /* __arm__ */
+
+                // 4x4 block 0
+                acc00 = vmlaq_f32(acc00, b00, a0);
+                acc10 = vmlaq_f32(acc10, b00, a1);
+                acc20 = vmlaq_f32(acc20, b00, a2);
+                acc30 = vmlaq_f32(acc30, b00, a3);
+
+                a4 = vld1q_dup_f32(mtx_a0 + 4);
+                a5 = vld1q_dup_f32(mtx_a0 + 5);
+                a6 = vld1q_dup_f32(mtx_a0 + 6);
+                a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+                // 4x4 block 1
+                acc01 = vmlaq_f32(acc01, b10, a0);
+                acc11 = vmlaq_f32(acc11, b10, a1);
+                acc21 = vmlaq_f32(acc21, b10, a2);
+                acc31 = vmlaq_f32(acc31, b10, a3);
+
+                // 4x4 block 0
+                acc00 = vmlaq_f32(acc00, b01, a4);
+                acc10 = vmlaq_f32(acc10, b01, a5);
+                acc20 = vmlaq_f32(acc20, b01, a6);
+                acc30 = vmlaq_f32(acc30, b01, a7);
+
+                // 4x4 block 1
+                acc01 = vmlaq_f32(acc01, b11, a4);
+                acc11 = vmlaq_f32(acc11, b11, a5);
+                acc21 = vmlaq_f32(acc21, b11, a6);
+                acc31 = vmlaq_f32(acc31, b11, a7);
+
+                mtx_a0 += 8;
+                mtx_b0 += 8;
+                mtx_b1 += 8;
+
+                a0  = vld1q_dup_f32(mtx_a0 + 0);
+                a1  = vld1q_dup_f32(mtx_a0 + 1);
+                a2  = vld1q_dup_f32(mtx_a0 + 2);
+                a3  = vld1q_dup_f32(mtx_a0 + 3);
+                b00 = vld1q_f32(mtx_b0);
+                b10 = vld1q_f32(mtx_b1);
+                b01 = vld1q_f32(mtx_b0 + 4);
+                b11 = vld1q_f32(mtx_b1 + 4);
+
+                // 4x4 block 0
+                acc00 = vmlaq_f32(acc00, b00, a0);
+                acc10 = vmlaq_f32(acc10, b00, a1);
+                acc20 = vmlaq_f32(acc20, b00, a2);
+                acc30 = vmlaq_f32(acc30, b00, a3);
+
+                a4 = vld1q_dup_f32(mtx_a0 + 4);
+                a5 = vld1q_dup_f32(mtx_a0 + 5);
+                a6 = vld1q_dup_f32(mtx_a0 + 6);
+                a7 = vld1q_dup_f32(mtx_a0 + 7);
+
+                // 4x4 block 1
+                acc01 = vmlaq_f32(acc01, b10, a0);
+                acc11 = vmlaq_f32(acc11, b10, a1);
+                acc21 = vmlaq_f32(acc21, b10, a2);
+                acc31 = vmlaq_f32(acc31, b10, a3);
+
+                // 4x4 block 0
+                acc00 = vmlaq_f32(acc00, b01, a4);
+                acc10 = vmlaq_f32(acc10, b01, a5);
+                acc20 = vmlaq_f32(acc20, b01, a6);
+                acc30 = vmlaq_f32(acc30, b01, a7);
+
+                // 4x4 block 1
+                acc01 = vmlaq_f32(acc01, b11, a4);
+                acc11 = vmlaq_f32(acc11, b11, a5);
+                acc21 = vmlaq_f32(acc21, b11, a6);
+                acc31 = vmlaq_f32(acc31, b11, a7);
+
+                mtx_a0 += 8;
+                mtx_b0 += 8;
+                mtx_b1 += 8;
             }
-        }
-        else if(id.x() < (out_width - 4))
-        {
-            vst1q_f32(mtx_out0, acc00);
-            if(id.y() + 1 < out_height)
+
+            for (; mtx_b0 < mtx_b0_end_addr;)
             {
-                vst1q_f32(mtx_out0 + out_stride1, acc10);
-                if(id.y() + 2 < out_height)
-                {
-                    vst1q_f32(mtx_out0 + out_stride2, acc20);
-                    if(id.y() + 3 < out_height)
-                    {
-                        vst1q_f32(mtx_out0 + out_stride3, acc30);
-                    }
-                }
+                float32x4_t a0  = vld1q_dup_f32(mtx_a0 + 0);
+                float32x4_t a1  = vld1q_dup_f32(mtx_a0 + 1);
+                float32x4_t a2  = vld1q_dup_f32(mtx_a0 + 2);
+                float32x4_t a3  = vld1q_dup_f32(mtx_a0 + 3);
+                float32x4_t b00 = vld1q_f32(mtx_b0);
+                float32x4_t b10 = vld1q_f32(mtx_b1);
+
+#if __arm__
+                asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_a0)));
+                asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b0)));
+                asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast<const uint8_t *>(mtx_b1)));
+#endif /* __arm__ */
+                // 4x4 block 0
+                acc00 = vmlaq_f32(acc00, b00, a0);
+                acc10 = vmlaq_f32(acc10, b00, a1);
+                acc20 = vmlaq_f32(acc20, b00, a2);
+                acc30 = vmlaq_f32(acc30, b00, a3);
+
+                // 4x4 block 1
+                acc01 = vmlaq_f32(acc01, b10, a0);
+                acc11 = vmlaq_f32(acc11, b10, a1);
+                acc21 = vmlaq_f32(acc21, b10, a2);
+                acc31 = vmlaq_f32(acc31, b10, a3);
+
+                mtx_a0 += 4;
+                mtx_b0 += 4;
+                mtx_b1 += 4;
             }
-            // Left-over columns
-            const int columns_left = out_width - id.x() - 4;
-            for(auto x = 0; x < columns_left; ++x)
+
+            // Multiply by the weight of matrix product (alpha)
+            if (multiply_alpha)
             {
-                *(mtx_out1 + x) = acc01[x];
-                if(id.y() + 1 < out_height)
+                acc00 = vmulq_f32(acc00, alpha_f32);
+                acc10 = vmulq_f32(acc10, alpha_f32);
+                acc20 = vmulq_f32(acc20, alpha_f32);
+                acc30 = vmulq_f32(acc30, alpha_f32);
+                acc01 = vmulq_f32(acc01, alpha_f32);
+                acc11 = vmulq_f32(acc11, alpha_f32);
+                acc21 = vmulq_f32(acc21, alpha_f32);
+                acc31 = vmulq_f32(acc31, alpha_f32);
+            }
+
+            const auto mtx_out0 = reinterpret_cast<float *>(out.ptr());
+            const auto mtx_out1 = mtx_out0 + 4;
+
+            if (id.x() < (out_width - 8))
+            {
+                vst1q_f32(mtx_out0, acc00);
+                vst1q_f32(mtx_out1, acc01);
+                if (id.y() + 1 < out_height)
                 {
-                    *(mtx_out1 + x + out_stride1) = acc11[x];
-                    if(id.y() + 2 < out_height)
+                    vst1q_f32(mtx_out0 + out_stride1, acc10);
+                    vst1q_f32(mtx_out1 + out_stride1, acc11);
+                    if (id.y() + 2 < out_height)
                     {
-                        *(mtx_out1 + x + out_stride2) = acc21[x];
-                        if(id.y() + 3 < out_height)
+                        vst1q_f32(mtx_out0 + out_stride2, acc20);
+                        vst1q_f32(mtx_out1 + out_stride2, acc21);
+                        if (id.y() + 3 < out_height)
                         {
-                            *(mtx_out1 + x + out_stride3) = acc31[x];
+                            vst1q_f32(mtx_out0 + out_stride3, acc30);
+                            vst1q_f32(mtx_out1 + out_stride3, acc31);
                         }
                     }
                 }
             }
-        }
-        else
-        {
-            // Left-over columns
-            const int columns_left = out_width - id.x();
-            for(int x = 0; x < columns_left; ++x)
+            else if (id.x() < (out_width - 4))
             {
-                *(mtx_out0 + x) = acc00[x];
-                if(id.y() + 1 < out_height)
+                vst1q_f32(mtx_out0, acc00);
+                if (id.y() + 1 < out_height)
                 {
-                    *(mtx_out0 + x + out_stride1) = acc10[x];
-                    if(id.y() + 2 < out_height)
+                    vst1q_f32(mtx_out0 + out_stride1, acc10);
+                    if (id.y() + 2 < out_height)
                     {
-                        *(mtx_out0 + x + out_stride2) = acc20[x];
-                        if(id.y() + 3 < out_height)
+                        vst1q_f32(mtx_out0 + out_stride2, acc20);
+                        if (id.y() + 3 < out_height)
                         {
-                            *(mtx_out0 + x + out_stride3) = acc30[x];
+                            vst1q_f32(mtx_out0 + out_stride3, acc30);
+                        }
+                    }
+                }
+                // Left-over columns
+                const int columns_left = out_width - id.x() - 4;
+                for (auto x = 0; x < columns_left; ++x)
+                {
+                    *(mtx_out1 + x) = acc01[x];
+                    if (id.y() + 1 < out_height)
+                    {
+                        *(mtx_out1 + x + out_stride1) = acc11[x];
+                        if (id.y() + 2 < out_height)
+                        {
+                            *(mtx_out1 + x + out_stride2) = acc21[x];
+                            if (id.y() + 3 < out_height)
+                            {
+                                *(mtx_out1 + x + out_stride3) = acc31[x];
+                            }
                         }
                     }
                 }
             }
-        }
-    },
-    ina, inb, out);
+            else
+            {
+                // Left-over columns
+                const int columns_left = out_width - id.x();
+                for (int x = 0; x < columns_left; ++x)
+                {
+                    *(mtx_out0 + x) = acc00[x];
+                    if (id.y() + 1 < out_height)
+                    {
+                        *(mtx_out0 + x + out_stride1) = acc10[x];
+                        if (id.y() + 2 < out_height)
+                        {
+                            *(mtx_out0 + x + out_stride2) = acc20[x];
+                            if (id.y() + 3 < out_height)
+                            {
+                                *(mtx_out0 + x + out_stride3) = acc30[x];
+                            }
+                        }
+                    }
+                }
+            }
+        },
+        ina, inb, out);
 }
 } // namespace cpu
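
To make the reshaped-operand access pattern in this file concrete: the comments above state that A is reshaped with CpuGemmInterleave4x4 (four row values per k step stored consecutively) and B with CpuGemmTranspose1xW. A scalar reference of one 4x4 output block under that assumption, mirroring the kernel's leftover loop without the unrolling and PLD prefetch hints (illustrative only, not library code):

    // Scalar model of one 4x4 output block on reshaped operands.
    void block_4x4_reference(const float *mtx_a0, const float *mtx_b0, float acc[4][4], int num_elems_matrix_b_x)
    {
        const float *mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x;
        for (; mtx_b0 < mtx_b0_end_addr; mtx_a0 += 4, mtx_b0 += 4)
        {
            // mtx_a0[r] holds element k of row r (interleaved A); mtx_b0[c] holds element (k, c) of B.
            for (int r = 0; r < 4; ++r)
            {
                for (int c = 0; c < 4; ++c)
                {
                    acc[r][c] += mtx_a0[r] * mtx_b0[c];
                }
            }
        }
    }
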
 
diff --git a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h
index f9f1f24..74ea4c2 100644
--- a/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h
+++ b/src/cpu/kernels/gemm_matrix_mul/generic/neon/impl.h
@@ -24,15 +24,18 @@
 #ifndef SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H
 #define SRC_CORE_KERNELS_GEMMMATRIXMUL_IMPL_H
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/CPP/Validate.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void vector_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
+void vector_matrix_multiply_f32(
+    const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
 
-void matrix_matrix_multiply_f32(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
+void matrix_matrix_multiply_f32(
+    const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha);
 
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/gemm_matrix_mul/list.h b/src/cpu/kernels/gemm_matrix_mul/list.h
index 9cdb58a..15b23b1 100644
--- a/src/cpu/kernels/gemm_matrix_mul/list.h
+++ b/src/cpu/kernels/gemm_matrix_mul/list.h
@@ -27,8 +27,9 @@
 {
 namespace cpu
 {
-#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name) \
-    void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, float alpha, const bool is_dst_vector)
+#define DECLARE_GEMMMATRIXMUL_KERNEL(func_name)                                                                        \
+    void func_name(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window, const ThreadInfo &info, \
+                   float alpha, const bool is_dst_vector)
 DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp32_gemm_matrix_mul);
 DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp16_gemm_matrix_mul);
 #undef DECLARE_GEMMMATRIXMUL_KERNEL
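
For clarity, DECLARE_GEMMMATRIXMUL_KERNEL above is a declaration-stamping macro: each use expands to a full kernel prototype with the shared signature, so DECLARE_GEMMMATRIXMUL_KERNEL(neon_fp32_gemm_matrix_mul); is equivalent to:

    void neon_fp32_gemm_matrix_mul(const ITensor *lhs, const ITensor *rhs, ITensor *dst, const Window &window,
                                   const ThreadInfo &info, float alpha, const bool is_dst_vector);

This keeps the per-type kernel list in sync with a single signature definition.
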
diff --git a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp
index d4e469b..4ed7e54 100644
--- a/src/cpu/kernels/genproposals/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/genproposals/generic/neon/fp16.cpp
@@ -27,10 +27,13 @@
 {
 namespace cpu
 {
-void neon_fp16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void neon_fp16_computeallanchors(const ITensor     *anchors,
+                                 ITensor           *all_anchors,
+                                 ComputeAnchorsInfo anchors_info,
+                                 const Window      &window)
 {
     return compute_all_anchors<float16_t>(anchors, all_anchors, anchors_info, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
 #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
diff --git a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp
index 09aa6ec..f15cd63 100644
--- a/src/cpu/kernels/genproposals/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/genproposals/generic/neon/fp32.cpp
@@ -26,9 +26,12 @@
 {
 namespace cpu
 {
-void neon_fp32_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void neon_fp32_computeallanchors(const ITensor     *anchors,
+                                 ITensor           *all_anchors,
+                                 ComputeAnchorsInfo anchors_info,
+                                 const Window      &window)
 {
     return compute_all_anchors<float>(anchors, all_anchors, anchors_info, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.cpp b/src/cpu/kernels/genproposals/generic/neon/impl.cpp
index 9224e32a..8cb76f3 100644
--- a/src/cpu/kernels/genproposals/generic/neon/impl.cpp
+++ b/src/cpu/kernels/genproposals/generic/neon/impl.cpp
@@ -28,7 +28,10 @@
 class Window;
 namespace cpu
 {
-void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void compute_all_anchors_qasymm16(const ITensor     *anchors,
+                                  ITensor           *all_anchors,
+                                  ComputeAnchorsInfo anchors_info,
+                                  const Window      &window)
 {
     Iterator all_anchors_it(all_anchors, window);
     Iterator anchors_it(all_anchors, window);
@@ -39,28 +42,30 @@
 
     const UniformQuantizationInfo qinfo = anchors->info()->quantization_info().uniform();
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const size_t anchor_offset = id.y() % num_anchors;
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
+            const size_t anchor_offset = id.y() % num_anchors;
 
-        const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr());
-        const auto anchor_ptr     = reinterpret_cast<int16_t *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
+            const auto out_anchor_ptr = reinterpret_cast<int16_t *>(all_anchors_it.ptr());
+            const auto anchor_ptr = reinterpret_cast<int16_t *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
 
-        const size_t shift_idy = id.y() / num_anchors;
-        const float  shiftx    = (shift_idy % feat_width) * stride;
-        const float  shifty    = (shift_idy / feat_width) * stride;
+            const size_t shift_idy = id.y() / num_anchors;
+            const float  shiftx    = (shift_idy % feat_width) * stride;
+            const float  shifty    = (shift_idy / feat_width) * stride;
 
-        const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx;
-        const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty;
-        const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx;
-        const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty;
+            const float new_anchor_x1 = dequantize_qsymm16(*anchor_ptr, qinfo.scale) + shiftx;
+            const float new_anchor_y1 = dequantize_qsymm16(*(1 + anchor_ptr), qinfo.scale) + shifty;
+            const float new_anchor_x2 = dequantize_qsymm16(*(2 + anchor_ptr), qinfo.scale) + shiftx;
+            const float new_anchor_y2 = dequantize_qsymm16(*(3 + anchor_ptr), qinfo.scale) + shifty;
 
-        *out_anchor_ptr       = quantize_qsymm16(new_anchor_x1, qinfo.scale);
-        *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale);
-        *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale);
-        *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale);
-    },
-    all_anchors_it);
+            *out_anchor_ptr       = quantize_qsymm16(new_anchor_x1, qinfo.scale);
+            *(out_anchor_ptr + 1) = quantize_qsymm16(new_anchor_y1, qinfo.scale);
+            *(out_anchor_ptr + 2) = quantize_qsymm16(new_anchor_x2, qinfo.scale);
+            *(out_anchor_ptr + 3) = quantize_qsymm16(new_anchor_y2, qinfo.scale);
+        },
+        all_anchors_it);
 }
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/genproposals/generic/neon/impl.h b/src/cpu/kernels/genproposals/generic/neon/impl.h
index da052c9..3317bcf 100644
--- a/src/cpu/kernels/genproposals/generic/neon/impl.h
+++ b/src/cpu/kernels/genproposals/generic/neon/impl.h
@@ -26,13 +26,17 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 namespace arm_compute
 {
 namespace cpu
 {
 template <typename T>
-void compute_all_anchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void compute_all_anchors(const ITensor     *anchors,
+                         ITensor           *all_anchors,
+                         ComputeAnchorsInfo anchors_info,
+                         const Window      &window)
 {
     Iterator all_anchors_it(all_anchors, window);
     Iterator anchors_it(all_anchors, window);
@@ -41,26 +45,31 @@
     const T      stride      = 1.f / anchors_info.spatial_scale();
     const size_t feat_width  = anchors_info.feat_width();
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const size_t anchor_offset = id.y() % num_anchors;
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
+            const size_t anchor_offset = id.y() % num_anchors;
 
-        const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
-        const auto anchor_ptr     = reinterpret_cast<T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
+            const auto out_anchor_ptr = reinterpret_cast<T *>(all_anchors_it.ptr());
+            const auto anchor_ptr     = reinterpret_cast<T *>(anchors->ptr_to_element(Coordinates(0, anchor_offset)));
 
-        const size_t shift_idy = id.y() / num_anchors;
-        const T      shiftx    = (shift_idy % feat_width) * stride;
-        const T      shifty    = (shift_idy / feat_width) * stride;
+            const size_t shift_idy = id.y() / num_anchors;
+            const T      shiftx    = (shift_idy % feat_width) * stride;
+            const T      shifty    = (shift_idy / feat_width) * stride;
 
-        *out_anchor_ptr       = *anchor_ptr + shiftx;
-        *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
-        *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
-        *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
-    },
-    all_anchors_it);
+            *out_anchor_ptr       = *anchor_ptr + shiftx;
+            *(out_anchor_ptr + 1) = *(1 + anchor_ptr) + shifty;
+            *(out_anchor_ptr + 2) = *(2 + anchor_ptr) + shiftx;
+            *(out_anchor_ptr + 3) = *(3 + anchor_ptr) + shifty;
+        },
+        all_anchors_it);
 }
 
-void compute_all_anchors_qasymm16(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window);
+void compute_all_anchors_qasymm16(const ITensor     *anchors,
+                                  ITensor           *all_anchors,
+                                  ComputeAnchorsInfo anchors_info,
+                                  const Window      &window);
 } // namespace cpu
 } // namespace arm_compute
 #endif //define SRC_CORE_SVE_KERNELS_NEGENERATEPROPOSALSLAYERKERNEL_IMPL_H
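
In compute_all_anchors, the window's y index interleaves base anchors and
feature-map cells: y % num_anchors selects the base anchor, y / num_anchors is
the linear cell index, and the cell's (x, y) shift is scaled by
stride = 1 / spatial_scale. A standalone sketch of that index arithmetic, with
made-up values:

    #include <cstdio>

    int main()
    {
        // Illustrative only: 3 base anchors, feature map 4 cells wide,
        // spatial_scale 0.25 (so anchor centres step 4 pixels apart).
        const unsigned num_anchors = 3, feat_width = 4;
        const float    stride      = 1.f / 0.25f;

        const unsigned y             = 14;              // id.y() in the loop above
        const unsigned anchor_offset = y % num_anchors; // base anchor 2
        const unsigned shift_idy     = y / num_anchors; // cell index 4
        const float    shiftx        = (shift_idy % feat_width) * stride; // 0
        const float    shifty        = (shift_idy / feat_width) * stride; // 4
        std::printf("anchor %u -> shift (%.0f, %.0f)\n", anchor_offset, shiftx, shifty);
        return 0;
    }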
diff --git a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp
index cfb5a41..7182d0b 100644
--- a/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp
+++ b/src/cpu/kernels/genproposals/generic/neon/qsymm16.cpp
@@ -26,9 +26,12 @@
 {
 namespace cpu
 {
-void neon_qu16_computeallanchors(const ITensor *anchors, ITensor *all_anchors, ComputeAnchorsInfo anchors_info, const Window &window)
+void neon_qu16_computeallanchors(const ITensor     *anchors,
+                                 ITensor           *all_anchors,
+                                 ComputeAnchorsInfo anchors_info,
+                                 const Window      &window)
 {
     return compute_all_anchors_qasymm16(anchors, all_anchors, anchors_info, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
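
The QSYMM16 path performs the same shift in float space: each coordinate is
dequantized with the tensor's uniform scale, shifted, and requantized. A sketch
of that round trip (assuming symmetric value = q * scale semantics; saturation
to the int16 range is omitted here but is handled by quantize_qsymm16):

    #include <cmath>
    #include <cstdint>

    int16_t shift_qsymm16(int16_t q, float scale, float shift)
    {
        const float v = q * scale + shift;                   // dequantize, then shift
        return static_cast<int16_t>(std::lround(v / scale)); // requantize
    }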
diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp
index 2b7d91b..44418c0 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/instancenorm/generic/neon/fp16.cpp
@@ -23,6 +23,7 @@
  */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/kernels/instancenorm/generic/neon/impl.h"
 
@@ -40,7 +41,10 @@
 }
 
 template <typename InputType, typename AccType>
-InputType vector_float_norm_fp16(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
+InputType vector_float_norm_fp16(const InputType &inputs,
+                                 const AccType   &vec_mean,
+                                 const AccType   &vec_multip,
+                                 const AccType   &vec_beta)
 {
     return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
 }
@@ -52,19 +56,24 @@
     vector_float_sum_fp16(result, result_square, wrapper::vcvt<float>(wrapper::vgethigh(inputs)));
 }
 template <>
-inline float16x8_t vector_float_norm_fp16(const float16x8_t &inputs, const float32x4_t &vec_mean, const float32x4_t &vec_multip, const float32x4_t &vec_beta)
+inline float16x8_t vector_float_norm_fp16(const float16x8_t &inputs,
+                                          const float32x4_t &vec_mean,
+                                          const float32x4_t &vec_multip,
+                                          const float32x4_t &vec_beta)
 {
-    const auto  input_low   = wrapper::vcvt<float>(wrapper::vgetlow(inputs));
-    const auto  input_high  = wrapper::vcvt<float>(wrapper::vgethigh(inputs));
-    const auto  result_low  = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_low, vec_mean, vec_multip, vec_beta));
-    const auto  result_high = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_high, vec_mean, vec_multip, vec_beta));
-    float16x8_t result      = wrapper::vcombine(result_low, result_high);
+    const auto input_low  = wrapper::vcvt<float>(wrapper::vgetlow(inputs));
+    const auto input_high = wrapper::vcvt<float>(wrapper::vgethigh(inputs));
+    const auto result_low = wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_low, vec_mean, vec_multip, vec_beta));
+    const auto result_high =
+        wrapper::vcvt<float16_t>(vector_float_norm_fp16(input_high, vec_mean, vec_multip, vec_beta));
+    float16x8_t result = wrapper::vcombine(result_low, result_high);
 
     return result;
 }
 
 template <typename AccType>
-void instance_normalization_nchw_fp16(const ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+void instance_normalization_nchw_fp16(
+    const ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
 {
     /** SIMD vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float16_t, wrapper::traits::BitWidth::W128>;
@@ -78,91 +87,105 @@
     const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
 
     Iterator input_it(input, win);
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        Window win_plane = window;
-        win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
-        win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
-        win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
-
-        Iterator input_plane_it(input, win_plane);
-        Iterator output_plane_it(output, win_plane);
-
-        auto sum_h_w         = static_cast<AccType>(0.f);
-        auto sum_squares_h_w = static_cast<AccType>(0.f);
-
-        execute_window_loop(win_plane, [&](const Coordinates &)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            const auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
+            Window win_plane = window;
+            win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+            win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+            win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
 
-            auto vec_sum_h_w         = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
-            auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+            Iterator input_plane_it(input, win_plane);
+            Iterator output_plane_it(output, win_plane);
 
-            // Compute S elements per iteration
-            int x = window.x().start();
-            for(; x <= (window.x().end() - window_step_x); x += window_step_x)
-            {
-                auto vec_input_val = wrapper::vloadq(input_ptr + x);
-                vector_float_sum_fp16(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
-            }
+            auto sum_h_w         = static_cast<AccType>(0.f);
+            auto sum_squares_h_w = static_cast<AccType>(0.f);
 
-            auto vec2_sum_h_w         = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
-            auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
+            execute_window_loop(
+                win_plane,
+                [&](const Coordinates &)
+                {
+                    const auto input_ptr = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
 
-            vec2_sum_h_w         = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
-            vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+                    auto vec_sum_h_w         = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+                    auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
 
-            sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
-            sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+                    // Compute S elements per iteration
+                    int x = window.x().start();
+                    for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+                    {
+                        auto vec_input_val = wrapper::vloadq(input_ptr + x);
+                        vector_float_sum_fp16(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
+                    }
 
-            // Compute left-over elements
-            for(; x < window.x().end(); ++x)
-            {
-                const auto value = static_cast<AccType>(*(input_ptr + x));
-                sum_h_w += value;
-                sum_squares_h_w += value * value;
-            }
+                    auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+                    auto vec2_sum_squares_h_w =
+                        wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
+
+                    vec2_sum_h_w         = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+                    vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+
+                    sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+                    sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+                    // Compute left-over elements
+                    for (; x < window.x().end(); ++x)
+                    {
+                        const auto value = static_cast<AccType>(*(input_ptr + x));
+                        sum_h_w += value;
+                        sum_squares_h_w += value * value;
+                    }
+                },
+                input_plane_it, output_plane_it);
+
+            const auto mean_h_w = sum_h_w / elements_plane;
+            const auto var_h_w  = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+            const auto multip_h_w     = gamma / std::sqrt(var_h_w + epsilon);
+            const auto vec_mean_h_w   = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
+            const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
+            const auto vec_beta       = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
+
+            execute_window_loop(
+                win_plane,
+                [&](const Coordinates &)
+                {
+                    auto input_ptr  = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
+                    auto output_ptr = reinterpret_cast<float16_t *>(output_plane_it.ptr());
+
+                    // Compute S elements per iteration
+                    int x = window.x().start();
+                    for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+                    {
+                        const auto vec_val = wrapper::vloadq(input_ptr + x);
+                        const auto normalized_vec =
+                            vector_float_norm_fp16(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
+                        wrapper::vstore(output_ptr + x, normalized_vec);
+                    }
+
+                    // Compute left-over elements
+                    for (; x < window.x().end(); ++x)
+                    {
+                        const auto val    = static_cast<AccType>(*(input_ptr + x));
+                        *(output_ptr + x) = static_cast<float16_t>((val - mean_h_w) * multip_h_w + beta);
+                    }
+                },
+                input_plane_it, output_plane_it);
         },
-        input_plane_it, output_plane_it);
-
-        const auto mean_h_w = sum_h_w / elements_plane;
-        const auto var_h_w  = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
-
-        const auto multip_h_w     = gamma / std::sqrt(var_h_w + epsilon);
-        const auto vec_mean_h_w   = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
-        const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
-        const auto vec_beta       = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
-
-        execute_window_loop(win_plane, [&](const Coordinates &)
-        {
-            auto input_ptr  = reinterpret_cast<const float16_t *>(input_plane_it.ptr());
-            auto output_ptr = reinterpret_cast<float16_t *>(output_plane_it.ptr());
-
-            // Compute S elements per iteration
-            int x = window.x().start();
-            for(; x <= (window.x().end() - window_step_x); x += window_step_x)
-            {
-                const auto vec_val        = wrapper::vloadq(input_ptr + x);
-                const auto normalized_vec = vector_float_norm_fp16(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
-                wrapper::vstore(output_ptr + x, normalized_vec);
-            }
-
-            // Compute left-over elements
-            for(; x < window.x().end(); ++x)
-            {
-                const auto val    = static_cast<AccType>(*(input_ptr + x));
-                *(output_ptr + x) = static_cast<float16_t>((val - mean_h_w) * multip_h_w + beta);
-            }
-        },
-        input_plane_it, output_plane_it);
-    },
-    input_it);
+        input_it);
 }
-}
+} // namespace
 
-void neon_fp16_instancenorm(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
+void neon_fp16_instancenorm(ITensor      *input,
+                            ITensor      *output,
+                            float         gamma,
+                            float         beta,
+                            float         epsilon,
+                            bool          use_mixed_precision,
+                            const Window &window)
 {
-    if(use_mixed_precision)
+    if (use_mixed_precision)
     {
         return instance_normalization_nchw_fp16<float>(input, output, gamma, beta, epsilon, window);
     }
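
Note the mixed-precision option above: with use_mixed_precision set, the fp16
kernel accumulates in fp32 (AccType = float), and the specialisations widen each
float16x8_t into two float32x4_t halves before summing. A sketch of that
widening step using raw NEON intrinsics (an assumed equivalent of the wrapper
calls; aarch64 with fp16 support):

    #include <arm_neon.h>

    void accumulate_f16_as_f32(float32x4_t &sum, float32x4_t &sum_sq, float16x8_t v)
    {
        // Widen each half to fp32 so the running sums keep fp32 precision.
        const float32x4_t lo = vcvt_f32_f16(vget_low_f16(v));
        const float32x4_t hi = vcvt_f32_f16(vget_high_f16(v));
        sum    = vaddq_f32(vaddq_f32(sum, lo), hi);
        sum_sq = vaddq_f32(vaddq_f32(sum_sq, vmulq_f32(lo, lo)), vmulq_f32(hi, hi));
    }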
diff --git a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
index 061dd95..e1ca055 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/instancenorm/generic/neon/fp32.cpp
@@ -26,7 +26,13 @@
 {
 namespace cpu
 {
-void neon_fp32_instancenorm(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
+void neon_fp32_instancenorm(ITensor      *input,
+                            ITensor      *output,
+                            float         gamma,
+                            float         beta,
+                            float         epsilon,
+                            bool          use_mixed_precision,
+                            const Window &window)
 {
     ARM_COMPUTE_UNUSED(use_mixed_precision);
     return instance_normalization_nchw<float>(input, output, gamma, beta, epsilon, window);
diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
index 483b6f5..515079e 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
+++ b/src/cpu/kernels/instancenorm/generic/neon/impl.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "src/cpu/kernels/instancenorm/generic/neon/impl.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
@@ -38,13 +39,15 @@
 }
 
 template <typename InputType, typename AccType>
-InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
+InputType
+vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta)
 {
     return wrapper::vadd(wrapper::vmul(wrapper::vsub(inputs, vec_mean), vec_multip), vec_beta);
 }
 
 template <typename T, typename AccType>
-void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
+void instance_normalization_nchw(
+    ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window)
 {
     /** SIMD vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
@@ -58,88 +61,96 @@
     const unsigned int elements_plane = input->info()->dimension(0) * output->info()->dimension(1);
 
     Iterator input_it(input, win);
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        Window win_plane = window;
-        win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
-        win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
-        win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
-
-        Iterator input_plane_it(input, win_plane);
-        Iterator output_plane_it(output, win_plane);
-
-        auto sum_h_w         = static_cast<AccType>(0.f);
-        auto sum_squares_h_w = static_cast<AccType>(0.f);
-
-        execute_window_loop(win_plane, [&](const Coordinates &)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
+            Window win_plane = window;
+            win_plane.set(Window::DimX, Window::Dimension(0, 1, 1));
+            win_plane.set(Window::DimZ, Window::Dimension(id[2], id[2] + 1, 1));
+            win_plane.set(3, Window::Dimension(id[3], id[3] + 1, 1));
 
-            auto vec_sum_h_w         = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
-            auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+            Iterator input_plane_it(input, win_plane);
+            Iterator output_plane_it(output, win_plane);
 
-            // Compute S elements per iteration
-            int x = window.x().start();
-            for(; x <= (window.x().end() - window_step_x); x += window_step_x)
-            {
-                auto vec_input_val = wrapper::vloadq(input_ptr + x);
-                vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
-            }
+            auto sum_h_w         = static_cast<AccType>(0.f);
+            auto sum_squares_h_w = static_cast<AccType>(0.f);
 
-            auto vec2_sum_h_w         = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
-            auto vec2_sum_squares_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
+            execute_window_loop(
+                win_plane,
+                [&](const Coordinates &)
+                {
+                    const auto input_ptr = reinterpret_cast<const T *>(input_plane_it.ptr());
 
-            vec2_sum_h_w         = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
-            vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+                    auto vec_sum_h_w         = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
+                    auto vec_sum_squares_h_w = wrapper::vdup_n(static_cast<AccType>(0.f), ExactTagType{});
 
-            sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
-            sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+                    // Compute S elements per iteration
+                    int x = window.x().start();
+                    for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+                    {
+                        auto vec_input_val = wrapper::vloadq(input_ptr + x);
+                        vector_float_sum(vec_sum_h_w, vec_sum_squares_h_w, vec_input_val);
+                    }
 
-            // Compute left-over elements
-            for(; x < window.x().end(); ++x)
-            {
-                const auto value = static_cast<AccType>(*(input_ptr + x));
-                sum_h_w += value;
-                sum_squares_h_w += value * value;
-            }
+                    auto vec2_sum_h_w = wrapper::vpadd(wrapper::vgethigh(vec_sum_h_w), wrapper::vgetlow(vec_sum_h_w));
+                    auto vec2_sum_squares_h_w =
+                        wrapper::vpadd(wrapper::vgethigh(vec_sum_squares_h_w), wrapper::vgetlow(vec_sum_squares_h_w));
+
+                    vec2_sum_h_w         = wrapper::vpadd(vec2_sum_h_w, vec2_sum_h_w);
+                    vec2_sum_squares_h_w = wrapper::vpadd(vec2_sum_squares_h_w, vec2_sum_squares_h_w);
+
+                    sum_h_w += wrapper::vgetlane(vec2_sum_h_w, 0);
+                    sum_squares_h_w += wrapper::vgetlane(vec2_sum_squares_h_w, 0);
+
+                    // Compute left-over elements
+                    for (; x < window.x().end(); ++x)
+                    {
+                        const auto value = static_cast<AccType>(*(input_ptr + x));
+                        sum_h_w += value;
+                        sum_squares_h_w += value * value;
+                    }
+                },
+                input_plane_it, output_plane_it);
+
+            const auto mean_h_w = sum_h_w / elements_plane;
+            const auto var_h_w  = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
+
+            const auto multip_h_w     = gamma / std::sqrt(var_h_w + epsilon);
+            const auto vec_mean_h_w   = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
+            const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
+            const auto vec_beta       = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
+
+            execute_window_loop(
+                win_plane,
+                [&](const Coordinates &)
+                {
+                    auto input_ptr  = reinterpret_cast<T *>(input_plane_it.ptr());
+                    auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
+
+                    // Compute S elements per iteration
+                    int x = window.x().start();
+                    //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
+                    for (; x <= (window.x().end() - window_step_x); x += window_step_x)
+                    {
+                        const auto vec_val        = wrapper::vloadq(input_ptr + x);
+                        const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
+                        wrapper::vstore(output_ptr + x, normalized_vec);
+                    }
+
+                    // Compute left-over elements
+                    for (; x < window.x().end(); ++x)
+                    {
+                        const auto val    = static_cast<AccType>(*(input_ptr + x));
+                        *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta);
+                    }
+                },
+                input_plane_it, output_plane_it);
         },
-        input_plane_it, output_plane_it);
-
-        const auto mean_h_w = sum_h_w / elements_plane;
-        const auto var_h_w  = sum_squares_h_w / elements_plane - mean_h_w * mean_h_w;
-
-        const auto multip_h_w     = gamma / std::sqrt(var_h_w + epsilon);
-        const auto vec_mean_h_w   = wrapper::vdup_n(static_cast<AccType>(mean_h_w), ExactTagType{});
-        const auto vec_multip_h_w = wrapper::vdup_n(static_cast<AccType>(multip_h_w), ExactTagType{});
-        const auto vec_beta       = wrapper::vdup_n(static_cast<AccType>(beta), ExactTagType{});
-
-        execute_window_loop(win_plane, [&](const Coordinates &)
-        {
-            auto input_ptr  = reinterpret_cast<T *>(input_plane_it.ptr());
-            auto output_ptr = reinterpret_cast<T *>(output_plane_it.ptr());
-
-            // Compute S elements per iteration
-            int x = window.x().start();
-            //auto vec_val = wrapper::vdup_n(static_cast<T>(0.0f), ExactTagType{});
-            for(; x <= (window.x().end() - window_step_x); x += window_step_x)
-            {
-                const auto vec_val        = wrapper::vloadq(input_ptr + x);
-                const auto normalized_vec = vector_float_norm(vec_val, vec_mean_h_w, vec_multip_h_w, vec_beta);
-                wrapper::vstore(output_ptr + x, normalized_vec);
-            }
-
-            // Compute left-over elements
-            for(; x < window.x().end(); ++x)
-            {
-                const auto val    = static_cast<AccType>(*(input_ptr + x));
-                *(output_ptr + x) = static_cast<T>((val - mean_h_w) * multip_h_w + beta);
-            }
-        },
-        input_plane_it, output_plane_it);
-    },
-    input_it);
+        input_it);
 }
 
-template void instance_normalization_nchw<float>(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+template void instance_normalization_nchw<float>(
+    ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
 } // namespace cpu
 } // namespace arm_compute
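
For orientation, the loops above implement per-plane instance normalisation: one
pass accumulates the sum and sum of squares over each H x W plane, and a second
pass applies (x - mean) * gamma / sqrt(var + epsilon) + beta. A scalar reference
of the same computation (illustrative, not part of the patch):

    #include <cmath>
    #include <cstddef>

    void instance_norm_plane(const float *in, float *out, std::size_t n,
                             float gamma, float beta, float epsilon)
    {
        float sum = 0.f, sum_sq = 0.f;
        for (std::size_t i = 0; i < n; ++i)
        {
            sum += in[i];
            sum_sq += in[i] * in[i];
        }
        const float mean   = sum / n;
        const float var    = sum_sq / n - mean * mean; // E[x^2] - E[x]^2
        const float multip = gamma / std::sqrt(var + epsilon);
        for (std::size_t i = 0; i < n; ++i)
        {
            out[i] = (in[i] - mean) * multip + beta;
        }
    }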
diff --git a/src/cpu/kernels/instancenorm/generic/neon/impl.h b/src/cpu/kernels/instancenorm/generic/neon/impl.h
index 0ddfcdd..e1cc748 100644
--- a/src/cpu/kernels/instancenorm/generic/neon/impl.h
+++ b/src/cpu/kernels/instancenorm/generic/neon/impl.h
@@ -32,13 +32,15 @@
 namespace cpu
 {
 template <typename T, typename AccType = T>
-void instance_normalization_nchw(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
+void instance_normalization_nchw(
+    ITensor *input, ITensor *output, float gamma, float beta, float epsilon, const Window &window);
 
 template <typename InputType, typename AccType = InputType>
 void vector_float_sum(AccType &result, AccType &result_square, const InputType &inputs);
 
 template <typename InputType, typename AccType = InputType>
-InputType vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta);
+InputType
+vector_float_norm(const InputType &inputs, const AccType &vec_mean, const AccType &vec_multip, const AccType &vec_beta);
 } // namespace cpu
 } // namespace arm_compute
 #endif //define SRC_CORE_SVE_KERNELS_INSTANCENORM_IMPL_H
diff --git a/src/cpu/kernels/instancenorm/list.h b/src/cpu/kernels/instancenorm/list.h
index 54f1d32..51b496c 100644
--- a/src/cpu/kernels/instancenorm/list.h
+++ b/src/cpu/kernels/instancenorm/list.h
@@ -27,8 +27,9 @@
 {
 namespace cpu
 {
-#define DECLARE_INSTANCENORM_KERNEL(func_name) \
-    void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, const Window &window)
+#define DECLARE_INSTANCENORM_KERNEL(func_name)                                                                        \
+    void func_name(ITensor *input, ITensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision, \
+                   const Window &window)
 DECLARE_INSTANCENORM_KERNEL(neon_fp32_instancenorm);
 DECLARE_INSTANCENORM_KERNEL(neon_fp16_instancenorm);
 #undef DECLARE_INSTANCENORM_KERNEL
diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
index b503a8b..32d9ca4 100644
--- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
+++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.cpp
@@ -24,18 +24,17 @@
 #include "src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h"
 
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/assembly/depthwise.hpp"
 #include "src/core/utils/AssemblyUtils.h"
 
-#include "src/core/NEON/kernels/assembly/depthwise.hpp"
-
 #include "depthwise_common.hpp"
-
 #include <arm_neon.h>
 
 namespace arm_compute
@@ -54,9 +53,13 @@
 constexpr unsigned int idx_batches  = 3;
 
 template <typename TSrc, typename TWeights, typename TDst>
-void create_arm_dwc(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
-                    const ConvolutionInfo &info, const CPUInfo &cpu_info,
-                    std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel, std::string &_name)
+void create_arm_dwc(const ITensorInfo                                      *src,
+                    const ITensorInfo                                      *weights,
+                    ITensorInfo                                            *dst,
+                    const ConvolutionInfo                                  &info,
+                    const CPUInfo                                          &cpu_info,
+                    std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
+                    std::string                                            &_name)
 {
     unsigned int stride_cols{};
     unsigned int stride_rows{};
@@ -79,13 +82,13 @@
 
     const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
 
-    arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols,
-                                            n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
-                                            padding, activation, nullptr);
+    arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
+                                            dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels,
+                                            dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr);
 
     // Configure assembly depthwise convolution kernel
     auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst>(args);
-    if(dwc_kernel_asm == nullptr)
+    if (dwc_kernel_asm == nullptr)
     {
         // Configuration not supported: Leave function unconfigured:
         return;
@@ -96,11 +99,16 @@
 }
 
 template <typename TSrc, typename TWeights, typename TDst>
-void create_arm_dwc_quant(const ITensorInfo *src, const ITensorInfo *weights, ITensorInfo *dst,
-                          const ConvolutionInfo &info, const CPUInfo &cpu_info,
+void create_arm_dwc_quant(const ITensorInfo                                      *src,
+                          const ITensorInfo                                      *weights,
+                          ITensorInfo                                            *dst,
+                          const ConvolutionInfo                                  &info,
+                          const CPUInfo                                          &cpu_info,
                           std::unique_ptr<arm_conv::depthwise::IDepthwiseCommon> &kernel,
-                          std::vector<int32_t> &multipliers, std::vector<int32_t> &right_shifts, std::vector<int32_t> &left_shifts,
-                          std::string &_name)
+                          std::vector<int32_t>                                   &multipliers,
+                          std::vector<int32_t>                                   &right_shifts,
+                          std::vector<int32_t>                                   &left_shifts,
+                          std::string                                            &_name)
 {
     unsigned int stride_cols{};
     unsigned int stride_rows{};
@@ -123,9 +131,9 @@
 
     const arm_gemm::Activation activation = assembly_utils::map_to_arm_gemm_activation(info.act_info);
 
-    arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols, dilation_rows, dilation_cols,
-                                            n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, info.depth_multiplier,
-                                            padding, activation, nullptr);
+    arm_conv::depthwise::DepthwiseArgs args(&cpu_info, kernel_rows, kernel_cols, stride_rows, stride_cols,
+                                            dilation_rows, dilation_cols, n_batches, src_rows, src_cols, n_channels,
+                                            dst_rows, dst_cols, info.depth_multiplier, padding, activation, nullptr);
 
     const auto src_qinfo     = src->quantization_info().uniform();
     const auto weights_qinfo = weights->quantization_info();
@@ -135,64 +143,50 @@
 
     multipliers.resize(num_filters);
     std::vector<int32_t> dst_shifts(num_filters);
-    quantization::compute_quantized_multipliers_and_shifts(src,
-                                                           weights,
-                                                           dst,
-                                                           multipliers.data(),
-                                                           dst_shifts.data());
+    quantization::compute_quantized_multipliers_and_shifts(src, weights, dst, multipliers.data(), dst_shifts.data());
 
     // Quantize activation bounds
     int32_t min_activation = std::numeric_limits<TSrc>::lowest();
     int32_t max_activation = std::numeric_limits<TSrc>::max();
-    if(info.act_info.enabled())
+    if (info.act_info.enabled())
     {
-        std::tie(min_activation, max_activation) = get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo);
+        std::tie(min_activation, max_activation) =
+            get_quantized_activation_min_max(info.act_info, src->data_type(), dst_qinfo);
     }
 
     // Set quantization parameters for assembly kernels
     arm_gemm::Requantize32 requant_args{};
-    if(is_data_type_quantized_per_channel(weights->data_type()))
+    if (is_data_type_quantized_per_channel(weights->data_type()))
     {
         left_shifts.resize(num_filters);
         right_shifts.resize(num_filters);
         bool need_left_shift = false; // Select more optimized path if left shift is not needed
-        for(unsigned int i = 0; i < num_filters; ++i)
+        for (unsigned int i = 0; i < num_filters; ++i)
         {
             left_shifts[i]  = std::max(-dst_shifts[i], static_cast<int32_t>(0));
             right_shifts[i] = std::min(-dst_shifts[i], static_cast<int32_t>(0));
-            if(dst_shifts[i] < 0 && !need_left_shift)
+            if (dst_shifts[i] < 0 && !need_left_shift)
             {
                 need_left_shift = true;
             }
         }
 
-        requant_args = arm_gemm::Requantize32(nullptr,
-                                              0,
-                                              src_qinfo.offset,
-                                              weights_qinfo.uniform().offset,
-                                              dst_qinfo.offset,
-                                              (need_left_shift) ? left_shifts.data() : nullptr,
-                                              right_shifts.data(),
-                                              multipliers.data(),
-                                              static_cast<TSrc>(min_activation),
-                                              static_cast<TSrc>(max_activation));
+        requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset,
+                                              dst_qinfo.offset, (need_left_shift) ? left_shifts.data() : nullptr,
+                                              right_shifts.data(), multipliers.data(),
+                                              static_cast<TSrc>(min_activation), static_cast<TSrc>(max_activation));
     }
     else
     {
-        requant_args = arm_gemm::Requantize32(nullptr,
-                                              0,
-                                              src_qinfo.offset,
-                                              weights_qinfo.uniform().offset,
-                                              dst_qinfo.offset,
-                                              -dst_shifts[0],
-                                              multipliers[0],
-                                              static_cast<TSrc>(min_activation),
-                                              static_cast<TSrc>(max_activation));
+        requant_args = arm_gemm::Requantize32(nullptr, 0, src_qinfo.offset, weights_qinfo.uniform().offset,
+                                              dst_qinfo.offset, -dst_shifts[0], multipliers[0],
+                                              static_cast<TSrc>(min_activation), static_cast<TSrc>(max_activation));
     }
 
     // Configure assembly depthwise convolution kernel with requantization
-    auto dwc_kernel_asm = arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args);
-    if(dwc_kernel_asm == nullptr)
+    auto dwc_kernel_asm =
+        arm_conv::depthwise::depthwise<TSrc, TWeights, TDst, arm_gemm::Requantize32>(args, requant_args);
+    if (dwc_kernel_asm == nullptr)
     {
         // Configuration not supported: Leave function unconfigured:
         return;
@@ -203,18 +197,18 @@
 } // namespace
 
 CpuDepthwiseConv2dAssemblyWrapperKernel::CpuDepthwiseConv2dAssemblyWrapperKernel()
-    : _kernel_asm(nullptr),
-      _multipliers(),
-      _left_shifts(),
-      _right_shifts(),
-      _name()
+    : _kernel_asm(nullptr), _multipliers(), _left_shifts(), _right_shifts(), _name()
 {
 }
 
 CpuDepthwiseConv2dAssemblyWrapperKernel::~CpuDepthwiseConv2dAssemblyWrapperKernel() = default;
 
-void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *, ITensorInfo *dst,
-                                                        const ConvolutionInfo &info, const CPUInfo &cpu_info)
+void CpuDepthwiseConv2dAssemblyWrapperKernel::configure(const ITensorInfo *src,
+                                                        const ITensorInfo *weights,
+                                                        const ITensorInfo *,
+                                                        ITensorInfo           *dst,
+                                                        const ConvolutionInfo &info,
+                                                        const CPUInfo         &cpu_info)
 {
     ARM_COMPUTE_UNUSED(cpu_info);
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
@@ -225,24 +219,30 @@
     _name = "CpuDepthwiseConv2dAssemblyWrapperKernel";
     std::string asm_kernel_name("");
 #if defined(__aarch64__)
-    switch(src->data_type())
+    switch (src->data_type())
     {
         case DataType::QASYMM8:
-            if(is_data_type_quantized_per_channel(weights->data_type()))
+            if (is_data_type_quantized_per_channel(weights->data_type()))
             {
-                create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
+                create_arm_dwc_quant<uint8_t, int8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm,
+                                                               _multipliers, _right_shifts, _left_shifts,
+                                                               asm_kernel_name);
             }
             else
             {
-                create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
+                create_arm_dwc_quant<uint8_t, uint8_t, uint8_t>(src, weights, dst, info, cpu_info, _kernel_asm,
+                                                                _multipliers, _right_shifts, _left_shifts,
+                                                                asm_kernel_name);
             }
             break;
         case DataType::QASYMM8_SIGNED:
-            create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers, _right_shifts, _left_shifts, asm_kernel_name);
+            create_arm_dwc_quant<int8_t, int8_t, int8_t>(src, weights, dst, info, cpu_info, _kernel_asm, _multipliers,
+                                                         _right_shifts, _left_shifts, asm_kernel_name);
             break;
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
         case DataType::F16:
-            create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm, asm_kernel_name);
+            create_arm_dwc<float16_t, float16_t, float16_t>(src, weights, dst, info, cpu_info, _kernel_asm,
+                                                            asm_kernel_name);
             break;
 #endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
         case DataType::F32:
@@ -255,13 +255,17 @@
 
     Window win = calculate_max_window(*dst, Steps());
     ICpuKernel::configure(win);
-    if(_kernel_asm != nullptr)
+    if (_kernel_asm != nullptr)
     {
         _name += "/" + asm_kernel_name;
     }
 }
 
-Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dAssemblyWrapperKernel::validate(const ITensorInfo     *src,
+                                                         const ITensorInfo     *weights,
+                                                         const ITensorInfo     *bias,
+                                                         const ITensorInfo     *dst,
+                                                         const ConvolutionInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
 
@@ -269,10 +273,12 @@
     ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
 #endif // !defined(__aarch64__)
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Only NHWC is supported by assembly kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC,
+                                    "Only NHWC is supported by assembly kernels");
 
-    if(is_data_type_quantized_per_channel(weights->data_type()))
+    if (is_data_type_quantized_per_channel(weights->data_type()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
         ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(0) != weights->quantization_info().scale().size());
@@ -282,12 +288,12 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
     }
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(0));
 
-        if(is_data_type_quantized(src->data_type()))
+        if (is_data_type_quantized(src->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         }
@@ -297,7 +303,7 @@
         }
     }
 
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         const TensorShape dst_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
@@ -305,17 +311,15 @@
     }
 
     // Assembly kernels cannot work with padding greater than the kernel.
-    const auto &padding = info.pad_stride_info;
-    const auto &dilation = info.dilation;
+    const auto &padding   = info.pad_stride_info;
+    const auto &dilation  = info.dilation;
     const auto &wei_shape = weights->tensor_shape();
 
     const auto dilated_wei_w = wei_shape[1] + (wei_shape[1] - 1) * (dilation.x() - 1);
     const auto dilated_wei_h = wei_shape[2] + (wei_shape[2] - 1) * (dilation.y() - 1);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(
-        padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w ||
-        padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h
-    );
+    ARM_COMPUTE_RETURN_ERROR_ON(padding.pad_left() >= dilated_wei_w || padding.pad_right() >= dilated_wei_w ||
+                                padding.pad_top() >= dilated_wei_h || padding.pad_bottom() >= dilated_wei_h);
 
     return Status{};
 }
@@ -351,13 +355,12 @@
     const size_t ld_dst_row   = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
     const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
 
-    _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch,
-                         parameters_ptr,
-                         dst_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
-                         working_space, info.thread_id, info.num_threads);
+    _kernel_asm->execute(src_ptr, ld_src_col, ld_src_row, ld_src_batch, parameters_ptr, dst_ptr, ld_dst_col, ld_dst_row,
+                         ld_dst_batch, working_space, info.thread_id, info.num_threads);
 }
 
-void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row)
+void CpuDepthwiseConv2dAssemblyWrapperKernel::pack_parameters(
+    void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weight_row)
 {
     _kernel_asm->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weight_row);
 }
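
The reworked padding check near the end of validate() compares each pad against
the kernel's dilated extent, k + (k - 1) * (d - 1). As a worked example: a 3x3
kernel with dilation 2 spans 3 + (3 - 1) * (2 - 1) = 5 input rows/columns, so a
pad of 5 or more on any side would lie entirely outside the kernel's reach and
is rejected.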
diff --git a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
index f61cb1b..fadaefb 100644
--- a/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/internal/CpuDepthwiseConv2dAssemblyWrapperKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_WRAPPER_KERNEL_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuKernel.h"
 #include "src/cpu/kernels/CpuKernelSelectionTypes.h"
@@ -35,8 +36,8 @@
 {
 // Forward declarations
 class IDepthwiseCommon;
-} // depthwise
-} // arm_conv
+} // namespace depthwise
+} // namespace arm_conv
 
 namespace arm_compute
 {
@@ -66,7 +67,12 @@
      * @param[in]  info     Depthwise convolution layer meta-data.
      * @param[in]  cpu_info CPU information needed to select the most appropriate kernel.
      */
-    void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info, const CPUInfo &cpu_info);
+    void configure(const ITensorInfo     *src,
+                   const ITensorInfo     *weights,
+                   const ITensorInfo     *bias,
+                   ITensorInfo           *dst,
+                   const ConvolutionInfo &info,
+                   const CPUInfo         &cpu_info);
 
     /** Indicates whether or not this function can be used to process the given parameters.
      *
@@ -74,10 +80,14 @@
      *
      * @return a status.
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
+    static Status validate(const ITensorInfo     *src,
+                           const ITensorInfo     *weights,
+                           const ITensorInfo     *bias,
+                           const ITensorInfo     *dst,
+                           const ConvolutionInfo &info);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    void        run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
     const char *name() const override;
 
     /** Pack bias and weights in a storage space for the assembly kernel
@@ -88,7 +98,8 @@
      * @param[in] ld_weights_col Columns displacement for the weights tensor.
      * @param[in] ld_weights_row Rows displacement for the weights tensor.
      */
-    void pack_parameters(void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row);
+    void pack_parameters(
+        void *parameters_ptr, void *bias_ptr, void *weights_ptr, size_t ld_weights_col, size_t ld_weights_row);
 
     /** Get the amount of storage space required for the rearranged weights and bias.
      *
diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
index 10ff418..a161c80 100644
--- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
+++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.cpp
@@ -22,14 +22,16 @@
  * SOFTWARE.
  */
 #include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
+
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/CPP/Validate.h"
-#include "src/core/NEON/INEKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/INEKernel.h"
 
 #include <arm_neon.h>
 
@@ -41,7 +43,10 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 
-void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::configure(const ITensorInfo      *src,
+                                               ITensorInfo            *dst,
+                                               const PoolingLayerInfo &info,
+                                               const CPUInfo          &cpu_info)
 {
     ARM_COMPUTE_UNUSED(cpu_info);
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -52,10 +57,10 @@
 #if defined(__aarch64__)
     const bool requantize = src->quantization_info() != dst->quantization_info();
 
-    switch(src->data_type())
+    switch (src->data_type())
     {
         case DataType::QASYMM8:
-            if(requantize)
+            if (requantize)
             {
                 create_arm_pooling_requant<uint8_t, uint8_t>(src, dst, info, cpu_info);
             }
@@ -65,7 +70,7 @@
             }
             break;
         case DataType::QASYMM8_SIGNED:
-            if(requantize)
+            if (requantize)
             {
                 create_arm_pooling_requant<int8_t, int8_t>(src, dst, info, cpu_info);
             }
@@ -91,7 +96,8 @@
     INEKernel::configure(win);
 }
 
-Status CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
+Status
+CpuPool2dAssemblyWrapperKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
 
@@ -99,43 +105,52 @@
     ARM_COMPUTE_RETURN_ERROR_MSG("32-bit is not supported by assembly kernels");
 #endif /* __aarch64__ */
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC), "Only NHWC is supported by assembly kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((src->data_layout() != DataLayout::NHWC) || (info.data_layout != DataLayout::NHWC),
+                                    "Only NHWC is supported by assembly kernels");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.pool_type != PoolingType::AVG) && (info.pool_type != PoolingType::MAX),
                                     "Only AVG and MAX pooling are supported by assembly kernels");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(info), "Pooling region that is entirely outside input tensor is unsupported by assembly kernels");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        is_pool_region_entirely_outside_input(info),
+        "Pooling region that is entirely outside input tensor is unsupported by assembly kernels");
 
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
 
         const auto src_qinfo = src->quantization_info().uniform();
         const auto dst_qinfo = dst->quantization_info().uniform();
 
-        if(src_qinfo != dst_qinfo)
+        if (src_qinfo != dst_qinfo)
         {
             const float multiplier = src_qinfo.scale / dst_qinfo.scale;
             int32_t     dst_multiplier{};
             int32_t     dst_shift{};
-            ARM_COMPUTE_RETURN_ERROR_ON(quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
+            ARM_COMPUTE_RETURN_ERROR_ON(
+                quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift));
         }
         else
         {
-            if(src->data_type() == DataType::QASYMM8)
+            if (src->data_type() == DataType::QASYMM8)
             {
                 const bool has_padding = info.pad_stride_info.has_padding();
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    !info.exclude_padding && has_padding,
+                    "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
             }
         }
     }
     else
     {
-        if(src->data_type() == DataType::QASYMM8)
+        if (src->data_type() == DataType::QASYMM8)
         {
             // If dst is not configured, the quantization info is the same
             const bool has_padding = info.pad_stride_info.has_padding();
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(!info.exclude_padding && has_padding, "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                !info.exclude_padding && has_padding,
+                "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
         }
     }
     return Status{};
@@ -154,9 +169,10 @@
     ITensor       *dst       = tensors.get_tensor(TensorType::ACL_DST);
     ITensor       *workspace = tensors.get_tensor(TensorType::ACL_INT_0);
 
-    const auto in_ptr        = src->buffer() + src->info()->offset_first_element_in_bytes();
-    auto       out_ptr       = dst->buffer() + dst->info()->offset_first_element_in_bytes();
-    auto       working_space = (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
+    const auto in_ptr  = src->buffer() + src->info()->offset_first_element_in_bytes();
+    auto       out_ptr = dst->buffer() + dst->info()->offset_first_element_in_bytes();
+    auto       working_space =
+        (workspace == nullptr) ? nullptr : workspace->buffer() + workspace->info()->offset_first_element_in_bytes();
 
     const auto src_shape   = src->info()->tensor_shape();
     const auto dst_shape   = dst->info()->tensor_shape();
@@ -170,8 +186,7 @@
     const size_t ld_dst_row   = ld_dst_col * (dst_shape[1] + dst_padding.top + dst_padding.bottom);
     const size_t ld_dst_batch = ld_dst_row * dst_shape[2];
 
-    _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch,
-                         out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
+    _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
                          working_space, info.thread_id, info.num_threads);
 }
 
@@ -186,9 +201,14 @@
 }
 
 template <typename Typesrc, typename Typedst>
-void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(const ITensorInfo      *src,
+                                                        ITensorInfo            *dst,
+                                                        const PoolingLayerInfo &info,
+                                                        const CPUInfo          &cpu_info)
 {
-    const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
+    const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG)
+                                                         ? arm_conv::pooling::PoolingType::AVERAGE
+                                                         : arm_conv::pooling::PoolingType::MAX;
 
     arm_conv::pooling::PoolingWindow window{};
     window.cols = static_cast<unsigned int>(info.pool_size.x());
@@ -197,7 +217,8 @@
     arm_conv::pooling::PoolingStride stride{};
     std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
 
-    const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
+    const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(),
+                                                   info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()};
 
     constexpr unsigned int idx_width    = 1;
     constexpr unsigned int idx_height   = 2;
@@ -211,11 +232,12 @@
     const unsigned int dst_rows   = dst->dimension(idx_height);
     const unsigned int dst_cols   = dst->dimension(idx_width);
 
-    arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
+    arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows,
+                                        src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
 
     // Configure assembly pooling kernel
     auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(args);
-    if(pooling_kernel_asm == nullptr)
+    if (pooling_kernel_asm == nullptr)
     {
         // Configuration not supported: Leave function unconfigured:
         return;
@@ -225,9 +247,14 @@
 }
 
 template <typename Typesrc, typename Typedst>
-void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info)
+void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(const ITensorInfo      *src,
+                                                                ITensorInfo            *dst,
+                                                                const PoolingLayerInfo &info,
+                                                                const CPUInfo          &cpu_info)
 {
-    const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG) ? arm_conv::pooling::PoolingType::AVERAGE : arm_conv::pooling::PoolingType::MAX;
+    const arm_conv::pooling::PoolingType pool_type = (info.pool_type == PoolingType::AVG)
+                                                         ? arm_conv::pooling::PoolingType::AVERAGE
+                                                         : arm_conv::pooling::PoolingType::MAX;
 
     arm_conv::pooling::PoolingWindow window{};
     window.cols = static_cast<unsigned int>(info.pool_size.x());
@@ -236,7 +263,8 @@
     arm_conv::pooling::PoolingStride stride{};
     std::tie(stride.cols, stride.rows) = info.pad_stride_info.stride();
 
-    const arm_conv::pooling::PaddingValues padding{ info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(), info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom() };
+    const arm_conv::pooling::PaddingValues padding{info.pad_stride_info.pad_left(), info.pad_stride_info.pad_top(),
+                                                   info.pad_stride_info.pad_right(), info.pad_stride_info.pad_bottom()};
 
     constexpr unsigned int idx_width    = 1;
     constexpr unsigned int idx_height   = 2;
@@ -250,7 +278,8 @@
     const unsigned int dst_rows   = dst->dimension(idx_height);
     const unsigned int dst_cols   = dst->dimension(idx_width);
 
-    arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows, src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
+    arm_conv::pooling::PoolingArgs args(&cpu_info, pool_type, window, stride, info.exclude_padding, n_batches, src_rows,
+                                        src_cols, n_channels, dst_rows, dst_cols, padding, nullptr);
 
     const auto src_qinfo = src->quantization_info().uniform();
     const auto dst_qinfo = dst->quantization_info().uniform();
@@ -260,15 +289,15 @@
     int32_t     dst_shift{};
     quantization::calculate_quantized_multiplier(multiplier, &dst_multiplier, &dst_shift);
 
-    const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset,
-                                                       dst_qinfo.offset,
+    const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, dst_qinfo.offset,
                                                        dst_shift, // left shift
                                                        0,         // right shift
                                                        dst_multiplier);
 
     // Configure assembly pooling kernel with requantization
-    auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
-    if(pooling_kernel_asm == nullptr)
+    auto pooling_kernel_asm =
+        arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(args, requant_args);
+    if (pooling_kernel_asm == nullptr)
     {
         // Configuration not supported: Leave function unconfigured:
         return;
diff --git a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
index 8713d5c..b4ff1e6 100644
--- a/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
+++ b/src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h
@@ -25,8 +25,9 @@
 #define ARM_COMPUTE_CPU_POOL2D_ASSEMBLY_WRAPPER_KERNEL_H
 
 #include "arm_compute/core/Types.h"
-#include "src/core/NEON/kernels/assembly/pooling.hpp"
+
 #include "src/core/common/Macros.h"
+#include "src/core/NEON/kernels/assembly/pooling.hpp"
 #include "src/cpu/ICpuKernel.h"
 #include "src/cpu/kernels/CpuKernelSelectionTypes.h"
 
@@ -101,7 +102,8 @@
      * @param[in] info Pooling layer meta-data.
      */
     template <typename Typesrc, typename Typedst>
-    void create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+    void
+    create_arm_pooling(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
 
     /** Helper function to create the assembly kernel with requantization support
      *
@@ -110,9 +112,12 @@
      * @param[in] info Pooling layer meta-data.
      */
     template <typename Typesrc, typename Typedst>
-    void create_arm_pooling_requant(const ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, const CPUInfo &cpu_info);
+    void create_arm_pooling_requant(const ITensorInfo      *src,
+                                    ITensorInfo            *dst,
+                                    const PoolingLayerInfo &info,
+                                    const CPUInfo          &cpu_info);
 
-    std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{ nullptr };
+    std::unique_ptr<arm_conv::pooling::IPoolingCommon> _kernel_asm{nullptr};
 
     /** Return minimum workload size of the relevant kernel
      *
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
index 661c3d7..6c6527d 100644
--- a/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/l2normlayer/generic/neon/fp16.cpp
@@ -32,13 +32,15 @@
 {
 namespace cpu
 {
-void neon_fp16_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
+void neon_fp16_l2_normalize_x(
+    const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
 {
     ARM_COMPUTE_UNUSED(unused_axis);
     return l2_normalize_x<float16_t, 8>(in, sum, out, epsilon, window);
 }
 
-void neon_fp16_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+void neon_fp16_l2_normalize_yz(
+    const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
 {
     return l2_normalize_yz<float16_t, 8>(in, sum, out, epsilon, window, axis);
 }
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
index be32bdc..5208770 100644
--- a/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/l2normlayer/generic/neon/fp32.cpp
@@ -22,21 +22,23 @@
  * SOFTWARE.
  */
 
-#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h"
-
 #include "arm_compute/core/Helpers.h"
 
+#include "src/cpu/kernels/l2normlayer/generic/neon/impl.h"
+
 namespace arm_compute
 {
 namespace cpu
 {
-void neon_fp32_l2_normalize_x(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
+void neon_fp32_l2_normalize_x(
+    const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t unused_axis)
 {
     ARM_COMPUTE_UNUSED(unused_axis);
     return l2_normalize_x<float, 4>(in, sum, out, epsilon, window);
 }
 
-void neon_fp32_l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+void neon_fp32_l2_normalize_yz(
+    const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
 {
     return l2_normalize_yz<float, 4>(in, sum, out, epsilon, window, axis);
 }
diff --git a/src/cpu/kernels/l2normlayer/generic/neon/impl.h b/src/cpu/kernels/l2normlayer/generic/neon/impl.h
index a06cdd3..6bd1929 100644
--- a/src/cpu/kernels/l2normlayer/generic/neon/impl.h
+++ b/src/cpu/kernels/l2normlayer/generic/neon/impl.h
@@ -26,8 +26,9 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 #include <cstddef>
 
@@ -51,33 +52,36 @@
     Iterator sum_it(sum, win_collapsed);
     Iterator output_it(out, win_collapsed);
 
-    execute_window_loop(win_collapsed, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
-        const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-
-        const T    sum_value      = *reinterpret_cast<const T *>(sum_it.ptr());
-        const T    norm_value     = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)));
-        const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{});
-
-        // Compute elements over vector steps
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win_collapsed,
+        [&](const Coordinates &)
         {
-            wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
-        }
+            const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
+            const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            out_ptr[x] = in_ptr[x] * norm_value;
-        }
-    },
-    input_it, sum_it, output_it);
+            const T    sum_value      = *reinterpret_cast<const T *>(sum_it.ptr());
+            const T    norm_value     = static_cast<T>(1.f) / std::sqrt(std::max(sum_value, static_cast<T>(epsilon)));
+            const auto vec_norm_value = wrapper::vdup_n(norm_value, ExactTagType{});
+
+            // Compute elements over vector steps
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                out_ptr[x] = in_ptr[x] * norm_value;
+            }
+        },
+        input_it, sum_it, output_it);
 }
 
 template <typename T, int S>
-void l2_normalize_yz(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+void l2_normalize_yz(
+    const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
 {
     using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;
 
@@ -97,28 +101,30 @@
 
     const auto vec_eps = wrapper::vdup_n(static_cast<T>(epsilon), ExactTagType{});
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
-        const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
-        const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-
-        // Compute elements over vector steps
-        int x = window_start_x;
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps));
-            wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
-        }
+            const auto in_ptr  = reinterpret_cast<const T *>(input_it.ptr());
+            const auto sum_ptr = reinterpret_cast<const T *>(sum_it.ptr());
+            const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon)));
-            out_ptr[x]         = in_ptr[x] * norm_value;
-        }
-    },
-    input_it, sum_it, output_it);
+            // Compute elements over vector steps
+            int x = window_start_x;
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto vec_norm_value = wrapper::vinvsqrt(wrapper::vmax(wrapper::vloadq(sum_ptr + x), vec_eps));
+                wrapper::vstore(out_ptr + x, wrapper::vmul(wrapper::vloadq(in_ptr + x), vec_norm_value));
+            }
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                const T norm_value = static_cast<T>(1.f) / std::sqrt(std::max(sum_ptr[x], static_cast<T>(epsilon)));
+                out_ptr[x]         = in_ptr[x] * norm_value;
+            }
+        },
+        input_it, sum_it, output_it);
 }
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/l2normlayer/list.h b/src/cpu/kernels/l2normlayer/list.h
index 2bad7f5..e2a879d 100644
--- a/src/cpu/kernels/l2normlayer/list.h
+++ b/src/cpu/kernels/l2normlayer/list.h
@@ -27,8 +27,9 @@
 {
 namespace cpu
 {
-#define DECLARE_L2NORMLAYER_KERNEL(func_name) \
-    void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, size_t axis)
+#define DECLARE_L2NORMLAYER_KERNEL(func_name)                                                                \
+    void func_name(const ITensor *in, const ITensor *sum, ITensor *out, float epsilon, const Window &window, \
+                   size_t axis)
 
 DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_x);
 DECLARE_L2NORMLAYER_KERNEL(neon_fp16_l2_normalize_yz);
diff --git a/src/cpu/kernels/lut/generic/neon/u8.cpp b/src/cpu/kernels/lut/generic/neon/u8.cpp
index 8ab647b..5516f5b 100644
--- a/src/cpu/kernels/lut/generic/neon/u8.cpp
+++ b/src/cpu/kernels/lut/generic/neon/u8.cpp
@@ -32,376 +32,374 @@
 #ifdef __aarch64__
 
 void lut_u8_neon(
-    const uint8_t        *table,
-    size_t                num_strings,
-    size_t                string_length,
-    const uint8_t *const *input,
-    uint8_t *const       *output)
+    const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output)
 {
-    __asm__ __volatile__(
-        "ldr q16, [%x[table], #0x0]\n"
-        "ldr q17, [%x[table], #0x10]\n"
-        "mov x23, #0x0\n"
-        "ldr q18, [%x[table], #0x20]\n"
-        "ldr q19, [%x[table], #0x30]\n"
-        "ldr q20, [%x[table], #0x40]\n"
-        "ldr q21, [%x[table], #0x50]\n"
-        "ldr q22, [%x[table], #0x60]\n"
-        "ldr q23, [%x[table], #0x70]\n"
-        "ldr q24, [%x[table], #0x80]\n"
-        "ldr q25, [%x[table], #0x90]\n"
-        "ldr q26, [%x[table], #0xa0]\n"
-        "ldr q27, [%x[table], #0xb0]\n"
-        "ldr q28, [%x[table], #0xc0]\n"
-        "ldr q29, [%x[table], #0xd0]\n"
-        "ldr q30, [%x[table], #0xe0]\n"
-        "ldr q31, [%x[table], #0xf0]\n"
-        "1:" // string loop
-        "ldr x22, [%x[input], x23, LSL #0x3]\n"
-        "ldr x21, [%x[output], x23, LSL #0x3]\n"
-        "movi v11.16b, #0x40\n"
-        "movi v10.16b, #0x80\n"
-        "movi v9.16b, #0xc0\n"
-        "mov x20, %x[string_length]\n"
-        "2:" // 4 rounds: width loop
-        "cmp x20, #0x30\n"
-        "bge 27f\n"
-        "tbz x20, #5, 10f\n"
-        "ld1 { v8.16b }, [x22], #0x10\n"
-        "ld1 { v13.16b }, [x22], #0x10\n"
-        "tbz x20, #3, 6f\n"
-        "ldr d12, [x22], #0x8\n"
-        "tbz x20, #2, 4f\n"
-        "ld1 { v12.s }[2], [x22], #0x4\n"
-        "tbz x20, #1, 3f\n"
-        "ld1 { v12.h }[6], [x22], #0x2\n"
-        "tbz x20, #0, 26f\n"
-        "ld1 { v12.b }[14], [x22]\n"
-        "b 26f\n"
-        "3:" // 4 rounds: Partial load: partial_1_44
-        "tbz x20, #0, 26f\n"
-        "ld1 { v12.b }[12], [x22]\n"
-        "b 26f\n"
-        "4:" // 4 rounds: Partial load: partial_2_40
-        "tbz x20, #1, 5f\n"
-        "ld1 { v12.h }[4], [x22], #0x2\n"
-        "tbz x20, #0, 26f\n"
-        "ld1 { v12.b }[10], [x22]\n"
-        "b 26f\n"
-        "5:" // 4 rounds: Partial load: partial_1_40
-        "tbz x20, #0, 26f\n"
-        "ld1 { v12.b }[8], [x22]\n"
-        "b 26f\n"
-        "6:" // 4 rounds: Partial load: partial_4_32
-        "tbz x20, #2, 8f\n"
-        "ldr s12, [x22], #0x4\n"
-        "tbz x20, #1, 7f\n"
-        "ld1 { v12.h }[2], [x22], #0x2\n"
-        "tbz x20, #0, 26f\n"
-        "ld1 { v12.b }[6], [x22]\n"
-        "b 26f\n"
-        "7:" // 4 rounds: Partial load: partial_1_36
-        "tbz x20, #0, 26f\n"
-        "ld1 { v12.b }[4], [x22]\n"
-        "b 26f\n"
-        "8:" // 4 rounds: Partial load: partial_2_32
-        "tbz x20, #1, 9f\n"
-        "ldr h12, [x22], #0x2\n"
-        "tbz x20, #0, 26f\n"
-        "ld1 { v12.b }[2], [x22]\n"
-        "b 26f\n"
-        "9:" // 4 rounds: Partial load: partial_1_32
-        "tbz x20, #0, 26f\n"
-        "ldr b12, [x22, #0x0]\n"
-        "b 26f\n"
-        "10:" // 4 rounds: Partial load: partial_16_0
-        "tbz x20, #4, 18f\n"
-        "ld1 { v8.16b }, [x22], #0x10\n"
-        "tbz x20, #3, 14f\n"
-        "ldr d13, [x22], #0x8\n"
-        "tbz x20, #2, 12f\n"
-        "ld1 { v13.s }[2], [x22], #0x4\n"
-        "tbz x20, #1, 11f\n"
-        "ld1 { v13.h }[6], [x22], #0x2\n"
-        "tbz x20, #0, 26f\n"
-        "ld1 { v13.b }[14], [x22]\n"
-        "b 26f\n"
-        "11:" // 4 rounds: Partial load: partial_1_28
-        "tbz x20, #0, 26f\n"
-        "ld1 { v13.b }[12], [x22]\n"
-        "b 26f\n"
-        "12:" // 4 rounds: Partial load: partial_2_24
-        "tbz x20, #1, 13f\n"
-        "ld1 { v13.h }[4], [x22], #0x2\n"
-        "tbz x20, #0, 26f\n"
-        "ld1 { v13.b }[10], [x22]\n"
-        "b 26f\n"
-        "13:" // 4 rounds: Partial load: partial_1_24
-        "tbz x20, #0, 26f\n"
-        "ld1 { v13.b }[8], [x22]\n"
-        "b 26f\n"
-        "14:" // 4 rounds: Partial load: partial_4_16
-        "tbz x20, #2, 16f\n"
-        "ldr s13, [x22], #0x4\n"
-        "tbz x20, #1, 15f\n"
-        "ld1 { v13.h }[2], [x22], #0x2\n"
-        "tbz x20, #0, 26f\n"
-        "ld1 { v13.b }[6], [x22]\n"
-        "b 26f\n"
-        "15:" // 4 rounds: Partial load: partial_1_20
-        "tbz x20, #0, 26f\n"
-        "ld1 { v13.b }[4], [x22]\n"
-        "b 26f\n"
-        "16:" // 4 rounds: Partial load: partial_2_16
-        "tbz x20, #1, 17f\n"
-        "ldr h13, [x22], #0x2\n"
-        "tbz x20, #0, 26f\n"
-        "ld1 { v13.b }[2], [x22]\n"
-        "b 26f\n"
-        "17:" // 4 rounds: Partial load: partial_1_16
-        "tbz x20, #0, 26f\n"
-        "ldr b13, [x22, #0x0]\n"
-        "b 26f\n"
-        "18:" // 4 rounds: Partial load: partial_8_0
-        "tbz x20, #3, 22f\n"
-        "ldr d8, [x22], #0x8\n"
-        "tbz x20, #2, 20f\n"
-        "ld1 { v8.s }[2], [x22], #0x4\n"
-        "tbz x20, #1, 19f\n"
-        "ld1 { v8.h }[6], [x22], #0x2\n"
-        "tbz x20, #0, 26f\n"
-        "ld1 { v8.b }[14], [x22]\n"
-        "b 26f\n"
-        "19:" // 4 rounds: Partial load: partial_1_12
-        "tbz x20, #0, 26f\n"
-        "ld1 { v8.b }[12], [x22]\n"
-        "b 26f\n"
-        "20:" // 4 rounds: Partial load: partial_2_8
-        "tbz x20, #1, 21f\n"
-        "ld1 { v8.h }[4], [x22], #0x2\n"
-        "tbz x20, #0, 26f\n"
-        "ld1 { v8.b }[10], [x22]\n"
-        "b 26f\n"
-        "21:" // 4 rounds: Partial load: partial_1_8
-        "tbz x20, #0, 26f\n"
-        "ld1 { v8.b }[8], [x22]\n"
-        "b 26f\n"
-        "22:" // 4 rounds: Partial load: partial_4_0
-        "tbz x20, #2, 24f\n"
-        "ldr s8, [x22], #0x4\n"
-        "tbz x20, #1, 23f\n"
-        "ld1 { v8.h }[2], [x22], #0x2\n"
-        "tbz x20, #0, 26f\n"
-        "ld1 { v8.b }[6], [x22]\n"
-        "b 26f\n"
-        "23:" // 4 rounds: Partial load: partial_1_4
-        "tbz x20, #0, 26f\n"
-        "ld1 { v8.b }[4], [x22]\n"
-        "b 26f\n"
-        "24:" // 4 rounds: Partial load: partial_2_0
-        "tbz x20, #1, 25f\n"
-        "ldr h8, [x22], #0x2\n"
-        "tbz x20, #0, 26f\n"
-        "ld1 { v8.b }[2], [x22]\n"
-        "b 26f\n"
-        "25:" // 4 rounds: Partial load: partial_1_0
-        "ldr b8, [x22, #0x0]\n"
-        "26:" // 4 rounds: Partial load: Done
-        "b 28f\n"
-        "27:" // 4 rounds: Full load
-        "ldr q8, [x22, #0x0]\n"
-        "ldr q13, [x22, #0x10]\n"
-        "ldr q12, [x22, #0x20]\n"
-        "add x22, x22, #0x30\n"
-        "28:" // 4 rounds: Load done
-        "sub v0.16b, v8.16b, v11.16b\n"
-        "sub v7.16b, v8.16b, v10.16b\n"
-        "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n"
-        "sub v6.16b, v8.16b, v9.16b\n"
-        "sub v5.16b, v13.16b, v11.16b\n"
-        "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n"
-        "sub v4.16b, v13.16b, v10.16b\n"
-        "sub v3.16b, v13.16b, v9.16b\n"
-        "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n"
-        "sub v2.16b, v12.16b, v11.16b\n"
-        "sub v1.16b, v12.16b, v10.16b\n"
-        "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n"
-        "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n"
-        "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n"
-        "orr v8.16b, v8.16b, v0.16b\n"
-        "sub v0.16b, v12.16b, v9.16b\n"
-        "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n"
-        "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n"
-        "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n"
-        "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n"
-        "orr v7.16b, v7.16b, v6.16b\n"
-        "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n"
-        "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n"
-        "orr v13.16b, v13.16b, v5.16b\n"
-        "orr v4.16b, v4.16b, v3.16b\n"
-        "orr v12.16b, v12.16b, v2.16b\n"
-        "cmp x20, #0x30\n"
-        "orr v1.16b, v1.16b, v0.16b\n"
-        "orr v8.16b, v8.16b, v7.16b\n"
-        "orr v13.16b, v13.16b, v4.16b\n"
-        "orr v12.16b, v12.16b, v1.16b\n"
-        "bge 53f\n"
-        "tbz x20, #5, 36f\n"
-        "st1 { v8.16b }, [x21], #0x10\n"
-        "st1 { v13.16b }, [x21], #0x10\n"
-        "tbz x20, #3, 32f\n"
-        "str d12, [x21], #0x8\n"
-        "tbz x20, #2, 30f\n"
-        "st1 { v12.s }[2], [x21], #0x4\n"
-        "tbz x20, #1, 29f\n"
-        "st1 { v12.h }[6], [x21], #0x2\n"
-        "tbz x20, #0, 52f\n"
-        "st1 { v12.b }[14], [x21]\n"
-        "b 52f\n"
-        "29:" // 4 rounds: Partial writeback: partial_1_44
-        "tbz x20, #0, 52f\n"
-        "st1 { v12.b }[12], [x21]\n"
-        "b 52f\n"
-        "30:" // 4 rounds: Partial writeback: partial_2_40
-        "tbz x20, #1, 31f\n"
-        "st1 { v12.h }[4], [x21], #0x2\n"
-        "tbz x20, #0, 52f\n"
-        "st1 { v12.b }[10], [x21]\n"
-        "b 52f\n"
-        "31:" // 4 rounds: Partial writeback: partial_1_40
-        "tbz x20, #0, 52f\n"
-        "st1 { v12.b }[8], [x21]\n"
-        "b 52f\n"
-        "32:" // 4 rounds: Partial writeback: partial_4_32
-        "tbz x20, #2, 34f\n"
-        "str s12, [x21], #0x4\n"
-        "tbz x20, #1, 33f\n"
-        "st1 { v12.h }[2], [x21], #0x2\n"
-        "tbz x20, #0, 52f\n"
-        "st1 { v12.b }[6], [x21]\n"
-        "b 52f\n"
-        "33:" // 4 rounds: Partial writeback: partial_1_36
-        "tbz x20, #0, 52f\n"
-        "st1 { v12.b }[4], [x21]\n"
-        "b 52f\n"
-        "34:" // 4 rounds: Partial writeback: partial_2_32
-        "tbz x20, #1, 35f\n"
-        "str h12, [x21], #0x2\n"
-        "tbz x20, #0, 52f\n"
-        "st1 { v12.b }[2], [x21]\n"
-        "b 52f\n"
-        "35:" // 4 rounds: Partial writeback: partial_1_32
-        "tbz x20, #0, 52f\n"
-        "str b12, [x21, #0x0]\n"
-        "b 52f\n"
-        "36:" // 4 rounds: Partial writeback: partial_16_0
-        "tbz x20, #4, 44f\n"
-        "st1 { v8.16b }, [x21], #0x10\n"
-        "tbz x20, #3, 40f\n"
-        "str d13, [x21], #0x8\n"
-        "tbz x20, #2, 38f\n"
-        "st1 { v13.s }[2], [x21], #0x4\n"
-        "tbz x20, #1, 37f\n"
-        "st1 { v13.h }[6], [x21], #0x2\n"
-        "tbz x20, #0, 52f\n"
-        "st1 { v13.b }[14], [x21]\n"
-        "b 52f\n"
-        "37:" // 4 rounds: Partial writeback: partial_1_28
-        "tbz x20, #0, 52f\n"
-        "st1 { v13.b }[12], [x21]\n"
-        "b 52f\n"
-        "38:" // 4 rounds: Partial writeback: partial_2_24
-        "tbz x20, #1, 39f\n"
-        "st1 { v13.h }[4], [x21], #0x2\n"
-        "tbz x20, #0, 52f\n"
-        "st1 { v13.b }[10], [x21]\n"
-        "b 52f\n"
-        "39:" // 4 rounds: Partial writeback: partial_1_24
-        "tbz x20, #0, 52f\n"
-        "st1 { v13.b }[8], [x21]\n"
-        "b 52f\n"
-        "40:" // 4 rounds: Partial writeback: partial_4_16
-        "tbz x20, #2, 42f\n"
-        "str s13, [x21], #0x4\n"
-        "tbz x20, #1, 41f\n"
-        "st1 { v13.h }[2], [x21], #0x2\n"
-        "tbz x20, #0, 52f\n"
-        "st1 { v13.b }[6], [x21]\n"
-        "b 52f\n"
-        "41:" // 4 rounds: Partial writeback: partial_1_20
-        "tbz x20, #0, 52f\n"
-        "st1 { v13.b }[4], [x21]\n"
-        "b 52f\n"
-        "42:" // 4 rounds: Partial writeback: partial_2_16
-        "tbz x20, #1, 43f\n"
-        "str h13, [x21], #0x2\n"
-        "tbz x20, #0, 52f\n"
-        "st1 { v13.b }[2], [x21]\n"
-        "b 52f\n"
-        "43:" // 4 rounds: Partial writeback: partial_1_16
-        "tbz x20, #0, 52f\n"
-        "str b13, [x21, #0x0]\n"
-        "b 52f\n"
-        "44:" // 4 rounds: Partial writeback: partial_8_0
-        "tbz x20, #3, 48f\n"
-        "str d8, [x21], #0x8\n"
-        "tbz x20, #2, 46f\n"
-        "st1 { v8.s }[2], [x21], #0x4\n"
-        "tbz x20, #1, 45f\n"
-        "st1 { v8.h }[6], [x21], #0x2\n"
-        "tbz x20, #0, 52f\n"
-        "st1 { v8.b }[14], [x21]\n"
-        "b 52f\n"
-        "45:" // 4 rounds: Partial writeback: partial_1_12
-        "tbz x20, #0, 52f\n"
-        "st1 { v8.b }[12], [x21]\n"
-        "b 52f\n"
-        "46:" // 4 rounds: Partial writeback: partial_2_8
-        "tbz x20, #1, 47f\n"
-        "st1 { v8.h }[4], [x21], #0x2\n"
-        "tbz x20, #0, 52f\n"
-        "st1 { v8.b }[10], [x21]\n"
-        "b 52f\n"
-        "47:" // 4 rounds: Partial writeback: partial_1_8
-        "tbz x20, #0, 52f\n"
-        "st1 { v8.b }[8], [x21]\n"
-        "b 52f\n"
-        "48:" // 4 rounds: Partial writeback: partial_4_0
-        "tbz x20, #2, 50f\n"
-        "str s8, [x21], #0x4\n"
-        "tbz x20, #1, 49f\n"
-        "st1 { v8.h }[2], [x21], #0x2\n"
-        "tbz x20, #0, 52f\n"
-        "st1 { v8.b }[6], [x21]\n"
-        "b 52f\n"
-        "49:" // 4 rounds: Partial writeback: partial_1_4
-        "tbz x20, #0, 52f\n"
-        "st1 { v8.b }[4], [x21]\n"
-        "b 52f\n"
-        "50:" // 4 rounds: Partial writeback: partial_2_0
-        "tbz x20, #1, 51f\n"
-        "str h8, [x21], #0x2\n"
-        "tbz x20, #0, 52f\n"
-        "st1 { v8.b }[2], [x21]\n"
-        "b 52f\n"
-        "51:" // 4 rounds: Partial writeback: partial_1_0
-        "str b8, [x21, #0x0]\n"
-        "52:" // 4 rounds: Partial writeback: Done
-        "b 54f\n"
-        "53:" // 4 rounds: Full writeback
-        "str q8, [x21, #0x0]\n"
-        "str q13, [x21, #0x10]\n"
-        "str q12, [x21, #0x20]\n"
-        "add x21, x21, #0x30\n"
-        "54:" // 4 rounds: Writeback done
-        "subs x20, x20, #0x30\n"
-        "bgt 2b\n"
-        "add x23, x23, #0x1\n"
-        "cmp x23, %x[num_strings]\n"
-        "bne 1b\n"
-        :
-        : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length), [table] "r"(table)
-        : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11", "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26", "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23");
+    __asm__ __volatile__("ldr q16, [%x[table], #0x0]\n"
+                         "ldr q17, [%x[table], #0x10]\n"
+                         "mov x23, #0x0\n"
+                         "ldr q18, [%x[table], #0x20]\n"
+                         "ldr q19, [%x[table], #0x30]\n"
+                         "ldr q20, [%x[table], #0x40]\n"
+                         "ldr q21, [%x[table], #0x50]\n"
+                         "ldr q22, [%x[table], #0x60]\n"
+                         "ldr q23, [%x[table], #0x70]\n"
+                         "ldr q24, [%x[table], #0x80]\n"
+                         "ldr q25, [%x[table], #0x90]\n"
+                         "ldr q26, [%x[table], #0xa0]\n"
+                         "ldr q27, [%x[table], #0xb0]\n"
+                         "ldr q28, [%x[table], #0xc0]\n"
+                         "ldr q29, [%x[table], #0xd0]\n"
+                         "ldr q30, [%x[table], #0xe0]\n"
+                         "ldr q31, [%x[table], #0xf0]\n"
+                         "1:" // string loop
+                         "ldr x22, [%x[input], x23, LSL #0x3]\n"
+                         "ldr x21, [%x[output], x23, LSL #0x3]\n"
+                         "movi v11.16b, #0x40\n"
+                         "movi v10.16b, #0x80\n"
+                         "movi v9.16b, #0xc0\n"
+                         "mov x20, %x[string_length]\n"
+                         "2:" // 4 rounds: width loop
+                         "cmp x20, #0x30\n"
+                         "bge 27f\n"
+                         "tbz x20, #5, 10f\n"
+                         "ld1 { v8.16b }, [x22], #0x10\n"
+                         "ld1 { v13.16b }, [x22], #0x10\n"
+                         "tbz x20, #3, 6f\n"
+                         "ldr d12, [x22], #0x8\n"
+                         "tbz x20, #2, 4f\n"
+                         "ld1 { v12.s }[2], [x22], #0x4\n"
+                         "tbz x20, #1, 3f\n"
+                         "ld1 { v12.h }[6], [x22], #0x2\n"
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v12.b }[14], [x22]\n"
+                         "b 26f\n"
+                         "3:" // 4 rounds: Partial load: partial_1_44
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v12.b }[12], [x22]\n"
+                         "b 26f\n"
+                         "4:" // 4 rounds: Partial load: partial_2_40
+                         "tbz x20, #1, 5f\n"
+                         "ld1 { v12.h }[4], [x22], #0x2\n"
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v12.b }[10], [x22]\n"
+                         "b 26f\n"
+                         "5:" // 4 rounds: Partial load: partial_1_40
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v12.b }[8], [x22]\n"
+                         "b 26f\n"
+                         "6:" // 4 rounds: Partial load: partial_4_32
+                         "tbz x20, #2, 8f\n"
+                         "ldr s12, [x22], #0x4\n"
+                         "tbz x20, #1, 7f\n"
+                         "ld1 { v12.h }[2], [x22], #0x2\n"
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v12.b }[6], [x22]\n"
+                         "b 26f\n"
+                         "7:" // 4 rounds: Partial load: partial_1_36
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v12.b }[4], [x22]\n"
+                         "b 26f\n"
+                         "8:" // 4 rounds: Partial load: partial_2_32
+                         "tbz x20, #1, 9f\n"
+                         "ldr h12, [x22], #0x2\n"
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v12.b }[2], [x22]\n"
+                         "b 26f\n"
+                         "9:" // 4 rounds: Partial load: partial_1_32
+                         "tbz x20, #0, 26f\n"
+                         "ldr b12, [x22, #0x0]\n"
+                         "b 26f\n"
+                         "10:" // 4 rounds: Partial load: partial_16_0
+                         "tbz x20, #4, 18f\n"
+                         "ld1 { v8.16b }, [x22], #0x10\n"
+                         "tbz x20, #3, 14f\n"
+                         "ldr d13, [x22], #0x8\n"
+                         "tbz x20, #2, 12f\n"
+                         "ld1 { v13.s }[2], [x22], #0x4\n"
+                         "tbz x20, #1, 11f\n"
+                         "ld1 { v13.h }[6], [x22], #0x2\n"
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v13.b }[14], [x22]\n"
+                         "b 26f\n"
+                         "11:" // 4 rounds: Partial load: partial_1_28
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v13.b }[12], [x22]\n"
+                         "b 26f\n"
+                         "12:" // 4 rounds: Partial load: partial_2_24
+                         "tbz x20, #1, 13f\n"
+                         "ld1 { v13.h }[4], [x22], #0x2\n"
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v13.b }[10], [x22]\n"
+                         "b 26f\n"
+                         "13:" // 4 rounds: Partial load: partial_1_24
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v13.b }[8], [x22]\n"
+                         "b 26f\n"
+                         "14:" // 4 rounds: Partial load: partial_4_16
+                         "tbz x20, #2, 16f\n"
+                         "ldr s13, [x22], #0x4\n"
+                         "tbz x20, #1, 15f\n"
+                         "ld1 { v13.h }[2], [x22], #0x2\n"
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v13.b }[6], [x22]\n"
+                         "b 26f\n"
+                         "15:" // 4 rounds: Partial load: partial_1_20
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v13.b }[4], [x22]\n"
+                         "b 26f\n"
+                         "16:" // 4 rounds: Partial load: partial_2_16
+                         "tbz x20, #1, 17f\n"
+                         "ldr h13, [x22], #0x2\n"
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v13.b }[2], [x22]\n"
+                         "b 26f\n"
+                         "17:" // 4 rounds: Partial load: partial_1_16
+                         "tbz x20, #0, 26f\n"
+                         "ldr b13, [x22, #0x0]\n"
+                         "b 26f\n"
+                         "18:" // 4 rounds: Partial load: partial_8_0
+                         "tbz x20, #3, 22f\n"
+                         "ldr d8, [x22], #0x8\n"
+                         "tbz x20, #2, 20f\n"
+                         "ld1 { v8.s }[2], [x22], #0x4\n"
+                         "tbz x20, #1, 19f\n"
+                         "ld1 { v8.h }[6], [x22], #0x2\n"
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v8.b }[14], [x22]\n"
+                         "b 26f\n"
+                         "19:" // 4 rounds: Partial load: partial_1_12
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v8.b }[12], [x22]\n"
+                         "b 26f\n"
+                         "20:" // 4 rounds: Partial load: partial_2_8
+                         "tbz x20, #1, 21f\n"
+                         "ld1 { v8.h }[4], [x22], #0x2\n"
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v8.b }[10], [x22]\n"
+                         "b 26f\n"
+                         "21:" // 4 rounds: Partial load: partial_1_8
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v8.b }[8], [x22]\n"
+                         "b 26f\n"
+                         "22:" // 4 rounds: Partial load: partial_4_0
+                         "tbz x20, #2, 24f\n"
+                         "ldr s8, [x22], #0x4\n"
+                         "tbz x20, #1, 23f\n"
+                         "ld1 { v8.h }[2], [x22], #0x2\n"
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v8.b }[6], [x22]\n"
+                         "b 26f\n"
+                         "23:" // 4 rounds: Partial load: partial_1_4
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v8.b }[4], [x22]\n"
+                         "b 26f\n"
+                         "24:" // 4 rounds: Partial load: partial_2_0
+                         "tbz x20, #1, 25f\n"
+                         "ldr h8, [x22], #0x2\n"
+                         "tbz x20, #0, 26f\n"
+                         "ld1 { v8.b }[2], [x22]\n"
+                         "b 26f\n"
+                         "25:" // 4 rounds: Partial load: partial_1_0
+                         "ldr b8, [x22, #0x0]\n"
+                         "26:" // 4 rounds: Partial load: Done
+                         "b 28f\n"
+                         "27:" // 4 rounds: Full load
+                         "ldr q8, [x22, #0x0]\n"
+                         "ldr q13, [x22, #0x10]\n"
+                         "ldr q12, [x22, #0x20]\n"
+                         "add x22, x22, #0x30\n"
+                         "28:" // 4 rounds: Load done
+                         "sub v0.16b, v8.16b, v11.16b\n"
+                         "sub v7.16b, v8.16b, v10.16b\n"
+                         "tbl v0.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v0.16b\n"
+                         "sub v6.16b, v8.16b, v9.16b\n"
+                         "sub v5.16b, v13.16b, v11.16b\n"
+                         "tbl v8.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v8.16b\n"
+                         "sub v4.16b, v13.16b, v10.16b\n"
+                         "sub v3.16b, v13.16b, v9.16b\n"
+                         "tbl v7.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v7.16b\n"
+                         "sub v2.16b, v12.16b, v11.16b\n"
+                         "sub v1.16b, v12.16b, v10.16b\n"
+                         "tbl v6.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v6.16b\n"
+                         "tbl v13.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v13.16b\n"
+                         "tbl v5.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v5.16b\n"
+                         "orr v8.16b, v8.16b, v0.16b\n"
+                         "sub v0.16b, v12.16b, v9.16b\n"
+                         "tbl v4.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v4.16b\n"
+                         "tbl v3.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v3.16b\n"
+                         "tbl v12.16b, { v16.16b, v17.16b, v18.16b, v19.16b }, v12.16b\n"
+                         "tbl v2.16b, { v20.16b, v21.16b, v22.16b, v23.16b }, v2.16b\n"
+                         "orr v7.16b, v7.16b, v6.16b\n"
+                         "tbl v1.16b, { v24.16b, v25.16b, v26.16b, v27.16b }, v1.16b\n"
+                         "tbl v0.16b, { v28.16b, v29.16b, v30.16b, v31.16b }, v0.16b\n"
+                         "orr v13.16b, v13.16b, v5.16b\n"
+                         "orr v4.16b, v4.16b, v3.16b\n"
+                         "orr v12.16b, v12.16b, v2.16b\n"
+                         "cmp x20, #0x30\n"
+                         "orr v1.16b, v1.16b, v0.16b\n"
+                         "orr v8.16b, v8.16b, v7.16b\n"
+                         "orr v13.16b, v13.16b, v4.16b\n"
+                         "orr v12.16b, v12.16b, v1.16b\n"
+                         "bge 53f\n"
+                         "tbz x20, #5, 36f\n"
+                         "st1 { v8.16b }, [x21], #0x10\n"
+                         "st1 { v13.16b }, [x21], #0x10\n"
+                         "tbz x20, #3, 32f\n"
+                         "str d12, [x21], #0x8\n"
+                         "tbz x20, #2, 30f\n"
+                         "st1 { v12.s }[2], [x21], #0x4\n"
+                         "tbz x20, #1, 29f\n"
+                         "st1 { v12.h }[6], [x21], #0x2\n"
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v12.b }[14], [x21]\n"
+                         "b 52f\n"
+                         "29:" // 4 rounds: Partial writeback: partial_1_44
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v12.b }[12], [x21]\n"
+                         "b 52f\n"
+                         "30:" // 4 rounds: Partial writeback: partial_2_40
+                         "tbz x20, #1, 31f\n"
+                         "st1 { v12.h }[4], [x21], #0x2\n"
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v12.b }[10], [x21]\n"
+                         "b 52f\n"
+                         "31:" // 4 rounds: Partial writeback: partial_1_40
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v12.b }[8], [x21]\n"
+                         "b 52f\n"
+                         "32:" // 4 rounds: Partial writeback: partial_4_32
+                         "tbz x20, #2, 34f\n"
+                         "str s12, [x21], #0x4\n"
+                         "tbz x20, #1, 33f\n"
+                         "st1 { v12.h }[2], [x21], #0x2\n"
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v12.b }[6], [x21]\n"
+                         "b 52f\n"
+                         "33:" // 4 rounds: Partial writeback: partial_1_36
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v12.b }[4], [x21]\n"
+                         "b 52f\n"
+                         "34:" // 4 rounds: Partial writeback: partial_2_32
+                         "tbz x20, #1, 35f\n"
+                         "str h12, [x21], #0x2\n"
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v12.b }[2], [x21]\n"
+                         "b 52f\n"
+                         "35:" // 4 rounds: Partial writeback: partial_1_32
+                         "tbz x20, #0, 52f\n"
+                         "str b12, [x21, #0x0]\n"
+                         "b 52f\n"
+                         "36:" // 4 rounds: Partial writeback: partial_16_0
+                         "tbz x20, #4, 44f\n"
+                         "st1 { v8.16b }, [x21], #0x10\n"
+                         "tbz x20, #3, 40f\n"
+                         "str d13, [x21], #0x8\n"
+                         "tbz x20, #2, 38f\n"
+                         "st1 { v13.s }[2], [x21], #0x4\n"
+                         "tbz x20, #1, 37f\n"
+                         "st1 { v13.h }[6], [x21], #0x2\n"
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v13.b }[14], [x21]\n"
+                         "b 52f\n"
+                         "37:" // 4 rounds: Partial writeback: partial_1_28
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v13.b }[12], [x21]\n"
+                         "b 52f\n"
+                         "38:" // 4 rounds: Partial writeback: partial_2_24
+                         "tbz x20, #1, 39f\n"
+                         "st1 { v13.h }[4], [x21], #0x2\n"
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v13.b }[10], [x21]\n"
+                         "b 52f\n"
+                         "39:" // 4 rounds: Partial writeback: partial_1_24
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v13.b }[8], [x21]\n"
+                         "b 52f\n"
+                         "40:" // 4 rounds: Partial writeback: partial_4_16
+                         "tbz x20, #2, 42f\n"
+                         "str s13, [x21], #0x4\n"
+                         "tbz x20, #1, 41f\n"
+                         "st1 { v13.h }[2], [x21], #0x2\n"
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v13.b }[6], [x21]\n"
+                         "b 52f\n"
+                         "41:" // 4 rounds: Partial writeback: partial_1_20
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v13.b }[4], [x21]\n"
+                         "b 52f\n"
+                         "42:" // 4 rounds: Partial writeback: partial_2_16
+                         "tbz x20, #1, 43f\n"
+                         "str h13, [x21], #0x2\n"
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v13.b }[2], [x21]\n"
+                         "b 52f\n"
+                         "43:" // 4 rounds: Partial writeback: partial_1_16
+                         "tbz x20, #0, 52f\n"
+                         "str b13, [x21, #0x0]\n"
+                         "b 52f\n"
+                         "44:" // 4 rounds: Partial writeback: partial_8_0
+                         "tbz x20, #3, 48f\n"
+                         "str d8, [x21], #0x8\n"
+                         "tbz x20, #2, 46f\n"
+                         "st1 { v8.s }[2], [x21], #0x4\n"
+                         "tbz x20, #1, 45f\n"
+                         "st1 { v8.h }[6], [x21], #0x2\n"
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v8.b }[14], [x21]\n"
+                         "b 52f\n"
+                         "45:" // 4 rounds: Partial writeback: partial_1_12
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v8.b }[12], [x21]\n"
+                         "b 52f\n"
+                         "46:" // 4 rounds: Partial writeback: partial_2_8
+                         "tbz x20, #1, 47f\n"
+                         "st1 { v8.h }[4], [x21], #0x2\n"
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v8.b }[10], [x21]\n"
+                         "b 52f\n"
+                         "47:" // 4 rounds: Partial writeback: partial_1_8
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v8.b }[8], [x21]\n"
+                         "b 52f\n"
+                         "48:" // 4 rounds: Partial writeback: partial_4_0
+                         "tbz x20, #2, 50f\n"
+                         "str s8, [x21], #0x4\n"
+                         "tbz x20, #1, 49f\n"
+                         "st1 { v8.h }[2], [x21], #0x2\n"
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v8.b }[6], [x21]\n"
+                         "b 52f\n"
+                         "49:" // 4 rounds: Partial writeback: partial_1_4
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v8.b }[4], [x21]\n"
+                         "b 52f\n"
+                         "50:" // 4 rounds: Partial writeback: partial_2_0
+                         "tbz x20, #1, 51f\n"
+                         "str h8, [x21], #0x2\n"
+                         "tbz x20, #0, 52f\n"
+                         "st1 { v8.b }[2], [x21]\n"
+                         "b 52f\n"
+                         "51:" // 4 rounds: Partial writeback: partial_1_0
+                         "str b8, [x21, #0x0]\n"
+                         "52:" // 4 rounds: Partial writeback: Done
+                         "b 54f\n"
+                         "53:" // 4 rounds: Full writeback
+                         "str q8, [x21, #0x0]\n"
+                         "str q13, [x21, #0x10]\n"
+                         "str q12, [x21, #0x20]\n"
+                         "add x21, x21, #0x30\n"
+                         "54:" // 4 rounds: Writeback done
+                         "subs x20, x20, #0x30\n"
+                         "bgt 2b\n"
+                         "add x23, x23, #0x1\n"
+                         "cmp x23, %x[num_strings]\n"
+                         "bne 1b\n"
+                         :
+                         : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output),
+                           [string_length] "r"(string_length), [table] "r"(table)
+                         : "cc", "memory", "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11",
+                           "v12", "v13", "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23", "v24", "v25", "v26",
+                           "v27", "v28", "v29", "v30", "v31", "x20", "x21", "x22", "x23");
 }
 
 #endif // __aarch64__
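
Note: the lut_u8_* kernels above implement a byte-wise table lookup, where each output byte is the table entry selected by the corresponding input byte. A minimal scalar reference with the same signature as in this diff (a sketch for orientation, not part of the library) could look like:

    #include <cstddef>
    #include <cstdint>

    // Scalar reference for the lut_u8_* kernels above: each output byte is
    // the 256-entry table value indexed by the matching input byte.
    void lut_u8_scalar(
        const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output)
    {
        for (size_t s = 0; s < num_strings; ++s)
        {
            for (size_t i = 0; i < string_length; ++i)
            {
                output[s][i] = table[input[s][i]];
            }
        }
    }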
diff --git a/src/cpu/kernels/lut/generic/sve2/u8.cpp b/src/cpu/kernels/lut/generic/sve2/u8.cpp
index b80d753..ee85727 100644
--- a/src/cpu/kernels/lut/generic/sve2/u8.cpp
+++ b/src/cpu/kernels/lut/generic/sve2/u8.cpp
@@ -32,11 +32,7 @@
 namespace cpu
 {
 void lut_u8_sve2(
-    const uint8_t        *table,
-    size_t                num_strings,
-    size_t                string_length,
-    const uint8_t *const *input,
-    uint8_t *const       *output)
+    const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, uint8_t *const *output)
 {
     __asm__ __volatile__(
         "ptrue p0.b\n"
@@ -636,7 +632,9 @@
         "bne 2b\n"
         : [table] "+&r"(table)
         : [input] "r"(input), [num_strings] "r"(num_strings), [output] "r"(output), [string_length] "r"(string_length)
-        : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1", "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21", "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
+        : "cc", "memory", "p0", "p1", "p2", "p3", "p4", "p5", "x20", "x21", "x22", "x23", "x24", "x25", "z0", "z1",
+          "z2", "z3", "z4", "z5", "z6", "z7", "z8", "z9", "z10", "z11", "z12", "z16", "z17", "z18", "z19", "z20", "z21",
+          "z22", "z23", "z24", "z25", "z26", "z27", "z28", "z29", "z30", "z31");
 }
 
 } // namespace cpu
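
Note: the hunk above only re-wraps the clobber list of a GCC extended __asm__ statement; the outputs/inputs/clobbers sections themselves are unchanged. For reference, the general ": outputs : inputs : clobbers" layout being re-wrapped looks like this minimal standalone AArch64 sketch (not library code):

    #include <cstdint>

    // "adds" sets the NZCV flags, hence the "cc" clobber, mirroring the
    // clobber list formatting in the hunk above.
    static inline int64_t add_asm(int64_t a, int64_t b)
    {
        int64_t res;
        __asm__ __volatile__("adds %x[r], %x[a], %x[b]\n"
                             : [r] "=r"(res)
                             : [a] "r"(a), [b] "r"(b)
                             : "cc");
        return res;
    }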
diff --git a/src/cpu/kernels/lut/list.h b/src/cpu/kernels/lut/list.h
index 7a2afc6..da90346 100644
--- a/src/cpu/kernels/lut/list.h
+++ b/src/cpu/kernels/lut/list.h
@@ -34,13 +34,9 @@
 {
 
 #ifdef __aarch64__
-#define DECLARE_LUT_KERNEL(func_name) \
-    void func_name( \
-        const uint8_t        *table, \
-        size_t                num_strings, \
-        size_t                string_length, \
-        const uint8_t *const *input, \
-        uint8_t *const       *output)
+#define DECLARE_LUT_KERNEL(func_name)                                                                           \
+    void func_name(const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input, \
+                   uint8_t *const *output)
 
 DECLARE_LUT_KERNEL(lut_u8_neon);
 DECLARE_LUT_KERNEL(lut_u8_sve2);
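
Note: both invocations still expand to the same prototype as before; only the line breaks inside the macro body change. For example, DECLARE_LUT_KERNEL(lut_u8_neon); now expands to:

    void lut_u8_neon(const uint8_t *table, size_t num_strings, size_t string_length, const uint8_t *const *input,
                     uint8_t *const *output);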
diff --git a/src/cpu/kernels/maxunpool/generic/neon/impl.h b/src/cpu/kernels/maxunpool/generic/neon/impl.h
index 5fe19c4..73a5b86 100644
--- a/src/cpu/kernels/maxunpool/generic/neon/impl.h
+++ b/src/cpu/kernels/maxunpool/generic/neon/impl.h
@@ -25,6 +25,7 @@
 #define ACL_SRC_CPU_KERNELS_MAXUNPOOL_GENERIC_NEON_IMPL_H
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 namespace arm_compute
 {
@@ -37,13 +38,15 @@
     Iterator  indices_itr(indices, window);
     auto      out_ptr      = reinterpret_cast<T *>(output->buffer());
     const int out_stride_w = static_cast<int>(output->info()->strides_in_bytes()[3]);
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        auto vindices                                         = reinterpret_cast<uint32_t *>(indices_itr.ptr());
-        auto vinput                                           = reinterpret_cast<T *>(input_itr.ptr());
-        out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
-    },
-    input_itr, indices_itr);
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
+            auto vindices                                         = reinterpret_cast<uint32_t *>(indices_itr.ptr());
+            auto vinput                                           = reinterpret_cast<T *>(input_itr.ptr());
+            out_ptr[id[3] * out_stride_w / sizeof(T) + *vindices] = *vinput;
+        },
+        input_itr, indices_itr);
 }
 } // namespace cpu
 } // namespace arm_compute
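
Note: the pattern touched throughout these hunks is execute_window_loop(window, lambda, iterators...); clang-format 14 now places the lambda body on its own indented block instead of hanging it off the first argument. A generic standalone illustration of that layout (a stand-in callback loop, not the ACL API):

    #include <iostream>

    // Stand-in for execute_window_loop: runs a callback once per step.
    template <typename Fn>
    void for_each_step(int steps, Fn &&fn)
    {
        for (int i = 0; i < steps; ++i)
        {
            fn(i);
        }
    }

    int main()
    {
        for_each_step(
            3,
            [&](int i)
            {
                std::cout << "step " << i << '\n';
            });
        return 0;
    }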
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
index 96e4030..6470f39 100644
--- a/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/fp16.cpp
@@ -23,9 +23,9 @@
  */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 
-#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
 #include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/CpuTypes.h"
+#include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
 
 namespace arm_compute
 {
@@ -45,64 +45,66 @@
     Iterator input_itr(input, win);
     Iterator output_itr(output, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int  x       = window_start_x;
-        auto in_ptr  = reinterpret_cast<const float16_t *>(input_itr.ptr());
-        auto out_ptr = reinterpret_cast<float16_t *>(output_itr.ptr());
-
-        float16x8_t sum_vec    = vdupq_n_f16(static_cast<float16_t>(0.0f));
-        float32x4_t sum_sq_vec = vdupq_n_f32(0.0f);
-
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            float16x8_t data = vld1q_f16(in_ptr + x);
-            sum_vec          = vaddq_f16(sum_vec, data);
-            float32x4_t dl   = vcvt_f32_f16(vget_low_f16(data));
-            float32x4_t dh   = vcvt_f32_f16(vget_high_f16(data));
-            sum_sq_vec       = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl));
-            sum_sq_vec       = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh));
-        }
+            int  x       = window_start_x;
+            auto in_ptr  = reinterpret_cast<const float16_t *>(input_itr.ptr());
+            auto out_ptr = reinterpret_cast<float16_t *>(output_itr.ptr());
 
-        float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec));
-        sum_carry_res             = vpadd_f16(sum_carry_res, sum_carry_res);
-        sum_carry_res             = vpadd_f16(sum_carry_res, sum_carry_res);
+            float16x8_t sum_vec    = vdupq_n_f16(static_cast<float16_t>(0.0f));
+            float32x4_t sum_sq_vec = vdupq_n_f32(0.0f);
 
-        float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec);
-        sum_sq_carry_res             = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res);
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                float16x8_t data = vld1q_f16(in_ptr + x);
+                sum_vec          = vaddq_f16(sum_vec, data);
+                float32x4_t dl   = vcvt_f32_f16(vget_low_f16(data));
+                float32x4_t dh   = vcvt_f32_f16(vget_high_f16(data));
+                sum_sq_vec       = vaddq_f32(sum_sq_vec, vmulq_f32(dl, dl));
+                sum_sq_vec       = vaddq_f32(sum_sq_vec, vmulq_f32(dh, dh));
+            }
 
-        float16_t sum    = vget_lane_f16(sum_carry_res, 0);
-        float     sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0);
+            float16x4_t sum_carry_res = vpadd_f16(vget_high_f16(sum_vec), vget_low_f16(sum_vec));
+            sum_carry_res             = vpadd_f16(sum_carry_res, sum_carry_res);
+            sum_carry_res             = vpadd_f16(sum_carry_res, sum_carry_res);
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            float16_t data = *(in_ptr + x);
-            sum += data;
-            float fdata = static_cast<float>(data);
-            sum_sq += fdata * fdata;
-        }
+            float32x4_t sum_sq_carry_res = vpaddq_f32(sum_sq_vec, sum_sq_vec);
+            sum_sq_carry_res             = vpaddq_f32(sum_sq_carry_res, sum_sq_carry_res);
 
-        float16_t mean       = sum / input->info()->dimension(0);
-        float     var        = (sum_sq / input->info()->dimension(0)) - (mean * mean);
-        float16_t stddev_inv = static_cast<float16_t>(1.f / sqrt(var + epsilon));
+            float16_t sum    = vget_lane_f16(sum_carry_res, 0);
+            float     sum_sq = vgetq_lane_f32(sum_sq_carry_res, 0);
 
-        float16x8_t mean_vec       = vdupq_n_f16(mean);
-        float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv);
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                float16_t data = *(in_ptr + x);
+                sum += data;
+                float fdata = static_cast<float>(data);
+                sum_sq += fdata * fdata;
+            }
 
-        for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            float16x8_t data = vld1q_f16(in_ptr + x);
-            float16x8_t res  = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec);
-            // Store results
-            vst1q_f16(out_ptr + x, res);
-        }
-        for(; x < window_end_x; ++x)
-        {
-            *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
-        }
-    },
-    input_itr, output_itr);
+            float16_t mean       = sum / input->info()->dimension(0);
+            float     var        = (sum_sq / input->info()->dimension(0)) - (mean * mean);
+            float16_t stddev_inv = static_cast<float16_t>(1.f / sqrt(var + epsilon));
+
+            float16x8_t mean_vec       = vdupq_n_f16(mean);
+            float16x8_t stddev_inv_vec = vdupq_n_f16(stddev_inv);
+
+            for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                float16x8_t data = vld1q_f16(in_ptr + x);
+                float16x8_t res  = vmulq_f16(vsubq_f16(data, mean_vec), stddev_inv_vec);
+                // Store results
+                vst1q_f16(out_ptr + x, res);
+            }
+            for (; x < window_end_x; ++x)
+            {
+                *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
+            }
+        },
+        input_itr, output_itr);
 }
 
 void neon_fp16_meanstddevnorm(ITensor *input, ITensor *output, float epsilon, const Window &window)
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
index 0522d6e..11f6294 100644
--- a/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/impl.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "src/cpu/kernels/meanstddevnorm/generic/neon/impl.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
@@ -45,60 +46,62 @@
     Iterator input_itr(input, win);
     Iterator output_itr(output, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int  x       = window_start_x;
-        auto in_ptr  = reinterpret_cast<const ScalarType *>(input_itr.ptr());
-        auto out_ptr = reinterpret_cast<ScalarType *>(output_itr.ptr());
-
-        auto sum_vec    = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
-        auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
-
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            auto data  = wrapper::vloadq(in_ptr + x);
-            sum_vec    = wrapper::vadd(sum_vec, data);
-            sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
-        }
+            int  x       = window_start_x;
+            auto in_ptr  = reinterpret_cast<const ScalarType *>(input_itr.ptr());
+            auto out_ptr = reinterpret_cast<ScalarType *>(output_itr.ptr());
 
-        auto sum_carry_res    = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
-        auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
-        for(int i = 0; i < size / 4; ++i)
-        {
-            sum_carry_res    = wrapper::vpadd(sum_carry_res, sum_carry_res);
-            sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
-        }
+            auto sum_vec    = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
+            auto sum_sq_vec = wrapper::vdup_n(static_cast<ScalarType>(0.f), ExactTagType{});
 
-        auto sum    = wrapper::vgetlane(sum_carry_res, 0);
-        auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                auto data  = wrapper::vloadq(in_ptr + x);
+                sum_vec    = wrapper::vadd(sum_vec, data);
+                sum_sq_vec = wrapper::vadd(sum_sq_vec, wrapper::vmul(data, data));
+            }
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            ScalarType data = *(in_ptr + x);
-            sum += data;
-            sum_sq += data * data;
-        }
+            auto sum_carry_res    = wrapper::vpadd(wrapper::vgethigh(sum_vec), wrapper::vgetlow(sum_vec));
+            auto sum_sq_carry_res = wrapper::vpadd(wrapper::vgethigh(sum_sq_vec), wrapper::vgetlow(sum_sq_vec));
+            for (int i = 0; i < size / 4; ++i)
+            {
+                sum_carry_res    = wrapper::vpadd(sum_carry_res, sum_carry_res);
+                sum_sq_carry_res = wrapper::vpadd(sum_sq_carry_res, sum_sq_carry_res);
+            }
 
-        ScalarType mean       = sum / input->info()->dimension(0);
-        ScalarType var        = (sum_sq / input->info()->dimension(0)) - (mean * mean);
-        ScalarType stddev_inv = 1.f / sqrt(var + epsilon);
+            auto sum    = wrapper::vgetlane(sum_carry_res, 0);
+            auto sum_sq = wrapper::vgetlane(sum_sq_carry_res, 0);
 
-        auto mean_vec       = wrapper::vdup_n(mean, ExactTagType{});
-        auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
-        for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            auto data = wrapper::vloadq(in_ptr + x);
-            auto res  = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
-            // Store results
-            wrapper::vstore(out_ptr + x, res);
-        }
-        for(; x < window_end_x; ++x)
-        {
-            *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
-        }
-    },
-    input_itr, output_itr);
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                ScalarType data = *(in_ptr + x);
+                sum += data;
+                sum_sq += data * data;
+            }
+
+            ScalarType mean       = sum / input->info()->dimension(0);
+            ScalarType var        = (sum_sq / input->info()->dimension(0)) - (mean * mean);
+            ScalarType stddev_inv = 1.f / sqrt(var + epsilon);
+
+            auto mean_vec       = wrapper::vdup_n(mean, ExactTagType{});
+            auto stddev_inv_vec = wrapper::vdup_n(stddev_inv, ExactTagType{});
+            for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                auto data = wrapper::vloadq(in_ptr + x);
+                auto res  = wrapper::vmul(wrapper::vsub(data, mean_vec), stddev_inv_vec);
+                // Store results
+                wrapper::vstore(out_ptr + x, res);
+            }
+            for (; x < window_end_x; ++x)
+            {
+                *(out_ptr + x) = (*(in_ptr + x) - mean) * stddev_inv;
+            }
+        },
+        input_itr, output_itr);
 }
 template void mean_stddev_normalization<float, 4>(ITensor *input, ITensor *output, float epsilon, const Window &window);
 } // namespace cpu
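
Note: the arithmetic in mean_stddev_normalization is unchanged by the re-indentation: mean = sum / N, var = sum_sq / N - mean^2, out = (x - mean) / sqrt(var + epsilon). A scalar reference for one row (a sketch, not the vectorized kernel above):

    #include <cmath>
    #include <cstddef>

    // Scalar mirror of the per-row normalization performed above.
    void mean_stddev_normalize(const float *in, float *out, size_t n, float epsilon)
    {
        float sum = 0.f, sum_sq = 0.f;
        for (size_t i = 0; i < n; ++i)
        {
            sum += in[i];
            sum_sq += in[i] * in[i];
        }
        const float mean       = sum / n;
        const float var        = sum_sq / n - mean * mean;
        const float stddev_inv = 1.f / std::sqrt(var + epsilon);
        for (size_t i = 0; i < n; ++i)
        {
            out[i] = (in[i] - mean) * stddev_inv;
        }
    }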
diff --git a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
index 53af1e4..32654df 100644
--- a/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/meanstddevnorm/generic/neon/qasymm8.cpp
@@ -23,6 +23,7 @@
  */
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
@@ -69,77 +70,76 @@
     const float32x4_t quant_min_vec    = vdupq_n_f32(0.0f);
 
     execute_window_loop(
-        win, [&](const Coordinates &)
-    {
-        int  x       = window_start_x;
-        auto in_ptr  = reinterpret_cast<const uint8_t *>(input_itr.ptr());
-        auto out_ptr = reinterpret_cast<uint8_t *>(output_itr.ptr());
-
-        uint32x4_t sum_vec    = vdupq_n_u32(0);
-        uint32x4_t sum_sq_vec = vdupq_n_u32(0);
-
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        win,
+        [&](const Coordinates &)
         {
-            const uint8x16_t data         = vld1q_u8(in_ptr + x);
-            sum_vec                       = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data)));
-            const uint16x8_t squares_low  = vmull_u8(vget_low_u8(data), vget_low_u8(data));
-            const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data));
-            sum_sq_vec                    = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high)));
-        }
+            int  x       = window_start_x;
+            auto in_ptr  = reinterpret_cast<const uint8_t *>(input_itr.ptr());
+            auto out_ptr = reinterpret_cast<uint8_t *>(output_itr.ptr());
+
+            uint32x4_t sum_vec    = vdupq_n_u32(0);
+            uint32x4_t sum_sq_vec = vdupq_n_u32(0);
+
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const uint8x16_t data         = vld1q_u8(in_ptr + x);
+                sum_vec                       = vaddq_u32(sum_vec, vpaddlq_u16(vpaddlq_u8(data)));
+                const uint16x8_t squares_low  = vmull_u8(vget_low_u8(data), vget_low_u8(data));
+                const uint16x8_t squares_high = vmull_u8(vget_high_u8(data), vget_high_u8(data));
+                sum_sq_vec = vaddq_u32(sum_sq_vec, vaddq_u32(vpaddlq_u16(squares_low), vpaddlq_u16(squares_high)));
+            }
 
 #ifdef __aarch64__
-        sum_vec         = vpaddq_u32(sum_vec, sum_vec);
-        sum_vec         = vpaddq_u32(sum_vec, sum_vec);
-        uint32_t sum    = vgetq_lane_u32(sum_vec, 0);
-        sum_sq_vec      = vpaddq_u32(sum_sq_vec, sum_sq_vec);
-        sum_sq_vec      = vpaddq_u32(sum_sq_vec, sum_sq_vec);
-        uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0);
+            sum_vec         = vpaddq_u32(sum_vec, sum_vec);
+            sum_vec         = vpaddq_u32(sum_vec, sum_vec);
+            uint32_t sum    = vgetq_lane_u32(sum_vec, 0);
+            sum_sq_vec      = vpaddq_u32(sum_sq_vec, sum_sq_vec);
+            sum_sq_vec      = vpaddq_u32(sum_sq_vec, sum_sq_vec);
+            uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0);
 #elif __arm__ // #ifdef __aarch64__
-        uint32_t sum =  vgetq_lane_u32(sum_vec, 0) +
-                        vgetq_lane_u32(sum_vec, 1) +
-                        vgetq_lane_u32(sum_vec, 2) +
-                        vgetq_lane_u32(sum_vec, 3);
+            uint32_t sum = vgetq_lane_u32(sum_vec, 0) + vgetq_lane_u32(sum_vec, 1) + vgetq_lane_u32(sum_vec, 2) +
+                           vgetq_lane_u32(sum_vec, 3);
 
-        uint32_t sum_sq =   vgetq_lane_u32(sum_sq_vec, 0) +
-                            vgetq_lane_u32(sum_sq_vec, 1) +
-                            vgetq_lane_u32(sum_sq_vec, 2) +
-                            vgetq_lane_u32(sum_sq_vec, 3);
+            uint32_t sum_sq = vgetq_lane_u32(sum_sq_vec, 0) + vgetq_lane_u32(sum_sq_vec, 1) +
+                              vgetq_lane_u32(sum_sq_vec, 2) + vgetq_lane_u32(sum_sq_vec, 3);
 #endif        // #ifdef __aarch64__
-        for(; x < window_end_x; ++x)
-        {
-            auto data = static_cast<uint32_t>(*(in_ptr + x));
-            sum += data;
-            sum_sq += (data * data);
-        }
+            for (; x < window_end_x; ++x)
+            {
+                auto data = static_cast<uint32_t>(*(in_ptr + x));
+                sum += data;
+                sum_sq += (data * data);
+            }
 
-        const float       mean      = (static_cast<float>(sum) / static_cast<float>(input->info()->dimension(0)));
-        const float       var       = (static_cast<float>(sum_sq) / static_cast<float>(input->info()->dimension(0))) - (mean * mean);
-        const float       stdev_inv = 1.0f / sqrtf(var + epsilon);
-        const float32x4_t v_scale   = vdupq_n_f32(stdev_inv * output_inv_scale);
-        const float32x4_t v_offset  = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset);
-        for(x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
-        {
-            const uint8x16_t data = vld1q_u8(in_ptr + x);
-            float32x4_t      db1  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data)))));
-            float32x4_t      db2  = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data)))));
-            float32x4_t      db3  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data)))));
-            float32x4_t      db4  = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data)))));
-            db1                   = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec);
-            db2                   = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec);
-            db3                   = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec);
-            db4                   = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec);
-            const uint8x16_t out  = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4));
-            vst1q_u8(out_ptr + x, out);
-        }
+            const float mean = (static_cast<float>(sum) / static_cast<float>(input->info()->dimension(0)));
+            const float var =
+                (static_cast<float>(sum_sq) / static_cast<float>(input->info()->dimension(0))) - (mean * mean);
+            const float       stdev_inv = 1.0f / sqrtf(var + epsilon);
+            const float32x4_t v_scale   = vdupq_n_f32(stdev_inv * output_inv_scale);
+            const float32x4_t v_offset  = vdupq_n_f32(-mean * stdev_inv * output_inv_scale + output_offset);
+            for (x = window_start_x; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const uint8x16_t data = vld1q_u8(in_ptr + x);
+                float32x4_t      db1  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_low_u8(data)))));
+                float32x4_t      db2  = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_low_u8(data)))));
+                float32x4_t      db3  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(vget_high_u8(data)))));
+                float32x4_t      db4  = vcvtq_f32_u32(vmovl_u16(vget_high_u16(vmovl_u8(vget_high_u8(data)))));
+                db1 = clamp_v4f32(vaddq_f32(vmulq_f32(db1, v_scale), v_offset), quant_min_vec, quant_max_vec);
+                db2 = clamp_v4f32(vaddq_f32(vmulq_f32(db2, v_scale), v_offset), quant_min_vec, quant_max_vec);
+                db3 = clamp_v4f32(vaddq_f32(vmulq_f32(db3, v_scale), v_offset), quant_min_vec, quant_max_vec);
+                db4 = clamp_v4f32(vaddq_f32(vmulq_f32(db4, v_scale), v_offset), quant_min_vec, quant_max_vec);
+                const uint8x16_t out = fuse_shorts_u16(fuse_words_f32(db1, db2), fuse_words_f32(db3, db4));
+                vst1q_u8(out_ptr + x, out);
+            }
 
-        for(; x < window_end_x; ++x)
-        {
-            auto          data = static_cast<float32_t>(*(in_ptr + x));
-            const uint8_t res  = data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset);
-            *(out_ptr + x)     = res;
-        }
-    },
-    input_itr, output_itr);
+            for (; x < window_end_x; ++x)
+            {
+                auto          data = static_cast<float32_t>(*(in_ptr + x));
+                const uint8_t res =
+                    data * (stdev_inv * output_inv_scale) + (-mean * stdev_inv * output_inv_scale + output_offset);
+                *(out_ptr + x) = res;
+            }
+        },
+        input_itr, output_itr);
 }
 } // namespace cpu
 } // namespace arm_compute
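
Note: the qasymm8 path folds the normalization into a single affine transform on the raw uint8 input, using the v_scale/v_offset pair computed above: out = clamp(x * stdev_inv * output_inv_scale + (-mean * stdev_inv * output_inv_scale + output_offset)). A scalar sketch of that per-element step, assuming the usual [0, 255] uint8 clamp range:

    #include <algorithm>
    #include <cstdint>

    // Scalar equivalent of the fused scale/offset applied per element in
    // the qasymm8 kernel; scale and offset are precomputed once per row.
    static uint8_t quantized_norm(uint8_t x, float scale, float offset)
    {
        const float v = static_cast<float>(x) * scale + offset;
        return static_cast<uint8_t>(std::min(std::max(v, 0.0f), 255.0f));
    }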
diff --git a/src/cpu/kernels/pool2d/neon/fp16.cpp b/src/cpu/kernels/pool2d/neon/fp16.cpp
index 4e15d3a..4af59c2 100644
--- a/src/cpu/kernels/pool2d/neon/fp16.cpp
+++ b/src/cpu/kernels/pool2d/neon/fp16.cpp
@@ -25,8 +25,9 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 #include "src/cpu/kernels/pool2d/neon/list.h"
 
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
@@ -37,7 +38,12 @@
 {
 namespace
 {
-void pooling2_f16_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_f16_maxpool_indices(const ITensor    *src,
+                                  ITensor          *dst0,
+                                  ITensor          *dst1,
+                                  PoolingLayerInfo &pool_info,
+                                  const Window     &window_src,
+                                  const Window     &window)
 {
     const int window_start_x = window.x().start();
     const int window_end_x   = window.x().end();
@@ -53,8 +59,8 @@
     const int pool_pad_top  = pool_info.pad_stride_info.pad_top();
     const int pool_pad_left = pool_info.pad_stride_info.pad_left();
 
-    int pool_stride_x = 0;
-    int pool_stride_y = 0;
+    int pool_stride_x                      = 0;
+    int pool_stride_y                      = 0;
     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
 
     const int pad_right      = src->info()->padding().right;
@@ -63,97 +69,114 @@
     const int in_stride_y    = static_cast<int>(src->info()->strides_in_bytes().y());
     const int in_stride_z    = static_cast<int>(src->info()->strides_in_bytes().z());
 
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        const int idx_width    = id.y() * pool_stride_x;
-        const int idx_height   = id.z() * pool_stride_y;
-        const int pool_limit_y = pool_pad_top - idx_height;
-        const int pool_limit_x = pool_pad_left - idx_width;
-
-        const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
-        const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
-        const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
-        const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
-                                 (src->info()->strides_in_bytes().z());
-        const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
-                                 (src->info()->strides_in_bytes().z());
-        const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
-                                 (src->info()->strides_in_bytes().z());
-
-        int x_off = window_start_x;
-        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
         {
-            const auto  in_x0_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off;
-            const auto  in_x1_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off;
-            const auto  in_x2_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off;
-            const auto  in_x3_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off;
-            const auto  v_x0      = vld1q_f16(in_x0_ptr);
-            const auto  v_x1      = vld1q_f16(in_x1_ptr);
-            const auto  v_x2      = vld1q_f16(in_x2_ptr);
-            const auto  v_x3      = vld1q_f16(in_x3_ptr);
-            float16x8_t vres      = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
-            // Store result
-            vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
+            const int idx_width    = id.y() * pool_stride_x;
+            const int idx_height   = id.z() * pool_stride_y;
+            const int pool_limit_y = pool_pad_top - idx_height;
+            const int pool_limit_x = pool_pad_left - idx_width;
 
-            const uint32_t   offset_base    = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
-            const uint32_t   offset_x0      = (uint32_t)offset_base / sizeof(float16_t) + x_off;
-            const uint32_t   offset_x1      = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
-            const uint32_t   offset_x2      = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1];
-            const uint32_t   offset_x3      = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
-            const uint32x4_t voffset_x0_0   = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
-            const uint32x4_t voffset_x0_1   = { offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7 };
-            const uint16x8_t voffset_x0     = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1));
-            const uint32x4_t voffset_x1_0   = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
-            const uint32x4_t voffset_x1_1   = { offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7 };
-            const uint16x8_t voffset_x1     = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1));
-            const uint32x4_t voffset_x2_0   = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
-            const uint32x4_t voffset_x2_1   = { offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7 };
-            const uint16x8_t voffset_x2     = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1));
-            const uint32x4_t voffset_x3_0   = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
-            const uint32x4_t voffset_x3_1   = { offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7 };
-            const uint16x8_t voffset_x3     = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1));
-            const uint16x8_t tmp_indices0   = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1);
-            const uint16x8_t tmp_indices1   = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3);
-            const uint16x8_t tmp_indices2   = vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1);
-            const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2));
-            const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2));
-            // Store indicies
-            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indeces3_0);
-            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indeces3_1);
-        }
+            const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+            const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+            const int in_x0_offset =
+                (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+            const int in_x1_offset =
+                (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+            const int in_x2_offset =
+                (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+            const int in_x3_offset =
+                (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
 
-        // Left-overs loop
-        for(; x_off < window_end_x; ++x_off)
-        {
-            const auto x0  = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off);
-            const auto x1  = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off);
-            const auto x2  = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off);
-            const auto x3  = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off);
-            float16_t  res = std::max(std::max(x2, x3), std::max(x0, x1));
+            int x_off = window_start_x;
+            for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+            {
+                const auto  in_x0_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off;
+                const auto  in_x1_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off;
+                const auto  in_x2_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off;
+                const auto  in_x3_ptr = reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off;
+                const auto  v_x0      = vld1q_f16(in_x0_ptr);
+                const auto  v_x1      = vld1q_f16(in_x1_ptr);
+                const auto  v_x2      = vld1q_f16(in_x2_ptr);
+                const auto  v_x3      = vld1q_f16(in_x3_ptr);
+                float16x8_t vres      = vmaxq_f16(vmaxq_f16(v_x2, v_x3), vmaxq_f16(v_x0, v_x1));
+                // Store result
+                vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
 
-            // Store result
-            *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
+                const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x,
+                                                                          pool_stride_y, DataLayout::NHWC);
+                const uint32_t offset_x0   = (uint32_t)offset_base / sizeof(float16_t) + x_off;
+                const uint32_t offset_x1   = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+                const uint32_t offset_x2   = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) -
+                                           pad_horizontal * src->info()->tensor_shape()[1];
+                const uint32_t   offset_x3    = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+                const uint32x4_t voffset_x0_0 = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3};
+                const uint32x4_t voffset_x0_1 = {offset_x0 + 4, offset_x0 + 5, offset_x0 + 6, offset_x0 + 7};
+                const uint16x8_t voffset_x0   = vcombine_u16(vmovn_u32(voffset_x0_0), vmovn_u32(voffset_x0_1));
+                const uint32x4_t voffset_x1_0 = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3};
+                const uint32x4_t voffset_x1_1 = {offset_x1 + 4, offset_x1 + 5, offset_x1 + 6, offset_x1 + 7};
+                const uint16x8_t voffset_x1   = vcombine_u16(vmovn_u32(voffset_x1_0), vmovn_u32(voffset_x1_1));
+                const uint32x4_t voffset_x2_0 = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3};
+                const uint32x4_t voffset_x2_1 = {offset_x2 + 4, offset_x2 + 5, offset_x2 + 6, offset_x2 + 7};
+                const uint16x8_t voffset_x2   = vcombine_u16(vmovn_u32(voffset_x2_0), vmovn_u32(voffset_x2_1));
+                const uint32x4_t voffset_x3_0 = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3};
+                const uint32x4_t voffset_x3_1 = {offset_x3 + 4, offset_x3 + 5, offset_x3 + 6, offset_x3 + 7};
+                const uint16x8_t voffset_x3   = vcombine_u16(vmovn_u32(voffset_x3_0), vmovn_u32(voffset_x3_1));
+                const uint16x8_t tmp_indices0 = vbslq_u16(vcgeq_f16(v_x0, v_x1), voffset_x0, voffset_x1);
+                const uint16x8_t tmp_indices1 = vbslq_u16(vcgeq_f16(v_x2, v_x3), voffset_x2, voffset_x3);
+                const uint16x8_t tmp_indices2 =
+                    vbslq_u16(vcgeq_f16(vmaxq_f16(v_x0, v_x1), vmaxq_f16(v_x2, v_x3)), tmp_indices0, tmp_indices1);
+                const uint32x4_t tmp_indeces3_0 = vmovl_u16(vget_low_u16(tmp_indices2));
+                const uint32x4_t tmp_indeces3_1 = vmovl_u16(vget_high_u16(tmp_indices2));
+                // Store indices
+                vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indeces3_0);
+                vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr() + 16) + x_off, tmp_indeces3_1);
+            }
 
-            const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
-            const uint32_t offset_x0   = (uint32_t)offset_base / sizeof(float16_t) + x_off;
-            const uint32_t offset_x1   = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
-            const uint32_t offset_x2   = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) - pad_horizontal * src->info()->tensor_shape()[1];
-            const uint32_t offset_x3   = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
-            const uint32_t tmp_idx0    = (x0 >= x1) ? offset_x0 : offset_x1;
-            const uint32_t tmp_idx1    = (x2 >= x3) ? offset_x2 : offset_x3;
-            const uint32_t tmp_idx2    = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
+            {
+                const auto x0  = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x0_offset) + x_off);
+                const auto x1  = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x1_offset) + x_off);
+                const auto x2  = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x2_offset) + x_off);
+                const auto x3  = *(reinterpret_cast<const float16_t *>(in.ptr() + in_x3_offset) + x_off);
+                float16_t  res = std::max(std::max(x2, x3), std::max(x0, x1));
 
-            // Store indices
-            *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
-        }
-    },
-    in, out, indices);
+                // Store result
+                *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
+
+                const uint32_t offset_base = offset_no_padding<float16_t>(in.offset(), id, *src->info(), pool_stride_x,
+                                                                          pool_stride_y, DataLayout::NHWC);
+                const uint32_t offset_x0   = (uint32_t)offset_base / sizeof(float16_t) + x_off;
+                const uint32_t offset_x1   = (uint32_t)offset_x0 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+                const uint32_t offset_x2   = (uint32_t)offset_x0 + in_stride_z / sizeof(float16_t) -
+                                           pad_horizontal * src->info()->tensor_shape()[1];
+                const uint32_t offset_x3 = (uint32_t)offset_x2 + in_stride_y / sizeof(float16_t) - pad_horizontal;
+                const uint32_t tmp_idx0  = (x0 >= x1) ? offset_x0 : offset_x1;
+                const uint32_t tmp_idx1  = (x2 >= x3) ? offset_x2 : offset_x3;
+                const uint32_t tmp_idx2  = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
+
+                // Store indices
+                *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
+            }
+        },
+        in, out, indices);
 }
-}
+} // namespace
 
-void poolingMxN_fp16_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_fp16_neon_nhwc(const ITensor    *src,
+                               ITensor          *dst0,
+                               ITensor          *dst1,
+                               PoolingLayerInfo &pool_info,
+                               const Window     &window_src,
+                               const Window     &window)
 {
-    if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1)
+    if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && dst1)
     {
         pooling2_f16_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
     }
@@ -167,151 +190,172 @@
     Iterator in(src, window_src);
     Iterator out(dst0, window_out);
 
-    const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
-    const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
-    const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
-    const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
-    const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
-    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-    int       pool_stride_x   = 0;
-    int       pool_stride_y   = 0;
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+    const int pool_pad_right               = pool_info.pad_stride_info.pad_right();
+    const int pool_pad_top                 = pool_info.pad_stride_info.pad_top();
+    const int pool_pad_left                = pool_info.pad_stride_info.pad_left();
+    const int pool_pad_bottom              = pool_info.pad_stride_info.pad_bottom();
+    int       pool_stride_x                = 0;
+    int       pool_stride_y                = 0;
     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
     const int       upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
     const int       upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
     const float16_t min_value     = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
     float16x8_t     vres;
 
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        const int idx_width    = id.y() * pool_stride_x;
-        const int idx_height   = id.z() * pool_stride_y;
-        const int pool_limit_y = pool_pad_top - idx_height;
-        const int pool_limit_x = pool_pad_left - idx_width;
-
-        const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
-        const int pool_end_y   = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
-        const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
-        const int pool_end_x   = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
-        int x_off = window_start_x;
-        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
         {
-            if(pool_info.pool_type != PoolingType::MAX)
+            const int idx_width    = id.y() * pool_stride_x;
+            const int idx_height   = id.z() * pool_stride_y;
+            const int pool_limit_y = pool_pad_top - idx_height;
+            const int pool_limit_x = pool_pad_left - idx_width;
+
+            const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+            const int pool_end_y   = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+            const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+            const int pool_end_x   = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
+
+            int x_off = window_start_x;
+            for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
             {
-                // Calculate scale
-                const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                               pool_stride_y);
-                const float16x8_t scale_v = vdupq_n_f16(scale);
-
-                // Perform pooling
-                vres = vdupq_n_f16(0.0f);
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                if (pool_info.pool_type != PoolingType::MAX)
                 {
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
-                    {
-                        const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                               (src->info()->strides_in_bytes().z())) + x_off);
+                    // Calculate scale
+                    const float scale = calculate_avg_scale_pool2d(
+                        pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+                        upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+                    const float16x8_t scale_v = vdupq_n_f16(scale);
 
-                        // Get power of 2 in case of l2 pooling and accumulate
-                        if(pool_info.pool_type == PoolingType::L2)
+                    // Perform pooling
+                    vres = vdupq_n_f16(0.0f);
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
                         {
-                            vres = vaddq_f16(vres, vmulq_f16(data, data));
-                        }
-                        else
-                        {
-                            vres = vaddq_f16(vres, data);
+                            const float16x8_t data = vld1q_f16(
+                                reinterpret_cast<const float16_t *>(
+                                    in.ptr() +
+                                    (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                                    (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+                                x_off);
+
+                            // Get power of 2 in case of l2 pooling and accumulate
+                            if (pool_info.pool_type == PoolingType::L2)
+                            {
+                                vres = vaddq_f16(vres, vmulq_f16(data, data));
+                            }
+                            else
+                            {
+                                vres = vaddq_f16(vres, data);
+                            }
                         }
                     }
+                    // Divide by scale
+                    vres = vmulq_f16(vres, scale_v);
                 }
-                // Divide by scale
-                vres = vmulq_f16(vres, scale_v);
-            }
-            else
-            {
-                vres = vdupq_n_f16(min_value);
-
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                else
                 {
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    vres = vdupq_n_f16(min_value);
+
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
                     {
-                        const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                               (src->info()->strides_in_bytes().z())) + x_off);
-                        vres                   = vmaxq_f16(vres, data);
-                    }
-                }
-            }
-
-            // Calculate square-root in case of l2 pooling
-            if(pool_info.pool_type == PoolingType::L2)
-            {
-                float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
-                vres                        = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal));
-            }
-
-            // Store result
-            vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
-        }
-
-        // Left-overs loop
-        for(; x_off < window_end_x; ++x_off)
-        {
-            float16_t res = 0.0f;
-
-            if(pool_info.pool_type != PoolingType::MAX)
-            {
-                // Calculate scale
-                const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                                   pool_stride_y);
-
-                for(int y = pool_start_y; y < pool_end_y; ++y)
-                {
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
-                    {
-                        const float data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                 (src->info()->strides_in_bytes().z())) + x_off);
-
-                        // Get power of 2 in case of l2 pooling and accumulate
-                        if(pool_info.pool_type == PoolingType::L2)
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
                         {
-                            res += data * data;
-                        }
-                        else
-                        {
-                            res += data;
+                            const float16x8_t data = vld1q_f16(
+                                reinterpret_cast<const float16_t *>(
+                                    in.ptr() +
+                                    (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                                    (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+                                x_off);
+                            vres = vmaxq_f16(vres, data);
                         }
                     }
                 }
 
-                // Divide by scale
-                res *= scale;
-            }
-            else
-            {
-                res = min_value;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                // Calculate square-root in case of l2 pooling
+                if (pool_info.pool_type == PoolingType::L2)
                 {
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    float16x8_t sqrt_reciprocal = vrsqrteq_f16(vres);
+                    vres = vmulq_f16(vres, vmulq_f16(vrsqrtsq_f16(vmulq_f16(vres, sqrt_reciprocal), sqrt_reciprocal),
+                                                     sqrt_reciprocal));
+                }
+
+                // Store result
+                vst1q_f16(reinterpret_cast<float16_t *>(out.ptr()) + x_off, vres);
+            }
+
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
+            {
+                float16_t res = 0.0f;
+
+                if (pool_info.pool_type != PoolingType::MAX)
+                {
+                    // Calculate scale
+                    const float16_t scale = calculate_avg_scale_pool2d(
+                        pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+                        upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
                     {
-                        const float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                     (src->info()->strides_in_bytes().z())) + x_off);
-                        res                  = std::max(res, data);
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const float data =
+                                *(reinterpret_cast<const float16_t *>(
+                                      in.ptr() +
+                                      (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                                      (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+                                  x_off);
+
+                            // Get power of 2 in case of l2 pooling and accumulate
+                            if (pool_info.pool_type == PoolingType::L2)
+                            {
+                                res += data * data;
+                            }
+                            else
+                            {
+                                res += data;
+                            }
+                        }
+                    }
+
+                    // Divide by scale
+                    res *= scale;
+                }
+                else
+                {
+                    res = min_value;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const float16_t data =
+                                *(reinterpret_cast<const float16_t *>(
+                                      in.ptr() +
+                                      (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                                      (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+                                  x_off);
+                            res = std::max(res, data);
+                        }
                     }
                 }
-            }
 
-            // Calculate square-root in case of l2 pooling
-            if(pool_info.pool_type == PoolingType::L2)
-            {
-                res = std::sqrt(res);
-            }
+                // Calculate square-root in case of l2 pooling
+                if (pool_info.pool_type == PoolingType::L2)
+                {
+                    res = std::sqrt(res);
+                }
 
-            // Store result
-            *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
-        }
-    },
-    in, out);
+                // Store result
+                *(reinterpret_cast<float16_t *>(out.ptr()) + x_off) = res;
+            }
+        },
+        in, out);
 }
 } // namespace cpu
 } // namespace arm_compute
 
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
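The fp16 kernel reformatted above, like its fp32 counterpart that follows, traverses the channel dimension with a vectorized main loop followed by a scalar "left-overs" loop for the elements that do not fill a full NEON register. A minimal standalone sketch of that pattern, using hypothetical names and plain C++ in place of the NEON intrinsics:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    // Hypothetical sketch (not ACL code): window_step_x stands in for the
    // NEON vector width (4 lanes of a float32x4_t). In the real kernels the
    // main-loop body is vld1q_f32/vmaxq_f32/vst1q_f32; the tail is scalar.
    // Assumes a, b and out all have the same size.
    void elementwise_max(const std::vector<float> &a, const std::vector<float> &b, std::vector<float> &out)
    {
        constexpr std::size_t window_step_x = 4;
        std::size_t           x_off         = 0;
        // Main loop: whole vectors at a time.
        for (; x_off + window_step_x <= a.size(); x_off += window_step_x)
        {
            for (std::size_t i = 0; i < window_step_x; ++i)
            {
                out[x_off + i] = std::max(a[x_off + i], b[x_off + i]);
            }
        }
        // Left-overs loop: one element at a time for the remainder.
        for (; x_off < a.size(); ++x_off)
        {
            out[x_off] = std::max(a[x_off], b[x_off]);
        }
    }
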
diff --git a/src/cpu/kernels/pool2d/neon/fp32.cpp b/src/cpu/kernels/pool2d/neon/fp32.cpp
index a400f3a..aaa3786 100644
--- a/src/cpu/kernels/pool2d/neon/fp32.cpp
+++ b/src/cpu/kernels/pool2d/neon/fp32.cpp
@@ -24,8 +24,9 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 #include "src/cpu/kernels/pool2d/neon/list.h"
 
 namespace arm_compute
@@ -34,7 +35,12 @@
 {
 namespace
 {
-void pooling2_f32_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_f32_maxpool_indices(const ITensor    *src,
+                                  ITensor          *dst0,
+                                  ITensor          *dst1,
+                                  PoolingLayerInfo &pool_info,
+                                  const Window     &window_src,
+                                  const Window     &window)
 {
     const int window_start_x = window.x().start();
     const int window_end_x   = window.x().end();
@@ -50,8 +56,8 @@
     const int pool_pad_top  = pool_info.pad_stride_info.pad_top();
     const int pool_pad_left = pool_info.pad_stride_info.pad_left();
 
-    int pool_stride_x = 0;
-    int pool_stride_y = 0;
+    int pool_stride_x                      = 0;
+    int pool_stride_y                      = 0;
     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
 
     float32x4_t vres;
@@ -63,89 +69,102 @@
     const int in_stride_y    = static_cast<int>(src->info()->strides_in_bytes().y());
     const int in_stride_z    = static_cast<int>(src->info()->strides_in_bytes().z());
 
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        const int idx_width    = id.y() * pool_stride_x;
-        const int idx_height   = id.z() * pool_stride_y;
-        const int pool_limit_y = pool_pad_top - idx_height;
-        const int pool_limit_x = pool_pad_left - idx_width;
-
-        const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
-        const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
-
-        const int in_x0_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
-        const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) * static_cast<int>
-                                 (src->info()->strides_in_bytes().z());
-        const int in_x2_offset = (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
-                                 (src->info()->strides_in_bytes().z());
-        const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) * static_cast<int>
-                                 (src->info()->strides_in_bytes().z());
-
-        int x_off = window_start_x;
-        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
         {
-            const auto in_x0_ptr = reinterpret_cast<const float *>(in.ptr() + in_x0_offset);
-            const auto in_x1_ptr = reinterpret_cast<const float *>(in.ptr() + in_x1_offset);
-            const auto in_x2_ptr = reinterpret_cast<const float *>(in.ptr() + in_x2_offset);
-            const auto in_x3_ptr = reinterpret_cast<const float *>(in.ptr() + in_x3_offset);
-            const auto v_x0      = vld1q_f32(in_x0_ptr + x_off);
-            const auto v_x1      = vld1q_f32(in_x1_ptr + x_off);
-            const auto v_x2      = vld1q_f32(in_x2_ptr + x_off);
-            const auto v_x3      = vld1q_f32(in_x3_ptr + x_off);
-            vres                 = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
-            // Store result
-            vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
+            const int idx_width    = id.y() * pool_stride_x;
+            const int idx_height   = id.z() * pool_stride_y;
+            const int pool_limit_y = pool_pad_top - idx_height;
+            const int pool_limit_x = pool_pad_left - idx_width;
 
-            const uint32_t   offset_base  = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
-            const uint32_t   offset_x0    = offset_base / sizeof(float) + x_off;
-            const uint32_t   offset_x1    = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
-            const uint32_t   offset_x2    = offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
-            const uint32_t   offset_x3    = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
-            const uint32x4_t voffset_x0   = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
-            const uint32x4_t voffset_x1   = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
-            const uint32x4_t voffset_x2   = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
-            const uint32x4_t voffset_x3   = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
-            const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
-            const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
-            const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
+            const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+            const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
 
-            // Store indices
-            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2);
-        }
+            const int in_x0_offset =
+                (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+            const int in_x1_offset =
+                (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                (pool_start_y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+            const int in_x2_offset =
+                (pool_start_x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
+            const int in_x3_offset =
+                (pool_start_x + 1 - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                (pool_start_y + 1 - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z());
 
-        // Left-overs loop
-        for(; x_off < window_end_x; ++x_off)
-        {
-            const auto x0 = *(reinterpret_cast<const float *>(in.ptr() + in_x0_offset) + x_off);
-            const auto x1 = *(reinterpret_cast<const float *>(in.ptr() + in_x1_offset) + x_off);
-            const auto x2 = *(reinterpret_cast<const float *>(in.ptr() + in_x2_offset) + x_off);
-            const auto x3 = *(reinterpret_cast<const float *>(in.ptr() + in_x3_offset) + x_off);
-            res           = std::max(std::max(x2, x3), std::max(x0, x1));
+            int x_off = window_start_x;
+            for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+            {
+                const auto in_x0_ptr = reinterpret_cast<const float *>(in.ptr() + in_x0_offset);
+                const auto in_x1_ptr = reinterpret_cast<const float *>(in.ptr() + in_x1_offset);
+                const auto in_x2_ptr = reinterpret_cast<const float *>(in.ptr() + in_x2_offset);
+                const auto in_x3_ptr = reinterpret_cast<const float *>(in.ptr() + in_x3_offset);
+                const auto v_x0      = vld1q_f32(in_x0_ptr + x_off);
+                const auto v_x1      = vld1q_f32(in_x1_ptr + x_off);
+                const auto v_x2      = vld1q_f32(in_x2_ptr + x_off);
+                const auto v_x3      = vld1q_f32(in_x3_ptr + x_off);
+                vres                 = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
+                // Store result
+                vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
 
-            // Store result
-            *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
+                const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x,
+                                                                      pool_stride_y, DataLayout::NHWC);
+                const uint32_t offset_x0   = offset_base / sizeof(float) + x_off;
+                const uint32_t offset_x1   = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
+                const uint32_t offset_x2 =
+                    offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
+                const uint32_t   offset_x3    = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
+                const uint32x4_t voffset_x0   = {offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3};
+                const uint32x4_t voffset_x1   = {offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3};
+                const uint32x4_t voffset_x2   = {offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3};
+                const uint32x4_t voffset_x3   = {offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3};
+                const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
+                const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
+                const uint32x4_t tmp_indices2 =
+                    vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
 
-            const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NHWC);
-            const uint32_t offset_x0   = offset_base / sizeof(float) + x_off;
-            const uint32_t offset_x1   = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
-            const uint32_t offset_x2   = offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
-            const uint32_t offset_x3   = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
-            const uint32_t tmp_idx0    = (x0 >= x1) ? offset_x0 : offset_x1;
-            const uint32_t tmp_idx1    = (x2 >= x3) ? offset_x2 : offset_x3;
-            const uint32_t tmp_idx2    = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
+                // Store indices
+                vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, tmp_indices2);
+            }
 
-            // Store indices
-            *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
-        }
-    },
-    in, out, indices);
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
+            {
+                const auto x0 = *(reinterpret_cast<const float *>(in.ptr() + in_x0_offset) + x_off);
+                const auto x1 = *(reinterpret_cast<const float *>(in.ptr() + in_x1_offset) + x_off);
+                const auto x2 = *(reinterpret_cast<const float *>(in.ptr() + in_x2_offset) + x_off);
+                const auto x3 = *(reinterpret_cast<const float *>(in.ptr() + in_x3_offset) + x_off);
+                res           = std::max(std::max(x2, x3), std::max(x0, x1));
+
+                // Store result
+                *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
+
+                const uint32_t offset_base = offset_no_padding<float>(in.offset(), id, *src->info(), pool_stride_x,
+                                                                      pool_stride_y, DataLayout::NHWC);
+                const uint32_t offset_x0   = offset_base / sizeof(float) + x_off;
+                const uint32_t offset_x1   = offset_x0 + in_stride_y / sizeof(float) - pad_horizontal;
+                const uint32_t offset_x2 =
+                    offset_x0 + in_stride_z / sizeof(float) - pad_horizontal * src->info()->tensor_shape()[1];
+                const uint32_t offset_x3 = offset_x2 + in_stride_y / sizeof(float) - pad_horizontal;
+                const uint32_t tmp_idx0  = (x0 >= x1) ? offset_x0 : offset_x1;
+                const uint32_t tmp_idx1  = (x2 >= x3) ? offset_x2 : offset_x3;
+                const uint32_t tmp_idx2  = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
+
+                // Store indices
+                *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = tmp_idx2;
+            }
+        },
+        in, out, indices);
 }
 } // namespace
 
-void poolingMxN_fp32_neon_nhwc_kernel_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window)
+void poolingMxN_fp32_neon_nhwc_kernel_indices(
+    const ITensor *src, ITensor *dst0, ITensor *dst1, const PoolingLayerInfo &pool_info, const Window &window)
 {
-    const int window_start_x     = window.x().start();
-    const int window_end_x       = window.x().end();
+    const int     window_start_x = window.x().start();
+    const int     window_end_x   = window.x().end();
     constexpr int window_step_x  = 4;
 
     Window window_out = window;
@@ -160,8 +179,8 @@
     const int pool_pad_top  = pool_info.pad_stride_info.pad_top();
     const int pool_pad_left = pool_info.pad_stride_info.pad_left();
 
-    int pool_stride_x = 0;
-    int pool_stride_y = 0;
+    int pool_stride_x                      = 0;
+    int pool_stride_y                      = 0;
     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
 
     const float min_value = get_initial_min<float>(pool_info.use_inf_as_limit);
@@ -169,9 +188,9 @@
     float32x4_t vres;
     uint32x4_t  vidx;
 
-    constexpr int idx_width   = 1;
-    constexpr int idx_height  = 2;
-    constexpr int idx_batch   = 3;
+    constexpr int idx_width  = 1;
+    constexpr int idx_height = 2;
+    constexpr int idx_batch  = 3;
 
     const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
     const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
@@ -182,89 +201,97 @@
 
     const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
 
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        const int idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
-        const int idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
-
-        const int pool_start_x = std::max(0, -idx_width);
-        const int pool_start_y = std::max(0, -idx_height);
-
-        const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width);
-        const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height);
-
-        const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride;
-
-        const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride);
-        const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride);
-
-        int x_off = window_start_x;
-
-        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
         {
-            vres              = vdupq_n_f32(min_value);
-            vidx              = vdupq_n_u32(0U);
-            const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
-            uint32_t    curr_kernel_index = pool_size_x * pool_start_y;
-            for(int y = pool_start_y; y < pool_end_y; ++y)
-            {
-                const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
-                curr_kernel_index += pool_start_x;
-                for(int x = pool_start_x; x < pool_end_x; ++x)
-                {
-                    const float32x4_t data              = vld1q_f32(reinterpret_cast<const float *>(in_ptr_x));
-                    const uint32x4_t  vidx_curr         = vdupq_n_u32(curr_kernel_index);
-                    const uint32x4_t idxMask = vcgtq_f32(data, vres);
-                    vidx                     = vbslq_u32(idxMask, vidx_curr, vidx);
-                    vres                     = vmaxq_f32(vres, data);
-                    in_ptr_x += y_stride;
-                    curr_kernel_index++;
-                }
-                curr_kernel_index += (pool_size_x - pool_end_x);
-                in_ptr_y += z_stride;
-            }
-            // Store result
-            vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
-            vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, vidx);
-        }
+            const int idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+            const int idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
 
-        // Left-overs loop
-        for(; x_off < window_end_x; ++x_off)
-        {
-            float    res      = min_value;
-            uint32_t idx      = 0U;
-            const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
-            for(int y = pool_start_y; y < pool_end_y; ++y)
+            const int pool_start_x = std::max(0, -idx_width);
+            const int pool_start_y = std::max(0, -idx_height);
+
+            const int pool_end_x = std::min(pool_size_x, input_dim_w - idx_width);
+            const int pool_end_y = std::min(pool_size_y, input_dim_h - idx_height);
+
+            const uint8_t *in_ptr_n = in_ptr_start + id[idx_batch] * n_stride;
+
+            const int in_ptr_y_offset = (z_stride * idx_height) + (pool_start_y * z_stride);
+            const int in_ptr_x_offset = (y_stride * idx_width) + (pool_start_x * y_stride);
+
+            int x_off = window_start_x;
+
+            for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
             {
-                const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
-                for(int x = pool_start_x; x < pool_end_x; ++x)
+                vres                             = vdupq_n_f32(min_value);
+                vidx                             = vdupq_n_u32(0U);
+                const uint8_t *in_ptr_y          = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
+                uint32_t       curr_kernel_index = pool_size_x * pool_start_y;
+                for (int y = pool_start_y; y < pool_end_y; ++y)
                 {
-                    const float data = *(reinterpret_cast<const float *>(in_ptr_x));
-                    if(data > res)
+                    const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
+                    curr_kernel_index += pool_start_x;
+                    for (int x = pool_start_x; x < pool_end_x; ++x)
                     {
-                        idx = pool_size_x * y + x;
-                        res = data;
+                        const float32x4_t data      = vld1q_f32(reinterpret_cast<const float *>(in_ptr_x));
+                        const uint32x4_t  vidx_curr = vdupq_n_u32(curr_kernel_index);
+                        const uint32x4_t  idxMask   = vcgtq_f32(data, vres);
+                        vidx                        = vbslq_u32(idxMask, vidx_curr, vidx);
+                        vres                        = vmaxq_f32(vres, data);
+                        in_ptr_x += y_stride;
+                        curr_kernel_index++;
                     }
-                    in_ptr_x += y_stride;
+                    curr_kernel_index += (pool_size_x - pool_end_x);
+                    in_ptr_y += z_stride;
                 }
-                in_ptr_y += z_stride;
+                // Store result
+                vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
+                vst1q_u32(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off, vidx);
             }
 
-            // Store result
-            *(reinterpret_cast<float *>(out.ptr()) + x_off)        = res;
-            *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = idx;
-        }
-    },
-    out, indices);
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
+            {
+                float          res      = min_value;
+                uint32_t       idx      = 0U;
+                const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
+                for (int y = pool_start_y; y < pool_end_y; ++y)
+                {
+                    const uint8_t *in_ptr_x = in_ptr_y + (x_off * sizeof(float));
+                    for (int x = pool_start_x; x < pool_end_x; ++x)
+                    {
+                        const float data = *(reinterpret_cast<const float *>(in_ptr_x));
+                        if (data > res)
+                        {
+                            idx = pool_size_x * y + x;
+                            res = data;
+                        }
+                        in_ptr_x += y_stride;
+                    }
+                    in_ptr_y += z_stride;
+                }
+
+                // Store result
+                *(reinterpret_cast<float *>(out.ptr()) + x_off)        = res;
+                *(reinterpret_cast<uint32_t *>(indices.ptr()) + x_off) = idx;
+            }
+        },
+        out, indices);
 }
 
-void poolingMxN_fp32_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_fp32_neon_nhwc(const ITensor    *src,
+                               ITensor          *dst0,
+                               ITensor          *dst1,
+                               PoolingLayerInfo &pool_info,
+                               const Window     &window_src,
+                               const Window     &window)
 {
-    if((pool_info.pool_type == PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr))
+    if ((pool_info.pool_type == PoolingType::MAX) && pool_info.use_kernel_indices && (dst1 != nullptr))
     {
         poolingMxN_fp32_neon_nhwc_kernel_indices(src, dst0, dst1, pool_info, window);
     }
-    else if(pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX && !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr))
+    else if (pool_info.pool_size == Size2D(2, 2) && pool_info.pool_type == PoolingType::MAX &&
+             !pool_info.pad_stride_info.has_padding() && (dst1 != nullptr))
     {
         pooling2_f32_maxpool_indices(src, dst0, dst1, pool_info, window_src, window);
     }
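The dispatch above takes the pooling2_f32_maxpool_indices fast path only for a 2x2 max pool with no padding and an indices tensor present. In that kernel, both the vector loop and the left-overs loop reduce to the same comparison tree for picking the winning source offset; a hypothetical scalar sketch of that selection:

    #include <algorithm>
    #include <cstdint>

    // Hypothetical sketch of the 2x2 argmax selection: x0/x1 are the top row
    // of the window, x2/x3 the bottom row, i0..i3 their flat source offsets.
    // Ties resolve with >=, matching the vcgeq_f32/vbslq_u32 sequence in the
    // vector loop and the ternaries in the left-overs loop.
    uint32_t max2x2_index(float x0, float x1, float x2, float x3,
                          uint32_t i0, uint32_t i1, uint32_t i2, uint32_t i3)
    {
        const uint32_t top_idx    = (x0 >= x1) ? i0 : i1;
        const uint32_t bottom_idx = (x2 >= x3) ? i2 : i3;
        return (std::max(x0, x1) >= std::max(x2, x3)) ? top_idx : bottom_idx;
    }
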
@@ -280,153 +307,174 @@
         Iterator in(src, window_src);
         Iterator out(dst0, window_out);
 
-        const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
-        const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
-        const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
-        const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
-        const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
-        const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-        int       pool_stride_x   = 0;
-        int       pool_stride_y   = 0;
+        const int pool_size_x =
+            pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+        const int pool_size_y =
+            pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+        const int pool_pad_right               = pool_info.pad_stride_info.pad_right();
+        const int pool_pad_top                 = pool_info.pad_stride_info.pad_top();
+        const int pool_pad_left                = pool_info.pad_stride_info.pad_left();
+        const int pool_pad_bottom              = pool_info.pad_stride_info.pad_bottom();
+        int       pool_stride_x                = 0;
+        int       pool_stride_y                = 0;
         std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
         const int   upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
         const int   upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
         const float min_value     = get_initial_min<float>(pool_info.use_inf_as_limit);
         float32x4_t vres;
 
-        execute_window_loop(window_out, [&](const Coordinates & id)
-        {
-            const int idx_width    = id.y() * pool_stride_x;
-            const int idx_height   = id.z() * pool_stride_y;
-            const int pool_limit_y = pool_pad_top - idx_height;
-            const int pool_limit_x = pool_pad_left - idx_width;
-
-            const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
-            const int pool_end_y   = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
-            const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
-            const int pool_end_x   = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
-            int x_off = window_start_x;
-            for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+        execute_window_loop(
+            window_out,
+            [&](const Coordinates &id)
             {
-                if(pool_info.pool_type != PoolingType::MAX)
+                const int idx_width    = id.y() * pool_stride_x;
+                const int idx_height   = id.z() * pool_stride_y;
+                const int pool_limit_y = pool_pad_top - idx_height;
+                const int pool_limit_x = pool_pad_left - idx_width;
+
+                const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+                const int pool_end_y   = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+                const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+                const int pool_end_x   = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
+
+                int x_off = window_start_x;
+                for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
                 {
-                    // Calculate scale
-                    const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                                   pool_stride_y);
-                    const float32x4_t scale_v = vdupq_n_f32(scale);
-
-                    // Perform pooling
-                    vres = vdupq_n_f32(0.0f);
-
-                    for(int y = pool_start_y; y < pool_end_y; ++y)
+                    if (pool_info.pool_type != PoolingType::MAX)
                     {
-                        for(int x = pool_start_x; x < pool_end_x; ++x)
-                        {
-                            const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                               (src->info()->strides_in_bytes().z())) + x_off);
+                        // Calculate scale
+                        const float scale = calculate_avg_scale_pool2d(
+                            pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+                            upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+                        const float32x4_t scale_v = vdupq_n_f32(scale);
 
-                            // Get power of 2 in case of l2 pooling and accumulate
-                            if(pool_info.pool_type == PoolingType::L2)
+                        // Perform pooling
+                        vres = vdupq_n_f32(0.0f);
+
+                        for (int y = pool_start_y; y < pool_end_y; ++y)
+                        {
+                            for (int x = pool_start_x; x < pool_end_x; ++x)
                             {
-                                vres = vmlaq_f32(vres, data, data);
-                            }
-                            else
-                            {
-                                vres = vaddq_f32(vres, data);
+                                const float32x4_t data = vld1q_f32(
+                                    reinterpret_cast<const float *>(
+                                        in.ptr() +
+                                        (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                                        (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+                                    x_off);
+
+                                // Get power of 2 in case of l2 pooling and accumulate
+                                if (pool_info.pool_type == PoolingType::L2)
+                                {
+                                    vres = vmlaq_f32(vres, data, data);
+                                }
+                                else
+                                {
+                                    vres = vaddq_f32(vres, data);
+                                }
                             }
                         }
+                        // Divide by scale
+                        vres = vmulq_f32(vres, scale_v);
                     }
-                    // Divide by scale
-                    vres = vmulq_f32(vres, scale_v);
-                }
-                else
-                {
-                    vres = vdupq_n_f32(min_value);
-                    for(int y = pool_start_y; y < pool_end_y; ++y)
+                    else
                     {
-                        for(int x = pool_start_x; x < pool_end_x; ++x)
+                        vres = vdupq_n_f32(min_value);
+                        for (int y = pool_start_y; y < pool_end_y; ++y)
                         {
-                            const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                               (src->info()->strides_in_bytes().z())) + x_off);
-                            vres                   = vmaxq_f32(vres, data);
-                        }
-                    }
-                }
-
-                // Calculate square-root in case of l2 pooling
-                if(pool_info.pool_type == PoolingType::L2)
-                {
-                    float32x4_t l2_res = { static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
-                                           static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
-                                           static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
-                                           static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
-                                         };
-                    vres = l2_res;
-                }
-
-                // Store result
-                vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
-            }
-
-            // Left-overs loop
-            for(; x_off < window_end_x; ++x_off)
-            {
-                float res = 0.0f;
-
-                if(pool_info.pool_type != PoolingType::MAX)
-                {
-                    // Calculate scale
-                    const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                                   pool_stride_y);
-
-                    for(int y = pool_start_y; y < pool_end_y; ++y)
-                    {
-                        for(int x = pool_start_x; x < pool_end_x; ++x)
-                        {
-                            const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                 (src->info()->strides_in_bytes().z())) + x_off);
-
-                            // Get power of 2 in case of l2 pooling and accumulate
-                            if(pool_info.pool_type == PoolingType::L2)
+                            for (int x = pool_start_x; x < pool_end_x; ++x)
                             {
-                                res += data * data;
-                            }
-                            else
-                            {
-                                res += data;
+                                const float32x4_t data = vld1q_f32(
+                                    reinterpret_cast<const float *>(
+                                        in.ptr() +
+                                        (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                                        (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+                                    x_off);
+                                vres = vmaxq_f32(vres, data);
                             }
                         }
                     }
 
-                    // Divide by scale
-                    res *= scale;
-                }
-                else
-                {
-                    res = min_value;
-                    for(int y = pool_start_y; y < pool_end_y; ++y)
+                    // Calculate square-root in case of l2 pooling
+                    if (pool_info.pool_type == PoolingType::L2)
                     {
-                        for(int x = pool_start_x; x < pool_end_x; ++x)
+                        float32x4_t l2_res = {static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
+                                              static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
+                                              static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
+                                              static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))};
+                        vres               = l2_res;
+                    }
+
+                    // Store result
+                    vst1q_f32(reinterpret_cast<float *>(out.ptr()) + x_off, vres);
+                }
+
+                // Left-overs loop
+                for (; x_off < window_end_x; ++x_off)
+                {
+                    float res = 0.0f;
+
+                    if (pool_info.pool_type != PoolingType::MAX)
+                    {
+                        // Calculate scale
+                        const float scale = calculate_avg_scale_pool2d(
+                            pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+                            upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+                        for (int y = pool_start_y; y < pool_end_y; ++y)
                         {
-                            const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                 (src->info()->strides_in_bytes().z())) + x_off);
-                            res              = std::max(res, data);
+                            for (int x = pool_start_x; x < pool_end_x; ++x)
+                            {
+                                const float data =
+                                    *(reinterpret_cast<const float *>(
+                                          in.ptr() +
+                                          (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                                          (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+                                      x_off);
+
+                                // Get power of 2 in case of l2 pooling and accumulate
+                                if (pool_info.pool_type == PoolingType::L2)
+                                {
+                                    res += data * data;
+                                }
+                                else
+                                {
+                                    res += data;
+                                }
+                            }
+                        }
+
+                        // Divide by scale
+                        res *= scale;
+                    }
+                    else
+                    {
+                        res = min_value;
+                        for (int y = pool_start_y; y < pool_end_y; ++y)
+                        {
+                            for (int x = pool_start_x; x < pool_end_x; ++x)
+                            {
+                                const float data =
+                                    *(reinterpret_cast<const float *>(
+                                          in.ptr() +
+                                          (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                                          (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+                                      x_off);
+                                res = std::max(res, data);
+                            }
                         }
                     }
-                }
 
-                // Calculate square-root in case of l2 pooling
-                if(pool_info.pool_type == PoolingType::L2)
-                {
-                    res = std::sqrt(res);
-                }
+                    // Calculate square-root in case of l2 pooling
+                    if (pool_info.pool_type == PoolingType::L2)
+                    {
+                        res = std::sqrt(res);
+                    }
 
-                // Store result
-                *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
-            }
-        },
-        in, out);
+                    // Store result
+                    *(reinterpret_cast<float *>(out.ptr()) + x_off) = res;
+                }
+            },
+            in, out);
     }
 }
 } // namespace cpu
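Across both precisions the generic NHWC path implements three reductions over each pooling window: AVG and L2 accumulate (values or squares) and then multiply by the scale returned by calculate_avg_scale_pool2d, while MAX keeps a running maximum seeded with min_value, with L2 additionally taking a square root at the end. A compact scalar sketch of the shared AVG/L2 arithmetic, with hypothetical names:

    #include <cmath>
    #include <vector>

    // Hypothetical sketch (not ACL code): reduce one pooling window whose
    // contributing values have been gathered into window_vals. scale is
    // assumed to be 1 / (number of contributing elements), i.e. what
    // calculate_avg_scale_pool2d produces for the AVG and L2 cases.
    float reduce_window(const std::vector<float> &window_vals, bool l2_pool, float scale)
    {
        float res = 0.0f;
        for (const float data : window_vals)
        {
            res += l2_pool ? data * data : data; // square-accumulate only for L2
        }
        res *= scale;                            // divide by the valid window area
        return l2_pool ? std::sqrt(res) : res;   // square-root only for L2
    }
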
diff --git a/src/cpu/kernels/pool2d/neon/list.h b/src/cpu/kernels/pool2d/neon/list.h
index eb141d6..f8f458a 100644
--- a/src/cpu/kernels/pool2d/neon/list.h
+++ b/src/cpu/kernels/pool2d/neon/list.h
@@ -26,16 +26,19 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 #include "src/cpu/kernels/pool2d/neon/quantized.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
 {
 namespace cpu
 {
-#define DECLARE_POOLING_KERNEL(func_name) \
-    void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, const Window &window)
+#define DECLARE_POOLING_KERNEL(func_name)                                                                           \
+    void func_name(const ITensor *src0, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &, const Window &window_src, \
+                   const Window &window)
 
 DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_neon_nhwc);
 DECLARE_POOLING_KERNEL(poolingMxN_qasymm8_signed_neon_nhwc);
@@ -65,7 +68,12 @@
 }
 
 template <typename T>
-inline uint32_t offset_no_padding(uint32_t padded_offset, const Coordinates &id, const ITensorInfo &info, int pool_stride_x, int pool_stride_y, DataLayout data_layout)
+inline uint32_t offset_no_padding(uint32_t           padded_offset,
+                                  const Coordinates &id,
+                                  const ITensorInfo &info,
+                                  int                pool_stride_x,
+                                  int                pool_stride_y,
+                                  DataLayout         data_layout)
 {
     const int pad_left    = info.padding().left;
     const int pad_right   = info.padding().right;
@@ -76,22 +84,24 @@
     const int pad_horiz   = pad_left + pad_right;
     const int pad_vert    = pad_top + pad_bottom;
 
-    if(data_layout == DataLayout::NCHW)
+    if (data_layout == DataLayout::NCHW)
     {
-        const uint32_t offset_base = padded_offset
-                                     - sizeof(T) * pad_horiz * id.y() * pool_stride_y                                            /* subtract padding elems per row */
-                                     - pad_top * sizeof(T)                                                                       /* top padding */
-                                     - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() - pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
-                                     - in_stride_w * id[3];
+        const uint32_t offset_base =
+            padded_offset - sizeof(T) * pad_horiz * id.y() * pool_stride_y /* subtract padding elems per row */
+            - pad_top * sizeof(T)                                          /* top padding */
+            - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() -
+            pad_vert * in_stride_y * id.z() /* for each Z plane there are height*pad_right padding elems */
+            - in_stride_w * id[3];
 
         return offset_base;
     }
     else
     {
-        const uint32_t offset_base = padded_offset
-                                     - sizeof(T) * pad_horiz * id.y() * pool_stride_x                          // subtract padding elems per row
-                                     - pad_top * sizeof(T)                                                     // top padding
-                                     - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() * pool_stride_y // for each Z plane there are width*pad_right padding elems
+        const uint32_t offset_base = padded_offset -
+                                     sizeof(T) * pad_horiz * id.y() * pool_stride_x // subtract padding elems per row
+                                     - pad_top * sizeof(T)                          // top padding
+                                     - sizeof(T) * pad_horiz * info.tensor_shape()[1] * id.z() *
+                                           pool_stride_y // for each Z plane there are width*pad_right padding elems
                                      - in_stride_w * id[3];
 
         return offset_base;
@@ -100,4 +110,4 @@
 } // namespace cpu
 } // namespace arm_compute
 
-#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H
\ No newline at end of file
+#endif // SRC_CORE_NEON_KERNELS_POOLING_LIST_H
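The offset_no_padding helper reformatted above converts an offset into the padded source buffer into the offset the same element would have in an unpadded buffer, by subtracting all the padding elements that precede it. A simplified two-dimensional sketch of the same bookkeeping, under the assumption of a single row-major plane with left/right/top padding only:

    #include <cstdint>

    // Hypothetical 2D sketch (not the ACL helper): element (row, col) sits at
    // ((row + pad_top) * (width + pad_horiz) + pad_left + col) elements in the
    // padded layout and at (row * width + col) in the unpadded one, so the
    // difference between the two offsets is pure padding bookkeeping.
    uint32_t unpadded_offset(uint32_t padded_offset_bytes, int row, int width,
                             int pad_left, int pad_right, int pad_top, uint32_t elem_size)
    {
        const int pad_horiz = pad_left + pad_right;
        const int pad_elems = pad_top * (width + pad_horiz) // full padding rows on top
                              + row * pad_horiz             // per-row padding already passed
                              + pad_left;                   // padding at the start of this row
        return padded_offset_bytes - elem_size * static_cast<uint32_t>(pad_elems);
    }
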
diff --git a/src/cpu/kernels/pool2d/neon/nchw/all.cpp b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
index c342b96..ee4a67b 100644
--- a/src/cpu/kernels/pool2d/neon/nchw/all.cpp
+++ b/src/cpu/kernels/pool2d/neon/nchw/all.cpp
@@ -25,9 +25,11 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 #include "src/cpu/kernels/pool2d/neon/list.h"
+
 #include <limits>
 
 #ifdef ENABLE_NCHW_KERNELS
@@ -38,15 +40,19 @@
 #define READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
     (x == width + pad_left - 1) ? vset_lane_f32(*(ptr), vdup_n_f32(fval), 0) : vld1_f32(ptr)
 #define READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
-    (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1) : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
-#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval) \
-    ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) ? vdup_n_f32(fval) : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
+    (x == pad_left - 1) ? vset_lane_f32(*(1 + ptr), vdup_n_f32(fval), 1)              \
+                        : READ_2_RIGHT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
+#define READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)                   \
+    ((y < pad_top) || (x < pad_left - 1) || (y >= height + pad_top) || (x > width + pad_left - 1)) \
+        ? vdup_n_f32(fval)                                                                         \
+        : READ_2_LEFT_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)
 
 #define READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval)           \
     vcombine_f32(READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval), \
                  READ_2_BOUNDARY_AWARE(height, width, pad_left, pad_top, (x + 2), y, (ptr + 2), fval))
 
-float32x4x2_t read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval)
+float32x4x2_t
+read_8_boundary_aware(int height, int width, int pad_left, int pad_top, int x, int y, const float *ptr, float fval)
 {
     float32x4x2_t vec;
     vec.val[0] = READ_4_BOUNDARY_AWARE(height, width, pad_left, pad_top, x, y, ptr, fval);
@@ -56,13 +62,14 @@
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
-float16x4_t read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval)
+float16x4_t
+read_4_boundary_aware_fp16(int srcw, int srch, int pad_l, int pad_t, int x, int y, const float16_t *ptr, float16_t fval)
 {
     float16_t  vec[4];
     const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
-    for(int i = 0; i < 4; i++)
+    for (int i = 0; i < 4; i++)
     {
-        if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+        if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
         {
             vec[i] = *(ptr + i);
         }
@@ -74,94 +81,106 @@
     return wrapper::vload(vec);
 }
 
-void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling3_fp16_neon_nchw(const ITensor    *src,
+                             ITensor          *dst0,
+                             ITensor          *dst1,
+                             PoolingLayerInfo &pool_info,
+                             const Window     &window_src,
+                             const Window     &window)
 {
     ARM_COMPUTE_UNUSED(dst1);
 
     Iterator in(src, window_src);
     Iterator out(dst0, window);
 
-    constexpr const int pool_size       = 3;
-    const int           pool_pad_right  = pool_info.pad_stride_info.pad_right();
-    const int           pool_pad_top    = pool_info.pad_stride_info.pad_top();
-    const int           pool_pad_left   = pool_info.pad_stride_info.pad_left();
-    const int           pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-    int                 pool_stride_x   = 0;
-    int                 pool_stride_y   = 0;
-    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
-    const int                  src_w          = src->info()->dimension(0);
-    const int                  src_h          = src->info()->dimension(1);
-    const int                  upper_bound_w  = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
-    const int                  upper_bound_h  = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-    const float16_t            fp16_min       = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
-    const float16_t            fill_value     = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.f;
-    const unsigned char *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-    const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+    constexpr const int pool_size            = 3;
+    const int           pool_pad_right       = pool_info.pad_stride_info.pad_right();
+    const int           pool_pad_top         = pool_info.pad_stride_info.pad_top();
+    const int           pool_pad_left        = pool_info.pad_stride_info.pad_left();
+    const int           pool_pad_bottom      = pool_info.pad_stride_info.pad_bottom();
+    int                 pool_stride_x        = 0;
+    int                 pool_stride_y        = 0;
+    std::tie(pool_stride_x, pool_stride_y)   = pool_info.pad_stride_info.stride();
+    const int                  src_w         = src->info()->dimension(0);
+    const int                  src_h         = src->info()->dimension(1);
+    const int                  upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+    const int                  upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+    const float16_t            fp16_min      = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
+    const float16_t            fill_value    = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.f;
+    const unsigned char *const src_top_ptr =
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const unsigned char *const src_middle_ptr =
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const unsigned char *const src_bottom_ptr =
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto  x_val    = id.x() * pool_stride_x;
-        const auto  y_val_0  = id.y() * pool_stride_y;
-        const auto  y_val_1  = (id.y() * pool_stride_y) + 1;
-        const auto  y_val_2  = (id.y() * pool_stride_y) + 2;
-        float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
-                                                          x_val, y_val_0, reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()), fill_value);
-        float16x4_t middle_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
-                                                             x_val, y_val_1, reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()), fill_value);
-        float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
-                                                             x_val, y_val_2, reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()), fill_value);
-        float16x4_t res = {};
-
-        // Get power of 2 in case of l2 pooling
-        if(pool_info.pool_type == PoolingType::L2)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            top_data    = vmul_f16(top_data, top_data);
-            middle_data = vmul_f16(middle_data, middle_data);
-            bottom_data = vmul_f16(bottom_data, bottom_data);
-        }
+            const auto  x_val   = id.x() * pool_stride_x;
+            const auto  y_val_0 = id.y() * pool_stride_y;
+            const auto  y_val_1 = (id.y() * pool_stride_y) + 1;
+            const auto  y_val_2 = (id.y() * pool_stride_y) + 2;
+            float16x4_t top_data =
+                read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0,
+                                           reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()), fill_value);
+            float16x4_t middle_data = read_4_boundary_aware_fp16(
+                src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1,
+                reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()), fill_value);
+            float16x4_t bottom_data = read_4_boundary_aware_fp16(
+                src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_2,
+                reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()), fill_value);
+            float16x4_t res = {};
 
-        if(pool_info.pool_type != PoolingType::MAX)
-        {
-            // Calculate scale
-            const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                           pool_stride_y);
-            const float16x4_t scale_v = vdup_n_f16(scale);
-            // Perform pooling
-            const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
-            res                        = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
-            res                        = vmul_f16(vpadd_f16(res, res), scale_v);
-        }
-        else
-        {
-            const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
-            res                        = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data);
-            res                        = vpmax_f16(res, res);
-        }
+            // Get power of 2 in case of l2 pooling
+            if (pool_info.pool_type == PoolingType::L2)
+            {
+                top_data    = vmul_f16(top_data, top_data);
+                middle_data = vmul_f16(middle_data, middle_data);
+                bottom_data = vmul_f16(bottom_data, bottom_data);
+            }
 
-        // Calculate square-root in case of l2 pooling
-        if(pool_info.pool_type == PoolingType::L2)
-        {
-            res = vsqrt_f16(res);
-        }
+            if (pool_info.pool_type != PoolingType::MAX)
+            {
+                // Calculate scale
+                const float scale = calculate_avg_scale_pool2d(
+                    pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h,
+                    pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+                const float16x4_t scale_v = vdup_n_f16(scale);
+                // Perform pooling
+                const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
+                res                        = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
+                res                        = vmul_f16(vpadd_f16(res, res), scale_v);
+            }
+            else
+            {
+                const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
+                res                        = vpmax_f16(vset_lane_f16(fp16_min, max_data, 3), max_data);
+                res                        = vpmax_f16(res, res);
+            }
 
-        *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
-    },
-    in, out);
+            // Calculate square-root in case of l2 pooling
+            if (pool_info.pool_type == PoolingType::L2)
+            {
+                res = vsqrt_f16(res);
+            }
+
+            *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
+        },
+        in, out);
 }
 
 template <typename T>
-inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
-f16_to_f32(float16x4_t in)
+inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type f16_to_f32(float16x4_t in)
 {
-    float32x2_t out = { static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1)) };
+    float32x2_t out = {static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1))};
     return out;
 }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
 template <typename T>
-inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
-f16_to_f32(float32x2_t in)
+inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type f16_to_f32(float32x2_t in)
 {
     return in;
 }
@@ -171,9 +190,9 @@
 {
     T          vec[2];
     const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
-    for(int i = 0; i < 2; i++)
+    for (int i = 0; i < 2; i++)
     {
-        if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+        if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
         {
             vec[i] = *(ptr + i);
         }
@@ -186,61 +205,80 @@
 }
 
 template <typename T>
-void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_nchw_maxpool_indices(const ITensor    *src,
+                                   ITensor          *dst0,
+                                   ITensor          *dst1,
+                                   PoolingLayerInfo &pool_info,
+                                   const Window     &window_src,
+                                   const Window     &window)
 {
     Iterator  in(src, window_src);
     Iterator  out(dst0, window);
     Iterator  indices(dst1, window);
-    const int pool_pad_top  = pool_info.pad_stride_info.pad_top();
-    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
-    int       pool_stride_x = 0;
-    int       pool_stride_y = 0;
+    const int pool_pad_top                 = pool_info.pad_stride_info.pad_top();
+    const int pool_pad_left                = pool_info.pad_stride_info.pad_left();
+    int       pool_stride_x                = 0;
+    int       pool_stride_y                = 0;
     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
-    const int            src_w          = src->info()->dimension(0);
-    const int            src_h          = src->info()->dimension(1);
-    const uint8_t *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-    const int            pad_left       = src->info()->padding().left;
-    const int            pad_right      = src->info()->padding().right;
-    const int            in_stride_y    = static_cast<int>(src->info()->strides_in_bytes().y());
-    const T              float_min      = get_initial_min<T>(pool_info.use_inf_as_limit);
-    const T              fill_value     = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f;
+    const int            src_w             = src->info()->dimension(0);
+    const int            src_h             = src->info()->dimension(1);
+    const uint8_t *const src_top_ptr =
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const src_bottom_ptr =
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const int pad_left    = src->info()->padding().left;
+    const int pad_right   = src->info()->padding().right;
+    const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
+    const T   float_min   = get_initial_min<T>(pool_info.use_inf_as_limit);
+    const T   fill_value  = (pool_info.pool_type == PoolingType::MAX) ? float_min : 0.f;
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto x_val    = id.x() * pool_stride_x;
-        const auto y_val_0  = id.y() * pool_stride_y;
-        const auto y_val_1  = (id.y() * pool_stride_y) + 1;
-        auto       top_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top,
-                                                    x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
-        auto bottom_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top,
-                                                 x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
-        float32x2_t top_data_f32    = f16_to_f32<T>(top_data);
-        float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
+        {
+            const auto x_val    = id.x() * pool_stride_x;
+            const auto y_val_0  = id.y() * pool_stride_y;
+            const auto y_val_1  = (id.y() * pool_stride_y) + 1;
+            auto       top_data = read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_0,
+                                                        reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+            auto       bottom_data =
+                read_2_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_top, x_val, y_val_1,
+                                      reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+            float32x2_t top_data_f32    = f16_to_f32<T>(top_data);
+            float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);
 
-        // Calculate max data, comparing top first, then bottom, to make sure the first max is recorded.
-        const float32x2_t max_data_top      = vpmax_f32(top_data_f32, top_data_f32);
-        const float32x2_t max_data_bottom   = vpmax_f32(bottom_data_f32, bottom_data_f32);
-        const float32x2_t max_data          = vmax_f32(max_data_top, max_data_bottom);
-        *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
+            // Calculate max data, comparing top first, then bottom, to make sure the first max is recorded.
+            const float32x2_t max_data_top      = vpmax_f32(top_data_f32, top_data_f32);
+            const float32x2_t max_data_bottom   = vpmax_f32(bottom_data_f32, bottom_data_f32);
+            const float32x2_t max_data          = vmax_f32(max_data_top, max_data_bottom);
+            *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
 
-        // Calculate max data index, which will be used in max unpooling.
-        const uint32_t   offset_base              = offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
-        const uint32_t   offset_top               = (uint32_t)(offset_base / sizeof(T));
-        const uint32_t   offset_bottom            = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
-        const uint32x2_t voffset_top              = { offset_top, offset_top + 1u };
-        const uint32x2_t voffset_bottom           = { offset_bottom, offset_bottom + 1u };
-        const uint32x2_t tmp_indices_top          = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
-        const uint32x2_t tmp_indices_bottom       = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
-        *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
-    },
-    in, out, indices);
+            // Calculate max data index, which will be used in max unpooling.
+            const uint32_t offset_base =
+                offset_no_padding<T>(in.offset(), id, *src->info(), pool_stride_x, pool_stride_y, DataLayout::NCHW);
+            const uint32_t   offset_top     = (uint32_t)(offset_base / sizeof(T));
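+            // The bottom row sits one row stride (in elements) below the top; left/right alignment padding is
+            // subtracted because these indices address the tensor without padding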
+            const uint32_t   offset_bottom  = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
+            const uint32x2_t voffset_top    = {offset_top, offset_top + 1u};
+            const uint32x2_t voffset_bottom = {offset_bottom, offset_bottom + 1u};
+            const uint32x2_t tmp_indices_top =
+                vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
+            const uint32x2_t tmp_indices_bottom       = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)),
+                                                                 voffset_bottom, vrev64_u32(voffset_bottom));
+            *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(
+                vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
+        },
+        in, out, indices);
 }
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_fp16_neon_nchw(const ITensor    *src,
+                             ITensor          *dst0,
+                             ITensor          *dst1,
+                             PoolingLayerInfo &pool_info,
+                             const Window     &window_src,
+                             const Window     &window)
 {
-    if(pool_info.pool_type == PoolingType::MAX && dst1)
+    if (pool_info.pool_type == PoolingType::MAX && dst1)
     {
         pooling2_nchw_maxpool_indices<float16_t>(src, dst0, dst1, pool_info, window_src, window);
     }
@@ -254,244 +292,274 @@
         const int     pool_pad_left   = pool_info.pad_stride_info.pad_left();
         const int     pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
         int           pool_stride_x, pool_stride_y = 0;
-        std::tie(pool_stride_x, pool_stride_y)     = pool_info.pad_stride_info.stride();
-        const int       src_w         = src->info()->dimension(0);
-        const int       src_h         = src->info()->dimension(1);
-        const int       upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
-        const int       upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-        const float16_t fp16_min      = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
-        const float16_t fill_value    = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
+        std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+        const int       src_w                  = src->info()->dimension(0);
+        const int       src_h                  = src->info()->dimension(1);
+        const int       upper_bound_w          = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+        const int       upper_bound_h          = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+        const float16_t fp16_min               = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
+        const float16_t fill_value             = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
 
-        const unsigned char *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-        const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+        const unsigned char *const src_top_ptr =
+            src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+        const unsigned char *const src_bottom_ptr =
+            src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
 
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const auto in_top_ptr    = reinterpret_cast<const float16_t *>(src_top_ptr + in.offset());
-            const auto in_bottom_ptr = reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset());
-
-            const auto  x_val    = id.x() * pool_stride_x;
-            const auto  y_val_0  = id.y() * pool_stride_y;
-            const auto  y_val_1  = (id.y() * pool_stride_y) + 1;
-            float16x4_t top_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
-                                                              x_val, y_val_0, in_top_ptr, fill_value);
-            float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top,
-                                                                 x_val, y_val_1, in_bottom_ptr, fill_value);
-            float16x4_t res = {};
-
-            // Get power of 2 in case of l2 pooling
-            if(pool_info.pool_type == PoolingType::L2)
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
             {
-                top_data    = vmul_f16(top_data, top_data);
-                bottom_data = vmul_f16(bottom_data, bottom_data);
-            }
+                const auto in_top_ptr    = reinterpret_cast<const float16_t *>(src_top_ptr + in.offset());
+                const auto in_bottom_ptr = reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset());
 
-            if(pool_info.pool_type != PoolingType::MAX)
-            {
-                const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                               pool_stride_y);
-                const float16x4_t scale_v = vdup_n_f16(scale);
+                const auto  x_val       = id.x() * pool_stride_x;
+                const auto  y_val_0     = id.y() * pool_stride_y;
+                const auto  y_val_1     = (id.y() * pool_stride_y) + 1;
+                float16x4_t top_data    = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val,
+                                                                     y_val_0, in_top_ptr, fill_value);
+                float16x4_t bottom_data = read_4_boundary_aware_fp16(src_w, src_h, pool_pad_left, pool_pad_top, x_val,
+                                                                     y_val_1, in_bottom_ptr, fill_value);
+                float16x4_t res         = {};
 
-                const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
-                res                        = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
-            }
-            else
-            {
-                const float16x4_t max_data = vmax_f16(top_data, bottom_data);
-                res                        = vpmax_f16(max_data, max_data);
-            }
+                // Get power of 2 in case of l2 pooling
+                if (pool_info.pool_type == PoolingType::L2)
+                {
+                    top_data    = vmul_f16(top_data, top_data);
+                    bottom_data = vmul_f16(bottom_data, bottom_data);
+                }
 
-            // Calculate square-root in case of l2 pooling
-            if(pool_info.pool_type == PoolingType::L2)
-            {
-                res = vsqrt_f16(res);
-            }
+                if (pool_info.pool_type != PoolingType::MAX)
+                {
+                    const float scale = calculate_avg_scale_pool2d(
+                        pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w,
+                        upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+                    const float16x4_t scale_v = vdup_n_f16(scale);
 
-            // Store result
-            *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
-        },
-        in, out);
+                    const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
+                    res                        = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
+                }
+                else
+                {
+                    const float16x4_t max_data = vmax_f16(top_data, bottom_data);
+                    res                        = vpmax_f16(max_data, max_data);
+                }
+
+                // Calculate square-root in case of l2 pooling
+                if (pool_info.pool_type == PoolingType::L2)
+                {
+                    res = vsqrt_f16(res);
+                }
+
+                // Store result
+                *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
+            },
+            in, out);
     }
 }
 
-void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_fp16_neon_nchw(const ITensor    *src,
+                               ITensor          *dst0,
+                               ITensor          *dst1,
+                               PoolingLayerInfo &pool_info,
+                               const Window     &window_src,
+                               const Window     &window)
 {
     ARM_COMPUTE_UNUSED(dst1);
     Iterator in(src, window_src);
     Iterator out(dst0, window);
 
-    const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
-    const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
-    const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
-    const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
-    const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
-    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-    int       pool_stride_x   = 0;
-    int       pool_stride_y   = 0;
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
+    const int pool_pad_right               = pool_info.pad_stride_info.pad_right();
+    const int pool_pad_top                 = pool_info.pad_stride_info.pad_top();
+    const int pool_pad_left                = pool_info.pad_stride_info.pad_left();
+    const int pool_pad_bottom              = pool_info.pad_stride_info.pad_bottom();
+    int       pool_stride_x                = 0;
+    int       pool_stride_y                = 0;
     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
-    const int       src_w         = src->info()->dimension(0);
-    const int       src_h         = src->info()->dimension(1);
-    const int       upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
-    const int       upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-    const float16_t fp16_min      = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
-    const float16_t fill_value    = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
+    const int       src_w                  = src->info()->dimension(0);
+    const int       src_h                  = src->info()->dimension(1);
+    const int       upper_bound_w          = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+    const int       upper_bound_h          = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+    const float16_t fp16_min               = get_initial_min<half_float::half>(pool_info.use_inf_as_limit);
+    const float16_t fill_value             = (pool_info.pool_type == PoolingType::MAX) ? fp16_min : 0.0f;
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        float16_t res = 0.0f;
-
-        if(pool_info.pool_type != PoolingType::MAX)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            // Calculate scale
-            const float16_t scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                               pool_stride_y);
+            float16_t res = 0.0f;
 
-            // Perform pooling
-            for(int y = 0; y < pool_size_y; ++y)
+            if (pool_info.pool_type != PoolingType::MAX)
             {
-                for(int x = 0; x < pool_size_x; ++x)
+                // Calculate scale
+                const float16_t scale = calculate_avg_scale_pool2d(
+                    pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w,
+                    upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+                // Perform pooling
+                for (int y = 0; y < pool_size_y; ++y)
                 {
-                    const auto ptr = reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
-                                                                         + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
-
-                    const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
-                    const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
-                    float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
-
-                    if(pool_info.pool_type == PoolingType::L2)
+                    for (int x = 0; x < pool_size_x; ++x)
                     {
-                        data *= data;
+                        const auto ptr = reinterpret_cast<const float16_t *>(
+                            in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+                            (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+
+                        const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
+                        const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
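+                        // Taps falling in the padding region read fill_value (0 in this average/L2 branch)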
+                        float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+
+                        if (pool_info.pool_type == PoolingType::L2)
+                        {
+                            data *= data;
+                        }
+
+                        res += data;
                     }
-
-                    res += data;
                 }
+
+                // Scale the sum to produce the average
+                res *= scale;
             }
-
-            // Scale the sum to produce the average
-            res *= scale;
-        }
-        else // if max pooling
-        {
-            res = fp16_min;
-
-            for(int y = 0; y < pool_size_y; ++y)
+            else // if max pooling
             {
-                for(int x = 0; x < pool_size_x; ++x)
-                {
-                    const auto ptr = reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
-                                                                         + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+                res = fp16_min;
 
-                    const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
-                    const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
-                    float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
-                    res            = std::max(res, data);
+                for (int y = 0; y < pool_size_y; ++y)
+                {
+                    for (int x = 0; x < pool_size_x; ++x)
+                    {
+                        const auto ptr = reinterpret_cast<const float16_t *>(
+                            in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+                            (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+
+                        const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
+                        const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
+                        float16_t data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+                        res            = std::max(res, data);
+                    }
                 }
             }
-        }
 
-        // Calculate square-root in case of l2 pooling
-        if(pool_info.pool_type == PoolingType::L2)
-        {
-            res = std::sqrt(res);
-        }
+            // Calculate square-root in case of l2 pooling
+            if (pool_info.pool_type == PoolingType::L2)
+            {
+                res = std::sqrt(res);
+            }
 
-        // Store result
-        *(reinterpret_cast<float16_t *>(out.ptr())) = res;
-    },
-    in, out);
+            // Store result
+            *(reinterpret_cast<float16_t *>(out.ptr())) = res;
+        },
+        in, out);
 }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
 
-void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_fp32_neon_nchw(const ITensor    *src,
+                               ITensor          *dst0,
+                               ITensor          *dst1,
+                               PoolingLayerInfo &pool_info,
+                               const Window     &window_src,
+                               const Window     &window)
 {
     ARM_COMPUTE_UNUSED(dst1);
     Iterator in(src, window_src);
     Iterator out(dst0, window);
 
-    const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
-    const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
-    const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
-    const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
-    const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
-    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-    int       pool_stride_x   = 0;
-    int       pool_stride_y   = 0;
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
+    const int pool_pad_right               = pool_info.pad_stride_info.pad_right();
+    const int pool_pad_top                 = pool_info.pad_stride_info.pad_top();
+    const int pool_pad_left                = pool_info.pad_stride_info.pad_left();
+    const int pool_pad_bottom              = pool_info.pad_stride_info.pad_bottom();
+    int       pool_stride_x                = 0;
+    int       pool_stride_y                = 0;
     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
-    const int   src_w         = src->info()->dimension(0);
-    const int   src_h         = src->info()->dimension(1);
-    const int   upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
-    const int   upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-    const float min_value     = get_initial_min<float>(pool_info.use_inf_as_limit);
-    const float fill_value    = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+    const int   src_w                      = src->info()->dimension(0);
+    const int   src_h                      = src->info()->dimension(1);
+    const int   upper_bound_w              = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+    const int   upper_bound_h              = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+    const float min_value                  = get_initial_min<float>(pool_info.use_inf_as_limit);
+    const float fill_value                 = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        float res = 0.0f;
-
-        if(pool_info.pool_type != PoolingType::MAX)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            // Calculate scale
-            const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h,
-                                                           pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+            float res = 0.0f;
 
-            // Perform pooling
-            for(int y = 0; y < pool_size_y; ++y)
+            if (pool_info.pool_type != PoolingType::MAX)
             {
-                for(int x = 0; x < pool_size_x; ++x)
+                // Calculate scale
+                const float scale = calculate_avg_scale_pool2d(
+                    pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w,
+                    upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+                // Perform pooling
+                for (int y = 0; y < pool_size_y; ++y)
                 {
-                    const auto ptr = reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
-                                                                     + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
-
-                    const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
-                    const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
-                    float     data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
-
-                    if(pool_info.pool_type == PoolingType::L2)
+                    for (int x = 0; x < pool_size_x; ++x)
                     {
-                        data *= data;
+                        const auto ptr = reinterpret_cast<const float *>(
+                            in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+                            (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+
+                        const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
+                        const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
+                        float     data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+
+                        if (pool_info.pool_type == PoolingType::L2)
+                        {
+                            data *= data;
+                        }
+
+                        res += data;
                     }
-
-                    res += data;
                 }
+
+                // Scale the sum to produce the average
+                res *= scale;
             }
-
-            // Scale the sum to produce the average
-            res *= scale;
-        }
-        else // if max pooling
-        {
-            res = min_value;
-
-            for(int y = 0; y < pool_size_y; ++y)
+            else // if max pooling
             {
-                for(int x = 0; x < pool_size_x; ++x)
-                {
-                    const auto ptr = reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
-                                                                     + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+                res = min_value;
 
-                    const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
-                    const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
-                    float     data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
-                    res            = std::max(res, data);
+                for (int y = 0; y < pool_size_y; ++y)
+                {
+                    for (int x = 0; x < pool_size_x; ++x)
+                    {
+                        const auto ptr = reinterpret_cast<const float *>(
+                            in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x()) +
+                            (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y()));
+
+                        const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
+                        const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
+                        float     data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *ptr;
+                        res            = std::max(res, data);
+                    }
                 }
             }
-        }
 
-        // Calculate square-root in case of l2 pooling
-        if(pool_info.pool_type == PoolingType::L2)
-        {
-            res = std::sqrt(res);
-        }
+            // Calculate square-root in case of l2 pooling
+            if (pool_info.pool_type == PoolingType::L2)
+            {
+                res = std::sqrt(res);
+            }
 
-        // Store result
-        *(reinterpret_cast<float *>(out.ptr())) = res;
-    },
-    in, out);
+            // Store result
+            *(reinterpret_cast<float *>(out.ptr())) = res;
+        },
+        in, out);
 }
 
-void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_fp32_neon_nchw(const ITensor    *src,
+                             ITensor          *dst0,
+                             ITensor          *dst1,
+                             PoolingLayerInfo &pool_info,
+                             const Window     &window_src,
+                             const Window     &window)
 {
-    if(pool_info.pool_type == PoolingType::MAX && dst1)
+    if (pool_info.pool_type == PoolingType::MAX && dst1)
     {
         pooling2_nchw_maxpool_indices<float>(src, dst0, dst1, pool_info, window_src, window);
     }
@@ -499,64 +567,168 @@
     {
         Iterator      in(src, window_src);
         Iterator      out(dst0, window);
-        constexpr int pool_size       = 2;
-        const int     pool_pad_right  = pool_info.pad_stride_info.pad_right();
-        const int     pool_pad_top    = pool_info.pad_stride_info.pad_top();
-        const int     pool_pad_left   = pool_info.pad_stride_info.pad_left();
-        const int     pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-        int           pool_stride_x   = 0;
-        int           pool_stride_y   = 0;
+        constexpr int pool_size                = 2;
+        const int     pool_pad_right           = pool_info.pad_stride_info.pad_right();
+        const int     pool_pad_top             = pool_info.pad_stride_info.pad_top();
+        const int     pool_pad_left            = pool_info.pad_stride_info.pad_left();
+        const int     pool_pad_bottom          = pool_info.pad_stride_info.pad_bottom();
+        int           pool_stride_x            = 0;
+        int           pool_stride_y            = 0;
         std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
-        const int   src_w         = src->info()->dimension(0);
-        const int   src_h         = src->info()->dimension(1);
-        const int   upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
-        const int   upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-        const float min_value     = get_initial_min<float>(pool_info.use_inf_as_limit);
-        const float fill_value    = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+        const int   src_w                      = src->info()->dimension(0);
+        const int   src_h                      = src->info()->dimension(1);
+        const int   upper_bound_w              = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+        const int   upper_bound_h              = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+        const float min_value                  = get_initial_min<float>(pool_info.use_inf_as_limit);
+        const float fill_value                 = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
 
-        const uint8_t *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-        const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+        const uint8_t *const src_top_ptr =
+            src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+        const uint8_t *const src_bottom_ptr =
+            src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
 
-        execute_window_loop(window, [&](const Coordinates & id)
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const auto in_top_ptr    = reinterpret_cast<const float *>(src_top_ptr + in.offset());
+                const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
+
+                const auto x_val      = id.x() * pool_stride_x;
+                const auto y_val_0    = id.y() * pool_stride_y;
+                const auto y_val_1    = (id.y() * pool_stride_y) + 1;
+                auto       top_data   = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0,
+                                                              in_top_ptr, fill_value);
+                auto bottom_data      = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1,
+                                                              in_bottom_ptr, fill_value);
+                float32x2_t res       = {};
+                float       final_res = 0;
+
+                // Get power of 2 in case of l2 pooling
+                if (pool_info.pool_type == PoolingType::L2)
+                {
+                    top_data    = vmul_f32(top_data, top_data);
+                    bottom_data = vmul_f32(bottom_data, bottom_data);
+                }
+
+                if (pool_info.pool_type != PoolingType::MAX)
+                {
+                    // Calculate scale
+                    float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size,
+                                                             pool_size, upper_bound_w, upper_bound_h, pool_pad_left,
+                                                             pool_pad_top, pool_stride_x, pool_stride_y);
+                    const float32x2_t scale_v = vdup_n_f32(scale);
+
+                    // Perform pooling
+                    const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
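+                    // vpadd_f32 adds the two columns horizontally; multiplying by scale_v turns the sum into the average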
+                    res                        = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
+                }
+                else
+                {
+                    const float32x2_t max_data = vmax_f32(top_data, bottom_data);
+                    res                        = vpmax_f32(max_data, max_data);
+                }
+                final_res = vget_lane_f32(res, 0);
+
+                // Calculate square-root in case of l2 pooling
+                if (pool_info.pool_type == PoolingType::L2)
+                {
+                    final_res = sqrt(final_res);
+                }
+
+                // Store result
+                *(reinterpret_cast<float *>(out.ptr())) = final_res;
+            },
+            in, out);
+    }
+}
+
+void pooling3_fp32_neon_nchw(const ITensor    *src,
+                             ITensor          *dst0,
+                             ITensor          *dst1,
+                             PoolingLayerInfo &pool_info,
+                             const Window     &window_src,
+                             const Window     &window)
+{
+    ARM_COMPUTE_UNUSED(dst1);
+    Iterator in(src, window_src);
+    Iterator out(dst0, window);
+
+    constexpr const int pool_size          = 3;
+    const int           pool_pad_right     = pool_info.pad_stride_info.pad_right();
+    const int           pool_pad_top       = pool_info.pad_stride_info.pad_top();
+    const int           pool_pad_left      = pool_info.pad_stride_info.pad_left();
+    const int           pool_pad_bottom    = pool_info.pad_stride_info.pad_bottom();
+    int                 pool_stride_x      = 0;
+    int                 pool_stride_y      = 0;
+    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
+    const int   src_w                      = src->info()->dimension(0);
+    const int   src_h                      = src->info()->dimension(1);
+    const int   upper_bound_w              = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+    const int   upper_bound_h              = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+    const float min_value                  = get_initial_min<float>(pool_info.use_inf_as_limit);
+    const float fill_value                 = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+
+    const uint8_t *const src_top_ptr =
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
+    const uint8_t *const src_middle_ptr =
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
+    const uint8_t *const src_bottom_ptr =
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
+
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
             const auto in_top_ptr    = reinterpret_cast<const float *>(src_top_ptr + in.offset());
+            const auto in_middle_ptr = reinterpret_cast<const float *>(src_middle_ptr + in.offset());
             const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
 
-            const auto  x_val       = id.x() * pool_stride_x;
-            const auto  y_val_0     = id.y() * pool_stride_y;
-            const auto  y_val_1     = (id.y() * pool_stride_y) + 1;
-            auto        top_data    = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value);
-            auto        bottom_data = READ_2_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_bottom_ptr, fill_value);
-            float32x2_t res         = {};
-            float       final_res   = 0;
+            const auto x_val   = id.x() * pool_stride_x;
+            const auto y_val_0 = id.y() * pool_stride_y;
+            const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+            const auto y_val_2 = (id.y() * pool_stride_y) + 2;
+            auto top_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr,
+                                                  fill_value);
+            auto middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1,
+                                                     in_middle_ptr, fill_value);
+            auto bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2,
+                                                     in_bottom_ptr, fill_value);
+
+            float32x2_t res       = {};
+            float       final_res = 0;
 
             // Get power of 2 in case of l2 pooling
-            if(pool_info.pool_type == PoolingType::L2)
+            if (pool_info.pool_type == PoolingType::L2)
             {
-                top_data    = vmul_f32(top_data, top_data);
-                bottom_data = vmul_f32(bottom_data, bottom_data);
+                top_data    = vmulq_f32(top_data, top_data);
+                middle_data = vmulq_f32(middle_data, middle_data);
+                bottom_data = vmulq_f32(bottom_data, bottom_data);
             }
 
-            if(pool_info.pool_type != PoolingType::MAX)
+            if (pool_info.pool_type != PoolingType::MAX)
             {
                 // Calculate scale
-                float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                         pool_stride_y);
+                float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size,
+                                                         pool_size, upper_bound_w, upper_bound_h, pool_pad_left,
+                                                         pool_pad_top, pool_stride_x, pool_stride_y);
                 const float32x2_t scale_v = vdup_n_f32(scale);
 
                 // Perform pooling
-                const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
-                res                        = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
+                const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
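+                // Zero the unused lane 3, then pairwise-add the high and low halves to collapse the three valid columns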
+                res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
+                res = vmul_f32(vpadd_f32(res, res), scale_v);
             }
             else
             {
-                const float32x2_t max_data = vmax_f32(top_data, bottom_data);
-                res                        = vpmax_f32(max_data, max_data);
+                const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
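+                // Fill the unused lane 3 with min_value so it cannot survive the pairwise max reduction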
+                res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data));
+                res = vpmax_f32(res, res);
             }
             final_res = vget_lane_f32(res, 0);
 
             // Calculate square-root in case of l2 pooling
-            if(pool_info.pool_type == PoolingType::L2)
+            if (pool_info.pool_type == PoolingType::L2)
             {
                 final_res = sqrt(final_res);
             }
@@ -565,191 +737,120 @@
             *(reinterpret_cast<float *>(out.ptr())) = final_res;
         },
         in, out);
-    }
 }
 
-void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling7_fp32_neon_nchw(const ITensor    *src,
+                             ITensor          *dst0,
+                             ITensor          *dst1,
+                             PoolingLayerInfo &pool_info,
+                             const Window     &window_src,
+                             const Window     &window)
 {
     ARM_COMPUTE_UNUSED(dst1);
     Iterator in(src, window_src);
     Iterator out(dst0, window);
 
-    constexpr const int pool_size       = 3;
-    const int           pool_pad_right  = pool_info.pad_stride_info.pad_right();
-    const int           pool_pad_top    = pool_info.pad_stride_info.pad_top();
-    const int           pool_pad_left   = pool_info.pad_stride_info.pad_left();
-    const int           pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-    int                 pool_stride_x   = 0;
-    int                 pool_stride_y   = 0;
+    constexpr const int pool_size          = 7;
+    const int           pool_pad_right     = pool_info.pad_stride_info.pad_right();
+    const int           pool_pad_top       = pool_info.pad_stride_info.pad_top();
+    const int           pool_pad_left      = pool_info.pad_stride_info.pad_left();
+    const int           pool_pad_bottom    = pool_info.pad_stride_info.pad_bottom();
+    int                 pool_stride_x      = 0;
+    int                 pool_stride_y      = 0;
     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
-    const int   src_w         = src->info()->dimension(0);
-    const int   src_h         = src->info()->dimension(1);
-    const int   upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
-    const int   upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-    const float min_value     = get_initial_min<float>(pool_info.use_inf_as_limit);
-    const float fill_value    = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
+    const int   src_w                      = src->info()->dimension(0);
+    const int   src_h                      = src->info()->dimension(1);
+    const int   upper_bound_w              = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
+    const int   upper_bound_h              = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+    const float min_value                  = get_initial_min<float>(pool_info.use_inf_as_limit);
+    const float fill_value                 = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
 
-    const uint8_t *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
-    const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
-    const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
-
-    execute_window_loop(window, [&](const Coordinates & id)
+    std::array<const uint8_t *, pool_size> src_ptrs{{}};
+    for (int i = 0; i < pool_size; ++i)
     {
-        const auto in_top_ptr    = reinterpret_cast<const float *>(src_top_ptr + in.offset());
-        const auto in_middle_ptr = reinterpret_cast<const float *>(src_middle_ptr + in.offset());
-        const auto in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
-
-        const auto x_val       = id.x() * pool_stride_x;
-        const auto y_val_0     = id.y() * pool_stride_y;
-        const auto y_val_1     = (id.y() * pool_stride_y) + 1;
-        const auto y_val_2     = (id.y() * pool_stride_y) + 2;
-        auto       top_data    = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_0, in_top_ptr, fill_value);
-        auto       middle_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_1, in_middle_ptr, fill_value);
-        auto       bottom_data = READ_4_BOUNDARY_AWARE(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val_2, in_bottom_ptr, fill_value);
-
-        float32x2_t res       = {};
-        float       final_res = 0;
-
-        // Get power of 2 in case of l2 pooling
-        if(pool_info.pool_type == PoolingType::L2)
-        {
-            top_data    = vmulq_f32(top_data, top_data);
-            middle_data = vmulq_f32(middle_data, middle_data);
-            bottom_data = vmulq_f32(bottom_data, bottom_data);
-        }
-
-        if(pool_info.pool_type != PoolingType::MAX)
-        {
-            // Calculate scale
-            float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                     pool_stride_y);
-            const float32x2_t scale_v = vdup_n_f32(scale);
-
-            // Perform pooling
-            const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
-            res                        = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
-            res                        = vmul_f32(vpadd_f32(res, res), scale_v);
-        }
-        else
-        {
-            const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
-            res                        = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, max_data, 3)), vget_low_f32(max_data));
-            res                        = vpmax_f32(res, res);
-        }
-        final_res = vget_lane_f32(res, 0);
-
-        // Calculate square-root in case of l2 pooling
-        if(pool_info.pool_type == PoolingType::L2)
-        {
-            final_res = sqrt(final_res);
-        }
-
-        // Store result
-        *(reinterpret_cast<float *>(out.ptr())) = final_res;
-    },
-    in, out);
-}
-
-void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
-{
-    ARM_COMPUTE_UNUSED(dst1);
-    Iterator in(src, window_src);
-    Iterator out(dst0, window);
-
-    constexpr const int pool_size       = 7;
-    const int           pool_pad_right  = pool_info.pad_stride_info.pad_right();
-    const int           pool_pad_top    = pool_info.pad_stride_info.pad_top();
-    const int           pool_pad_left   = pool_info.pad_stride_info.pad_left();
-    const int           pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-    int                 pool_stride_x   = 0;
-    int                 pool_stride_y   = 0;
-    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
-    const int   src_w         = src->info()->dimension(0);
-    const int   src_h         = src->info()->dimension(1);
-    const int   upper_bound_w = src_w + (pool_info.exclude_padding ? 0 : pool_pad_right);
-    const int   upper_bound_h = src_h + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-    const float min_value     = get_initial_min<float>(pool_info.use_inf_as_limit);
-    const float fill_value    = (pool_info.pool_type == PoolingType::MAX) ? min_value : 0.0f;
-
-    std::array<const uint8_t *, pool_size> src_ptrs{ {} };
-    for(int i = 0; i < pool_size; ++i)
-    {
-        src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
+        src_ptrs[i] =
+            src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
     }
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        auto in_ptr = reinterpret_cast<const float *>(src_ptrs[0] + in.offset());
-
-        auto          x_val = id.x() * pool_stride_x;
-        auto          y_val = id.y() * pool_stride_y;
-        float32x4x2_t data  = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
-
-        float32x2_t res       = {};
-        float       final_res = 0.f;
-
-        if(pool_info.pool_type != PoolingType::MAX)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            // Calculate scale
-            float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                     pool_stride_y);
-            const float32x2_t scale_v = vdup_n_f32(scale);
+            auto in_ptr = reinterpret_cast<const float *>(src_ptrs[0] + in.offset());
 
-            // Get power of 2 in case of l2 pooling
-            if(pool_info.pool_type == PoolingType::L2)
-            {
-                data.val[0] = vmulq_f32(data.val[0], data.val[0]);
-                data.val[1] = vmulq_f32(data.val[1], data.val[1]);
-            }
-            float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
-            for(int i = 1; i < pool_size; ++i)
-            {
-                in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
+            auto          x_val = id.x() * pool_stride_x;
+            auto          y_val = id.y() * pool_stride_y;
+            float32x4x2_t data =
+                read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
 
-                x_val = id.x() * pool_stride_x;
-                y_val = (id.y() * pool_stride_y) + i;
-                data  = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
+            float32x2_t res       = {};
+            float       final_res = 0.f;
+
+            if (pool_info.pool_type != PoolingType::MAX)
+            {
+                // Calculate scale
+                float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size,
+                                                         pool_size, upper_bound_w, upper_bound_h, pool_pad_left,
+                                                         pool_pad_top, pool_stride_x, pool_stride_y);
+                const float32x2_t scale_v = vdup_n_f32(scale);
+
                 // Get power of 2 in case of l2 pooling
-                if(pool_info.pool_type == PoolingType::L2)
+                if (pool_info.pool_type == PoolingType::L2)
                 {
                     data.val[0] = vmulq_f32(data.val[0], data.val[0]);
                     data.val[1] = vmulq_f32(data.val[1], data.val[1]);
                 }
-                sum_data = vaddq_f32(sum_data, data.val[0]);
-                sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
+                float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
+                for (int i = 1; i < pool_size; ++i)
+                {
+                    in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
+
+                    x_val = id.x() * pool_stride_x;
+                    y_val = (id.y() * pool_stride_y) + i;
+                    data  = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr,
+                                                  fill_value);
+                    // Get power of 2 in case of l2 pooling
+                    if (pool_info.pool_type == PoolingType::L2)
+                    {
+                        data.val[0] = vmulq_f32(data.val[0], data.val[0]);
+                        data.val[1] = vmulq_f32(data.val[1], data.val[1]);
+                    }
+                    sum_data = vaddq_f32(sum_data, data.val[0]);
+                    sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
+                }
+                res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
+                res = vmul_f32(vpadd_f32(res, res), scale_v);
             }
-            res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
-            res = vmul_f32(vpadd_f32(res, res), scale_v);
-        }
-        else
-        {
-            for(int i = 1; i < pool_size; ++i)
+            else
             {
-                in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
+                for (int i = 1; i < pool_size; ++i)
+                {
+                    in_ptr = reinterpret_cast<const float *>(src_ptrs[i] + in.offset());
 
-                x_val              = id.x() * pool_stride_x;
-                y_val              = (id.y() * pool_stride_y) + i;
-                float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val, in_ptr, fill_value);
-                data               = vmax2q_f32(data, temp);
+                    x_val              = id.x() * pool_stride_x;
+                    y_val              = (id.y() * pool_stride_y) + i;
+                    float32x4x2_t temp = read_8_boundary_aware(src_h, src_w, pool_pad_left, pool_pad_top, x_val, y_val,
+                                                               in_ptr, fill_value);
+                    data               = vmax2q_f32(data, temp);
+                }
+                res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1]));
+                res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0])));
+                res = vpmax_f32(res, res);
             }
-            res = vpmax_f32(vget_high_f32(vsetq_lane_f32(min_value, data.val[1], 3)), vget_low_f32(data.val[1]));
-            res = vpmax_f32(res, vpmax_f32(vget_high_f32(data.val[0]), vget_low_f32(data.val[0])));
-            res = vpmax_f32(res, res);
-        }
-        final_res = vget_lane_f32(res, 0);
+            final_res = vget_lane_f32(res, 0);
 
-        // Calculate square-root in case of l2 pooling
-        if(pool_info.pool_type == PoolingType::L2)
-        {
-            final_res = sqrt(final_res);
-        }
+            // Calculate square-root in case of l2 pooling
+            if (pool_info.pool_type == PoolingType::L2)
+            {
+                final_res = sqrt(final_res);
+            }
 
-        // Store result
-        *(reinterpret_cast<float *>(out.ptr())) = final_res;
-    },
-    in, out);
+            // Store result
+            *(reinterpret_cast<float *>(out.ptr())) = final_res;
+        },
+        in, out);
 }
 } // namespace cpu
 } // namespace arm_compute
 
-#endif // ENABLE_NCHW_KERNELS
\ No newline at end of file
+#endif // ENABLE_NCHW_KERNELS
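Note on the hunk above: pooling7_fp32_neon_nchw relies on read_8_boundary_aware to substitute a fill value for lanes that fall in the padded border. A minimal scalar sketch of that idea, assuming the same semantics as the load16_boundary_aware helper in quantized.h further down; the function name here is illustrative, not the library's:

#include <array>

// Scalar analogue of the *_boundary_aware loads: out-of-bounds lanes are
// replaced by fill_value (lowest representable value for MAX pooling, 0 for
// AVG/L2), so padding never wins a max and never biases a sum.
std::array<float, 8> load8_boundary_aware_sketch(int src_w, int src_h,
                                                 int pad_l, int pad_t,
                                                 int x, int y,
                                                 const float *row_ptr,
                                                 float fill_value)
{
    std::array<float, 8> out{};
    const bool row_in_bounds = (y >= pad_t) && (y < src_h + pad_t);
    for (int i = 0; i < 8; ++i)
    {
        const bool col_in_bounds = (x + i >= pad_l) && (x + i < src_w + pad_l);
        out[i] = (row_in_bounds && col_in_bounds) ? row_ptr[i] : fill_value;
    }
    return out;
}
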
diff --git a/src/cpu/kernels/pool2d/neon/qasymm8.cpp b/src/cpu/kernels/pool2d/neon/qasymm8.cpp
index 7f8841e..44675b5 100644
--- a/src/cpu/kernels/pool2d/neon/qasymm8.cpp
+++ b/src/cpu/kernels/pool2d/neon/qasymm8.cpp
@@ -25,17 +25,23 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 #include "src/cpu/kernels/pool2d/neon/list.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void poolingMxN_qasymm8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_qasymm8_neon_nhwc(const ITensor    *src,
+                                  ITensor          *dst0,
+                                  ITensor          *dst1,
+                                  PoolingLayerInfo &pool_info,
+                                  const Window     &window_src,
+                                  const Window     &window)
 {
     poolingMxN_q8_neon_nhwc<uint8_t>(src, dst0, dst1, pool_info, window_src, window);
 }
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
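The averaging paths in these kernels (calculate_avg_scale_pool2d above, scale_vector_q16x8 below) all apply the same exclude_padding rule. A scalar sketch of that rule, assuming upper_bound_w/h already include the right/bottom padding when padding is counted, with start_x = x * stride_x - pad_left as in the kernels; the name is illustrative:

#include <algorithm>

// Average-pool divisor: the window is clipped to the tensor's upper bounds
// and, when exclude_padding is set, its start is clamped to 0 so padded
// elements do not count towards the average.
float avg_scale_sketch(bool exclude_padding, int start_x, int start_y,
                       int pool_w, int pool_h,
                       int upper_bound_w, int upper_bound_h)
{
    const int end_x = std::min(start_x + pool_w, upper_bound_w);
    const int end_y = std::min(start_y + pool_h, upper_bound_h);
    if (exclude_padding)
    {
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    return 1.f / static_cast<float>((end_y - start_y) * (end_x - start_x));
}
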
diff --git a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
index 8643651..d434323 100644
--- a/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/pool2d/neon/qasymm8_signed.cpp
@@ -25,17 +25,23 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 #include "src/cpu/kernels/pool2d/neon/list.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_qasymm8_signed_neon_nhwc(const ITensor    *src,
+                                         ITensor          *dst0,
+                                         ITensor          *dst1,
+                                         PoolingLayerInfo &pool_info,
+                                         const Window     &window_src,
+                                         const Window     &window)
 {
     poolingMxN_q8_neon_nhwc<int8_t>(src, dst0, dst1, pool_info, window_src, window);
 }
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
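The quantized.h hunks that follow fold the src-to-dst requantization into a single (scale, offset) pair. A sketch of that arithmetic, mirroring the requant_scale/requant_offset expressions in the diff; the struct and function names here are illustrative:

#include <cstdint>

struct QParams
{
    float   scale;
    int32_t offset;
};

// Single-step requantization: from real = s * (q - o) for src and dst,
// q_dst = q_src / requant_scale + requant_offset. Folding the offset in one
// step avoids the intermediate rounding the surrounding comments mention.
QParams make_requant_params(QParams src, QParams dst)
{
    const float   requant_scale  = dst.scale / src.scale;
    const int32_t requant_offset =
        dst.offset - static_cast<int32_t>(static_cast<float>(src.offset) / requant_scale);
    return {requant_scale, requant_offset};
}

The average paths additionally add 0.5f before the final cast (e.g. static_cast<T>(0.5f + res * scale)), so the conversion rounds to nearest rather than truncating towards zero.
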
diff --git a/src/cpu/kernels/pool2d/neon/quantized.h b/src/cpu/kernels/pool2d/neon/quantized.h
index a2cd399..38f1b2f 100644
--- a/src/cpu/kernels/pool2d/neon/quantized.h
+++ b/src/cpu/kernels/pool2d/neon/quantized.h
@@ -26,11 +26,13 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+
+#include "src/core/helpers/PoolingHelpers.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/PoolingHelpers.h"
+
 #include <arm_neon.h>
 
 namespace arm_compute
@@ -38,7 +40,12 @@
 namespace cpu
 {
 template <typename T>
-void poolingMxN_q8_neon_nhwc(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_q8_neon_nhwc(const ITensor    *src,
+                             ITensor          *dst0,
+                             ITensor          *dst1,
+                             PoolingLayerInfo &pool_info,
+                             const Window     &window_src,
+                             const Window     &window)
 {
     ARM_COMPUTE_UNUSED(dst1);
 
@@ -60,15 +67,15 @@
     using q32_t   = typename wrapper::traits::promote_t<q16_t>;
     using q32x4_t = typename wrapper::traits::neon_vector<q32_t, 4>::type;
 
-    const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
-    const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
     const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
     const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
     const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
     const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
 
-    int pool_stride_x = 0;
-    int pool_stride_y = 0;
+    int pool_stride_x                      = 0;
+    int pool_stride_y                      = 0;
     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
     const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
     const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
@@ -80,233 +87,267 @@
     const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
     // "new_offset" doesn't have to consider the "half_scale_v" in its computation
     // With a requantization performed in a single step there won't be uncertainties introduced
-    const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
+    const int32_t new_offset =
+        dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
 
-    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
-    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
-    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
+    const float   requant_scale = dst_qinfo.scale / src_qinfo.scale;
+    const int32_t requant_offset =
+        dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+    const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
 
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        const int idx_width    = id.y() * pool_stride_x;
-        const int idx_height   = id.z() * pool_stride_y;
-        const int pool_limit_y = pool_pad_top - idx_height;
-        const int pool_limit_x = pool_pad_left - idx_width;
-
-        const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
-        const int pool_end_y   = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
-        const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
-        const int pool_end_x   = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
-
-        int x_off = window_start_x;
-        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
         {
-            if(pool_info.pool_type != PoolingType::MAX)
+            const int idx_width    = id.y() * pool_stride_x;
+            const int idx_height   = id.z() * pool_stride_y;
+            const int pool_limit_y = pool_pad_top - idx_height;
+            const int pool_limit_x = pool_pad_left - idx_width;
+
+            const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
+            const int pool_end_y   = std::min(pool_size_y, window_src.z().end() + pool_limit_y);
+            const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
+            const int pool_end_x   = std::min(pool_size_x, window_src.y().end() + pool_limit_x);
+
+            int x_off = window_start_x;
+            for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
             {
-                q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
-                q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
-                q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
-                q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
-
-                // Calculate scale
-                const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                               pool_stride_y);
-
-                // Perform pooling
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                if (pool_info.pool_type != PoolingType::MAX)
                 {
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
-                    {
-                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                         (src->info()->strides_in_bytes().z())) + x_off);
+                    q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+                    q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+                    q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+                    q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
 
-                        const q16x8_t data_q16  = wrapper::vmovl(wrapper::vgetlow(data));
-                        const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
-                        vres1                   = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
-                        vres2                   = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
-                        vres3                   = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
-                        vres4                   = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
-                    }
-                }
+                    // Calculate scale
+                    const float scale = calculate_avg_scale_pool2d(
+                        pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+                        upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
 
-                if(src_qinfo != dst_qinfo)
-                {
-                    const float32x4x4_t vres =
+                    // Perform pooling
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
                     {
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
                         {
+                            const q8x16_t data = wrapper::vloadq(
+                                reinterpret_cast<const T *>(
+                                    in.ptr() +
+                                    (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                                    (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+                                x_off);
+
+                            const q16x8_t data_q16  = wrapper::vmovl(wrapper::vgetlow(data));
+                            const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
+                            vres1                   = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
+                            vres2                   = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
+                            vres3                   = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
+                            vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+                        }
+                    }
+
+                    if (src_qinfo != dst_qinfo)
+                    {
+                        const float32x4x4_t vres = {{
                             vcvtq_f32_q32(vres1),
                             vcvtq_f32_q32(vres2),
                             vcvtq_f32_q32(vres3),
                             vcvtq_f32_q32(vres4),
+                        }};
+                        const auto          requantized_dst =
+                            vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+                        // Store result
+                        wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
+                        wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8,
+                                        wrapper::vgethigh(requantized_dst));
+                    }
+                    else
+                    {
+                        const float32x4_t scale_v = vdupq_n_f32(scale);
+                        // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+                        vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
+                        vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
+                        vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
+                        vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
+
+                        const q8x8_t res1 =
+                            wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
+                        const q8x8_t res2 =
+                            wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
+                        // Store result
+                        wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
+                        wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
+                    }
+                }
+                else
+                {
+                    q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
+
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const q8x16_t data = wrapper::vloadq(
+                                reinterpret_cast<const T *>(
+                                    in.ptr() +
+                                    (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                                    (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+                                x_off);
+                            vres = wrapper::vmax(vres, data);
                         }
-                    };
-                    const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+                    }
+
                     // Store result
-                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
-                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
+                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+                                    (src_qinfo != dst_qinfo)
+                                        ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres),
+                                                                               wrapper::vgethigh(vres), requant_qinfo)
+                                        : vres);
+                }
+            }
+
+            if (pool_info.pool_type == PoolingType::MAX)
+            {
+                for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
+                {
+                    q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const q8x8_t data = wrapper::vload(
+                                reinterpret_cast<const T *>(
+                                    in.ptr() +
+                                    (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                                    (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+                                x_off);
+                            vres = wrapper::vmax(vres, data);
+                        }
+                    }
+
+                    // Store result
+                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+                                    (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
+                }
+            }
+
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
+            {
+                if (pool_info.pool_type != PoolingType::MAX)
+                {
+                    q32_t res = static_cast<q32_t>(0.f);
+
+                    // Calculate scale
+                    const float scale = calculate_avg_scale_pool2d(
+                        pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w,
+                        upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+                    // Perform pooling
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const T data =
+                                *(reinterpret_cast<const T *>(
+                                      in.ptr() +
+                                      (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                                      (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+                                  x_off);
+                            res += data;
+                        }
+                    }
+
+                    if (src_qinfo != dst_qinfo)
+                    {
+                        const float res_f          = static_cast<float>(res);
+                        const float new_scale      = quant_rescale / scale;
+                        const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+
+                        // Store result
+                        *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
+                    }
+                    else
+                    {
+                        // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
+                        res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
+
+                        // Store result
+                        *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+                    }
                 }
                 else
                 {
-                    const float32x4_t scale_v = vdupq_n_f32(scale);
-                    // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
-                    vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
-                    vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
-                    vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
-                    vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
+                    T res = std::numeric_limits<T>::min();
 
-                    const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
-                    const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
-                    // Store result
-                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
-                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
-                }
-            }
-            else
-            {
-                q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
-
-                for(int y = pool_start_y; y < pool_end_y; ++y)
-                {
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
                     {
-                        const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                         (src->info()->strides_in_bytes().z())) + x_off);
-                        vres               = wrapper::vmax(vres, data);
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const T data =
+                                *(reinterpret_cast<const T *>(
+                                      in.ptr() +
+                                      (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) +
+                                      (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().z())) +
+                                  x_off);
+                            res = std::max(res, data);
+                        }
                     }
-                }
-
-                // Store result
-                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
-                                requant_qinfo) :
-                                vres);
-            }
-        }
-
-        if(pool_info.pool_type == PoolingType::MAX)
-        {
-            for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
-            {
-                q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
-                for(int y = pool_start_y; y < pool_end_y; ++y)
-                {
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
-                    {
-                        const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                                       (src->info()->strides_in_bytes().z())) + x_off);
-                        vres              = wrapper::vmax(vres, data);
-                    }
-                }
-
-                // Store result
-                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
-                                (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
-            }
-        }
-
-        // Left-overs loop
-        for(; x_off < window_end_x; ++x_off)
-        {
-            if(pool_info.pool_type != PoolingType::MAX)
-            {
-                q32_t res = static_cast<q32_t>(0.f);
-
-                // Calculate scale
-                const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NHWC, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                               pool_stride_y);
-
-                // Perform pooling
-                for(int y = pool_start_y; y < pool_end_y; ++y)
-                {
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
-                    {
-                        const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                     (src->info()->strides_in_bytes().z())) + x_off);
-                        res += data;
-                    }
-                }
-
-                if(src_qinfo != dst_qinfo)
-                {
-                    const float res_f           = static_cast<float>(res);
-                    const float new_scale       = quant_rescale / scale;
-                    const auto  requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
 
                     // Store result
-                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
-                }
-                else
-                {
-                    // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
-                    res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
-
-                    // Store result
-                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
-                }
-            }
-            else
-            {
-                T res = std::numeric_limits<T>::min();
-
-                for(int y = pool_start_y; y < pool_end_y; ++y)
-                {
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    if (src_qinfo != dst_qinfo)
                     {
-                        const T data = *(reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().y()) + (y - pool_pad_top) * static_cast<int>
-                                                                     (src->info()->strides_in_bytes().z())) + x_off);
-                        res          = std::max(res, data);
+                        const float res_f                           = static_cast<float>(res);
+                        *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
+                    }
+                    else
+                    {
+                        *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
                     }
                 }
-
-                // Store result
-                if(src_qinfo != dst_qinfo)
-                {
-                    const float res_f                           = static_cast<float>(res);
-                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
-                }
-                else
-                {
-                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
-                }
             }
-        }
-
-    },
-    in, out);
+        },
+        in, out);
 }
 
 #if defined(ENABLE_NCHW_KERNELS)
 template <typename T, typename TVec>
-inline void scale_vector_q16x8(bool exclude_padding, TVec &v, const Coordinates &id, int id_offset, int step,
-                               const int pool_size, const int upper_bound_w, const int upper_bound_h,
-                               const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+inline void scale_vector_q16x8(bool               exclude_padding,
+                               TVec              &v,
+                               const Coordinates &id,
+                               int                id_offset,
+                               int                step,
+                               const int          pool_size,
+                               const int          upper_bound_w,
+                               const int          upper_bound_h,
+                               const int          pad_x,
+                               const int          pad_y,
+                               const int          stride_x,
+                               const int          stride_y)
 {
     int       start_x = (id.x() + id_offset) * stride_x - pad_x;
     int       start_y = id.y() * stride_y - pad_y;
     const int end_y   = std::min(start_y + pool_size, upper_bound_h);
-    if(exclude_padding)
+    if (exclude_padding)
     {
         start_y = std::max(0, start_y);
     }
 
-    std::array<T, 8> elems =
-    {
-        {
-            wrapper::vgetlane(v, 0),
-            wrapper::vgetlane(v, 1),
-            wrapper::vgetlane(v, 2),
-            wrapper::vgetlane(v, 3),
-            wrapper::vgetlane(v, 4),
-            wrapper::vgetlane(v, 5),
-            wrapper::vgetlane(v, 6),
-            wrapper::vgetlane(v, 7),
-        }
-    };
+    std::array<T, 8> elems = {{
+        wrapper::vgetlane(v, 0),
+        wrapper::vgetlane(v, 1),
+        wrapper::vgetlane(v, 2),
+        wrapper::vgetlane(v, 3),
+        wrapper::vgetlane(v, 4),
+        wrapper::vgetlane(v, 5),
+        wrapper::vgetlane(v, 6),
+        wrapper::vgetlane(v, 7),
+    }};
 
-    for(auto &el : elems)
+    for (auto &el : elems)
     {
         int       c_start_x = start_x;
         const int end_x     = std::min(c_start_x + pool_size, upper_bound_w);
-        if(exclude_padding)
+        if (exclude_padding)
         {
             c_start_x = std::max(0, c_start_x);
         }
@@ -326,15 +367,16 @@
 }
 
 template <typename T>
-auto load16_boundary_aware(int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval)
+auto load16_boundary_aware(
+    int srcw, int srch, int pad_l, int pad_r, int pad_t, int pad_b, int x, int y, const T *ptr, T fval)
 {
     ARM_COMPUTE_UNUSED(pad_b, pad_r);
     T vec[16];
     //handle reading a row out of the tensor
     const bool row_in_bounds((y >= pad_t) && (y < (srch + pad_t)));
-    for(int i = 0; i < 16; i++)
+    for (int i = 0; i < 16; i++)
     {
-        if(row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
+        if (row_in_bounds && (x + i >= pad_l) && (x + i < (srcw + pad_l)))
         {
             vec[i] = *(ptr + i);
         }
@@ -349,24 +391,24 @@
 template <typename T, typename V, bool deinterleave>
 inline void write16_boundary_aware(int x, int dst_w, const V &lower, const V &upper, T *ptr)
 {
-    if(deinterleave)
+    if (deinterleave)
     {
-        for(int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i)
+        for (int i = 0; i < 8 && (i * 2 + x) < dst_w; ++i)
         {
             *(ptr + i * 2) = lower[i];
         }
-        for(int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i)
+        for (int i = 0; i < 8 && (i * 2 + x + 1) < dst_w; ++i)
         {
             *(ptr + 1 + i * 2) = upper[i];
         }
     }
     else
     {
-        for(int i = 0; i < 8 && (i + x) < dst_w; ++i)
+        for (int i = 0; i < 8 && (i + x) < dst_w; ++i)
         {
             *(ptr + i) = lower[i];
         }
-        for(int i = 0; i < 8 && (i + x + 8) < dst_w; ++i)
+        for (int i = 0; i < 8 && (i + x + 8) < dst_w; ++i)
         {
             *(ptr + i + 8) = upper[i];
         }
@@ -376,14 +418,19 @@
 template <typename T, typename V>
 inline void write8_boundary_aware(int x, int dst_w, const V &v, T *ptr)
 {
-    for(int i = 0; i < 8 && (i + x) < dst_w; ++i)
+    for (int i = 0; i < 8 && (i + x) < dst_w; ++i)
     {
         *(ptr + i) = v[i];
     }
 }
 
 template <typename T>
-void pooling2_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling2_quantized_neon_nchw(const ITensor    *src,
+                                  ITensor          *dst0,
+                                  ITensor          *dst1,
+                                  PoolingLayerInfo &pool_info,
+                                  const Window     &window_src,
+                                  const Window     &window)
 {
     ARM_COMPUTE_UNUSED(dst1);
     Iterator in(src, window_src);
@@ -397,129 +444,136 @@
     using q16x8_t   = typename wrapper::traits::neon_vector<q16_t, 8>::type;
     using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
 
-    constexpr int pool_size       = 2;
-    int           pool_stride_x   = 0;
-    int           pool_stride_y   = 0;
-    const int     pool_pad_right  = pool_info.pad_stride_info.pad_right();
-    const int     pool_pad_top    = pool_info.pad_stride_info.pad_top();
-    const int     pool_pad_left   = pool_info.pad_stride_info.pad_left();
-    const int     pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
+    constexpr int pool_size                = 2;
+    int           pool_stride_x            = 0;
+    int           pool_stride_y            = 0;
+    const int     pool_pad_right           = pool_info.pad_stride_info.pad_right();
+    const int     pool_pad_top             = pool_info.pad_stride_info.pad_top();
+    const int     pool_pad_left            = pool_info.pad_stride_info.pad_left();
+    const int     pool_pad_bottom          = pool_info.pad_stride_info.pad_bottom();
     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
-    const int                     upper_bound_w        = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
-    const int                     upper_bound_h        = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-    const T *const                src_top_ptr          = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
-    const T *const                src_bottom_ptr       = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
+    const int      upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+    const int      upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+    const T *const src_top_ptr   = reinterpret_cast<const T *>(
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
+    const T *const src_bottom_ptr = reinterpret_cast<const T *>(
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
     const int                     scale_step_x         = (pool_stride_x == 1) ? 2 : 1;
     const UniformQuantizationInfo src_qinfo            = src->info()->quantization_info().uniform();
     const UniformQuantizationInfo dst_qinfo            = dst0->info()->quantization_info().uniform();
     const bool                    have_different_qinfo = src_qinfo != dst_qinfo;
 
-    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
-    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
-    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
-    const int                     src_w          = src->info()->dimension(0);
-    const int                     src_h          = src->info()->dimension(1);
-    const int                     dst_w          = dst0->info()->dimension(0);
+    const float   requant_scale = dst_qinfo.scale / src_qinfo.scale;
+    const int32_t requant_offset =
+        dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+    const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
+    const int                     src_w         = src->info()->dimension(0);
+    const int                     src_h         = src->info()->dimension(1);
+    const int                     dst_w         = dst0->info()->dimension(0);
 
     const T fill_value = (pool_info.pool_type == PoolingType::MAX) ? std::numeric_limits<T>::min() : T(0);
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto x_val   = id.x() * pool_stride_x;
-        const auto y_val_0 = id.y() * pool_stride_y;
-        const auto y_val_1 = (id.y() * pool_stride_y) + 1;
-
-        auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
-                                              x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
-        auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
-                                                 x_val, y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
-
-        q8x8_t lower_res = {};
-        q8x8_t upper_res = {};
-
-        if(pool_info.pool_type != PoolingType::MAX)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            const q16x8x2_t top_data_q16    = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
-            const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
+            const auto x_val   = id.x() * pool_stride_x;
+            const auto y_val_0 = id.y() * pool_stride_y;
+            const auto y_val_1 = (id.y() * pool_stride_y) + 1;
 
-            // Add rows
-            const q16x8x2_t vrsum =
+            auto top_data =
+                load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+                                      y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+            auto bottom_data =
+                load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+                                      y_val_1, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+
+            q8x8_t lower_res = {};
+            q8x8_t upper_res = {};
+
+            if (pool_info.pool_type != PoolingType::MAX)
             {
-                {
+                const q16x8x2_t top_data_q16 = {
+                    {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}};
+                const q16x8x2_t bottom_data_q16 = {
+                    {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}};
+
+                // Add rows
+                const q16x8x2_t vrsum = {{
                     wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]),
                     wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]),
-                }
-            };
+                }};
 
-            // Pair-wise add row data
-            const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
-            const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
+                // Pair-wise add row data
+                const q16x4_t vpsum_1 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[0]), wrapper::vgethigh(vrsum.val[0]));
+                const q16x4_t vpsum_2 = wrapper::vpadd(wrapper::vgetlow(vrsum.val[1]), wrapper::vgethigh(vrsum.val[1]));
 
-            q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
+                q16x8_t res_lower = wrapper::vcombine(vpsum_1, vpsum_2);
 
-            // Scale lower result
-            scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x,
-                                               pool_size, upper_bound_w, upper_bound_h,
-                                               pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-            lower_res = wrapper::vmovn(res_lower);
+                // Scale lower result
+                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_lower, id, 0, scale_step_x, pool_size,
+                                                   upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+                                                   pool_stride_x, pool_stride_y);
+                lower_res = wrapper::vmovn(res_lower);
 
-            // Compute upper result for stride_x == 1
-            if(pool_stride_x == 1)
-            {
-                // Shifted row sum
-                const q16x8x2_t vrsum_shifted =
+                // Compute upper result for stride_x == 1
+                if (pool_stride_x == 1)
                 {
-                    {
-                        wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
-                        wrapper::vext_1(vrsum.val[1], vrsum.val[1])
-                    }
-                };
+                    // Shifted row sum
+                    const q16x8x2_t vrsum_shifted = {
+                        {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}};
 
-                // Pair-wise add shifted row
-                q16x8_t res_upper = wrapper::vcombine(
-                                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
-                                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]), wrapper::vgethigh(vrsum_shifted.val[1])));
+                    // Pair-wise add shifted row
+                    q16x8_t res_upper = wrapper::vcombine(
+                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[0]), wrapper::vgethigh(vrsum_shifted.val[0])),
+                        wrapper::vpadd(wrapper::vgetlow(vrsum_shifted.val[1]),
+                                       wrapper::vgethigh(vrsum_shifted.val[1])));
 
-                // Scale upper result
-                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2,
-                                                   pool_size, upper_bound_w, upper_bound_h,
-                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-                upper_res = wrapper::vmovn(res_upper);
+                    // Scale upper result
+                    scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res_upper, id, 1, 2, pool_size,
+                                                       upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+                                                       pool_stride_x, pool_stride_y);
+                    upper_res = wrapper::vmovn(res_upper);
+                }
             }
-        }
-        else
-        {
-            const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
-            lower_res              = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
-            if(pool_stride_x == 1)
+            else
             {
-                const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
-                upper_res                      = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
+                const q8x16_t max_data = wrapper::vmax(top_data, bottom_data);
+                lower_res              = wrapper::vpmax(wrapper::vgetlow(max_data), wrapper::vgethigh(max_data));
+                if (pool_stride_x == 1)
+                {
+                    const q8x16_t max_data_shifted = wrapper::vext_1(max_data, max_data);
+                    upper_res = wrapper::vpmax(wrapper::vgetlow(max_data_shifted), wrapper::vgethigh(max_data_shifted));
+                }
             }
-        }
 
-        if(have_different_qinfo)
-        {
-            const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
-            lower_res                  = wrapper::vgetlow(requantized_dst);
-            upper_res                  = wrapper::vgethigh(requantized_dst);
-        }
-        auto out_ptr = reinterpret_cast<T *>(out.ptr());
-        // Store result
-        if(pool_stride_x == 1)
-        {
-            write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr);
-        }
-        else
-        {
-            write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr);
-        }
-    },
-    in, out);
+            if (have_different_qinfo)
+            {
+                const auto requantized_dst = vrequantize_pooling<q8x8_t, q8x16_t>(lower_res, upper_res, requant_qinfo);
+                lower_res                  = wrapper::vgetlow(requantized_dst);
+                upper_res                  = wrapper::vgethigh(requantized_dst);
+            }
+            auto out_ptr = reinterpret_cast<T *>(out.ptr());
+            // Store result
+            if (pool_stride_x == 1)
+            {
+                write16_boundary_aware<T, q8x8_t, true>(id.x(), dst_w, lower_res, upper_res, out_ptr);
+            }
+            else
+            {
+                write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, lower_res, out_ptr);
+            }
+        },
+        in, out);
 }
 
 template <typename T>
-void pooling3_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void pooling3_quantized_neon_nchw(const ITensor    *src,
+                                  ITensor          *dst0,
+                                  ITensor          *dst1,
+                                  PoolingLayerInfo &pool_info,
+                                  const Window     &window_src,
+                                  const Window     &window)
 {
     ARM_COMPUTE_UNUSED(dst1);
     Iterator in(src, window_src);
@@ -533,13 +587,13 @@
     using q16x8_t   = typename wrapper::traits::neon_vector<q16_t, 8>::type;
     using q16x8x2_t = typename wrapper::traits::neon_vector<q16_t, 16>::type;
 
-    constexpr int pool_size       = 3;
-    const int     pool_pad_right  = pool_info.pad_stride_info.pad_right();
-    const int     pool_pad_top    = pool_info.pad_stride_info.pad_top();
-    const int     pool_pad_left   = pool_info.pad_stride_info.pad_left();
-    const int     pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-    int           pool_stride_x   = 0;
-    int           pool_stride_y   = 0;
+    constexpr int pool_size                = 3;
+    const int     pool_pad_right           = pool_info.pad_stride_info.pad_right();
+    const int     pool_pad_top             = pool_info.pad_stride_info.pad_top();
+    const int     pool_pad_left            = pool_info.pad_stride_info.pad_left();
+    const int     pool_pad_bottom          = pool_info.pad_stride_info.pad_bottom();
+    int           pool_stride_x            = 0;
+    int           pool_stride_y            = 0;
     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
     const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
     const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
@@ -547,147 +601,145 @@
     const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
     const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
 
-    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
-    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
-    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
+    const float   requant_scale = dst_qinfo.scale / src_qinfo.scale;
+    const int32_t requant_offset =
+        dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+    const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
 
-    const T *const src_top_ptr    = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
-    const T *const src_middle_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
-    const T *const src_bottom_ptr = reinterpret_cast<const T *>(src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
+    const T *const src_top_ptr = reinterpret_cast<const T *>(
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top))));
+    const T *const src_middle_ptr = reinterpret_cast<const T *>(
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1)));
+    const T *const src_bottom_ptr = reinterpret_cast<const T *>(
+        src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2)));
 
     const int src_w      = src->info()->dimension(0);
     const int src_h      = src->info()->dimension(1);
     const T   fill_value = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
     const int dst_w      = dst0->info()->dimension(0);
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        const auto x_val   = id.x() * pool_stride_x;
-        const auto y_val_0 = id.y() * pool_stride_y;
-        const auto y_val_1 = (id.y() * pool_stride_y) + 1;
-        const auto y_val_2 = (id.y() * pool_stride_y) + 2;
-
-        auto top_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
-                                              x_val, y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
-        auto middle_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
-                                                 x_val, y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value);
-        auto bottom_data = load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom,
-                                                 x_val, y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
-
-        q8x8_t  fres  = {};
-        q8x16_t fqres = {};
-
-        if(pool_info.pool_type == PoolingType::AVG)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            // Convert data to u16
-            const q16x8x2_t top_data_q16    = { { wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data)) } };
-            const q16x8x2_t middle_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data)) } };
-            const q16x8x2_t bottom_data_q16 = { { wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data)) } };
+            const auto x_val   = id.x() * pool_stride_x;
+            const auto y_val_0 = id.y() * pool_stride_y;
+            const auto y_val_1 = (id.y() * pool_stride_y) + 1;
+            const auto y_val_2 = (id.y() * pool_stride_y) + 2;
 
-            // Calculate row sums
-            const q16x8x2_t vrsum =
+            auto top_data =
+                load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+                                      y_val_0, reinterpret_cast<const T *>(src_top_ptr + in.offset()), fill_value);
+            auto middle_data =
+                load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+                                      y_val_1, reinterpret_cast<const T *>(src_middle_ptr + in.offset()), fill_value);
+            auto bottom_data =
+                load16_boundary_aware(src_w, src_h, pool_pad_left, pool_pad_right, pool_pad_top, pool_pad_bottom, x_val,
+                                      y_val_2, reinterpret_cast<const T *>(src_bottom_ptr + in.offset()), fill_value);
+
+            q8x8_t  fres  = {};
+            q8x16_t fqres = {};
+
+            if (pool_info.pool_type == PoolingType::AVG)
             {
-                {
-                    wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
-                    wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
-                }
-            };
-            const q16x8x2_t vrsum_shifted_1 =
-            {
-                {
-                    wrapper::vext_1(vrsum.val[0], vrsum.val[1]),
-                    wrapper::vext_1(vrsum.val[1], vrsum.val[1])
-                }
-            };
-            const q16x8x2_t vrsum_shifted_2 =
-            {
-                {
-                    wrapper::vext_2(vrsum.val[0], vrsum.val[1]),
-                    wrapper::vext_2(vrsum.val[1], vrsum.val[1])
-                }
-            };
-            // Calculate final sum
-            q16x8x2_t final_sum =
-            {
-                {
+                // Convert data to u16
+                const q16x8x2_t top_data_q16 = {
+                    {wrapper::vmovl(wrapper::vgetlow(top_data)), wrapper::vmovl(wrapper::vgethigh(top_data))}};
+                const q16x8x2_t middle_data_q16 = {
+                    {wrapper::vmovl(wrapper::vgetlow(middle_data)), wrapper::vmovl(wrapper::vgethigh(middle_data))}};
+                const q16x8x2_t bottom_data_q16 = {
+                    {wrapper::vmovl(wrapper::vgetlow(bottom_data)), wrapper::vmovl(wrapper::vgethigh(bottom_data))}};
+
+                // Calculate row sums
+                const q16x8x2_t vrsum           = {{
+                              wrapper::vadd(wrapper::vadd(top_data_q16.val[0], bottom_data_q16.val[0]), middle_data_q16.val[0]),
+                              wrapper::vadd(wrapper::vadd(top_data_q16.val[1], bottom_data_q16.val[1]), middle_data_q16.val[1]),
+                }};
+                const q16x8x2_t vrsum_shifted_1 = {
+                    {wrapper::vext_1(vrsum.val[0], vrsum.val[1]), wrapper::vext_1(vrsum.val[1], vrsum.val[1])}};
+                const q16x8x2_t vrsum_shifted_2 = {
+                    {wrapper::vext_2(vrsum.val[0], vrsum.val[1]), wrapper::vext_2(vrsum.val[1], vrsum.val[1])}};
+                // Calculate final sum
+                q16x8x2_t final_sum = {{
                     wrapper::vadd(wrapper::vadd(vrsum.val[0], vrsum_shifted_1.val[0]), vrsum_shifted_2.val[0]),
                     wrapper::vadd(wrapper::vadd(vrsum.val[1], vrsum_shifted_1.val[1]), vrsum_shifted_2.val[1]),
-                }
-            };
-            if(pool_stride_x == 2)
-            {
-                q16x8_t res =
+                }};
+                if (pool_stride_x == 2)
                 {
-                    wrapper::vgetlane(final_sum.val[0], 0),
-                    wrapper::vgetlane(final_sum.val[0], 2),
-                    wrapper::vgetlane(final_sum.val[0], 4),
-                    wrapper::vgetlane(final_sum.val[0], 6),
-                    wrapper::vgetlane(final_sum.val[1], 0),
-                    wrapper::vgetlane(final_sum.val[1], 2),
-                    wrapper::vgetlane(final_sum.val[1], 4),
-                    wrapper::vgetlane(final_sum.val[1], 6),
-                };
+                    q16x8_t res = {
+                        wrapper::vgetlane(final_sum.val[0], 0), wrapper::vgetlane(final_sum.val[0], 2),
+                        wrapper::vgetlane(final_sum.val[0], 4), wrapper::vgetlane(final_sum.val[0], 6),
+                        wrapper::vgetlane(final_sum.val[1], 0), wrapper::vgetlane(final_sum.val[1], 2),
+                        wrapper::vgetlane(final_sum.val[1], 4), wrapper::vgetlane(final_sum.val[1], 6),
+                    };
 
-                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1,
-                                                   pool_size, upper_bound_w, upper_bound_h,
-                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-                fres = wrapper::vmovn(res);
+                    scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, res, id, 0, 1, pool_size,
+                                                       upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+                                                       pool_stride_x, pool_stride_y);
+                    fres = wrapper::vmovn(res);
+                }
+                else
+                {
+                    // Scale lower result
+                    scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1, pool_size,
+                                                       upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+                                                       pool_stride_x, pool_stride_y);
+                    // Scale upper result
+                    scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1, pool_size,
+                                                       upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top,
+                                                       pool_stride_x, pool_stride_y);
+                    fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
+                }
             }
             else
             {
-                // Scale lower result
-                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[0], id, 0, 1,
-                                                   pool_size, upper_bound_w, upper_bound_h,
-                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-                // Scale lower result
-                scale_vector_q16x8<q16_t, q16x8_t>(pool_info.exclude_padding, final_sum.val[1], id, 8, 1,
-                                                   pool_size, upper_bound_w, upper_bound_h,
-                                                   pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
-                fqres = wrapper::vcombine(wrapper::vmovn(final_sum.val[0]), wrapper::vmovn(final_sum.val[1]));
-            }
-        }
-        else
-        {
-            const q8x16_t max_data        = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
-            const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
-            const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
-            const q8x16_t final_max       = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
+                const q8x16_t max_data        = wrapper::vmax(wrapper::vmax(top_data, bottom_data), middle_data);
+                const q8x16_t max_data_shift1 = wrapper::vext_1(max_data, max_data);
+                const q8x16_t max_data_shift2 = wrapper::vext_2(max_data, max_data);
+                const q8x16_t final_max = wrapper::vmax(wrapper::vmax(max_data, max_data_shift1), max_data_shift2);
 
-            if(pool_stride_x == 2)
+                if (pool_stride_x == 2)
+                {
+                    const q8x8x2_t      table      = {{wrapper::vgetlow(final_max), wrapper::vgethigh(final_max)}};
+                    static const q8x8_t lookup_val = {0, 2, 4, 6, 8, 10, 12, 14};
+                    fres                           = wrapper::vtbl(table, lookup_val);
+                }
+                else
+                {
+                    fqres = final_max;
+                }
+            }
+
+            // Store result
+            if (pool_stride_x == 1)
             {
-                const q8x8x2_t      table      = { { wrapper::vgetlow(final_max), wrapper::vgethigh(final_max) } };
-                static const q8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 };
-                fres                           = wrapper::vtbl(table, lookup_val);
+                if (src_qinfo != dst_qinfo)
+                {
+                    fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres),
+                                                                 requant_qinfo);
+                }
+                write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres),
+                                                         wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr()));
             }
             else
             {
-                fqres = final_max;
+                if (src_qinfo != dst_qinfo)
+                {
+                    fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
+                }
+                write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr()));
             }
-        }
-
-        // Store result
-        if(pool_stride_x == 1)
-        {
-            if(src_qinfo != dst_qinfo)
-            {
-                fqres = vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), requant_qinfo);
-            }
-            write16_boundary_aware<T, q8x8_t, false>(id.x(), dst_w, wrapper::vgetlow(fqres), wrapper::vgethigh(fqres), reinterpret_cast<T *>(out.ptr()));
-        }
-        else
-        {
-            if(src_qinfo != dst_qinfo)
-            {
-                fres = vrequantize_pooling<q8x8_t>(fres, requant_qinfo);
-            }
-            write8_boundary_aware<T, q8x8_t>(id.x(), dst_w, fres, reinterpret_cast<T *>(out.ptr()));
-        }
-    },
-    in, out);
+        },
+        in, out);
 }
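
The requantization above folds both quantization domains into a single scale and offset, so a pooled value can be re-quantized in one step. A minimal scalar sketch of that arithmetic (plain C++; QParams and make_requant_params are illustrative names, not library types):

    #include <cstdint>

    struct QParams { float scale; int32_t offset; };

    // Mirrors the requant_scale/requant_offset computation in the kernel above.
    QParams make_requant_params(QParams src, QParams dst)
    {
        const float   requant_scale  = dst.scale / src.scale;
        const int32_t requant_offset =
            dst.offset - static_cast<int32_t>(static_cast<float>(src.offset) / requant_scale);
        return {requant_scale, requant_offset};
    }
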
 
 template <typename T>
-void poolingMxN_quantized_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
+void poolingMxN_quantized_neon_nchw(const ITensor    *src,
+                                    ITensor          *dst0,
+                                    ITensor          *dst1,
+                                    PoolingLayerInfo &pool_info,
+                                    const Window     &window_src,
+                                    const Window     &window)
 {
     ARM_COMPUTE_UNUSED(dst1);
     Iterator in(src, window_src);
@@ -697,74 +749,81 @@
     using q16_t = typename wrapper::traits::promote_t<T>;
     using q32_t = typename wrapper::traits::promote_t<q16_t>;
 
-    const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
-    const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
-    const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
-    const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
-    const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
-    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
-    int       pool_stride_x   = 0;
-    int       pool_stride_y   = 0;
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
+    const int pool_pad_right               = pool_info.pad_stride_info.pad_right();
+    const int pool_pad_top                 = pool_info.pad_stride_info.pad_top();
+    const int pool_pad_left                = pool_info.pad_stride_info.pad_left();
+    const int pool_pad_bottom              = pool_info.pad_stride_info.pad_bottom();
+    int       pool_stride_x                = 0;
+    int       pool_stride_y                = 0;
     std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
     const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
     const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
 
-    const UniformQuantizationInfo &src_qinfo        = src->info()->quantization_info().uniform();
-    const UniformQuantizationInfo &dst_qinfo        = dst0->info()->quantization_info().uniform();
-    const int                      src_w            = src->info()->dimension(0);
-    const int                      src_h            = src->info()->dimension(1);
-    const T                        fill_value       = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
-    const int                      stridex_in_bytes = static_cast<int>(src->info()->strides_in_bytes().x());
-    const int                      stridey_in_bytes = static_cast<int>(src->info()->strides_in_bytes().y());
+    const UniformQuantizationInfo &src_qinfo = src->info()->quantization_info().uniform();
+    const UniformQuantizationInfo &dst_qinfo = dst0->info()->quantization_info().uniform();
+    const int                      src_w     = src->info()->dimension(0);
+    const int                      src_h     = src->info()->dimension(1);
+    const T   fill_value       = (pool_info.pool_type == PoolingType::AVG) ? T(0) : std::numeric_limits<T>::min();
+    const int stridex_in_bytes = static_cast<int>(src->info()->strides_in_bytes().x());
+    const int stridey_in_bytes = static_cast<int>(src->info()->strides_in_bytes().y());
 
-    execute_window_loop(window, [&](const Coordinates & id)
-    {
-        T res = std::numeric_limits<T>::min();
-
-        if(pool_info.pool_type != PoolingType::MAX)
+    execute_window_loop(
+        window,
+        [&](const Coordinates &id)
         {
-            q32_t sres = 0;
+            T res = std::numeric_limits<T>::min();
 
-            // Calculate scale
-            const float scale = calculate_avg_scale_pool2d(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
-                                                           pool_stride_y);
-
-            // Perform pooling
-            for(int y = 0; y < pool_size_y; ++y)
+            if (pool_info.pool_type != PoolingType::MAX)
             {
-                for(int x = 0; x < pool_size_x; ++x)
-                {
-                    const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
+                q32_t sres = 0;
 
-                    const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
-                    const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
-                    const T   data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
-                    sres += data;
+                // Calculate scale
+                const float scale = calculate_avg_scale_pool2d(
+                    pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y, upper_bound_w,
+                    upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
+
+                // Perform pooling
+                for (int y = 0; y < pool_size_y; ++y)
+                {
+                    for (int x = 0; x < pool_size_x; ++x)
+                    {
+                        const auto in_ptr = reinterpret_cast<const T *>(
+                            in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
+
+                        const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
+                        const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
+                        const T   data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
+                        sres += data;
+                    }
+                }
+                // Divide by scale
+                res = static_cast<T>(support::cpp11::round(sres * scale));
+            }
+            else
+            {
+                for (int y = 0; y < pool_size_y; ++y)
+                {
+                    for (int x = 0; x < pool_size_x; ++x)
+                    {
+                        const auto in_ptr = reinterpret_cast<const T *>(
+                            in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
+
+                        const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
+                        const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
+                        const T   data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
+                        res            = std::max(res, data);
+                    }
                 }
             }
-            // Divide by scale
-            res = static_cast<T>(support::cpp11::round(sres * scale));
-        }
-        else
-        {
-            for(int y = 0; y < pool_size_y; ++y)
-            {
-                for(int x = 0; x < pool_size_x; ++x)
-                {
-                    const auto in_ptr = reinterpret_cast<const T *>(in.ptr() + (x - pool_pad_left) * stridex_in_bytes + (y - pool_pad_top) * stridey_in_bytes);
-
-                    const int idx  = x + id.x() * pool_stride_x - pool_pad_left;
-                    const int idy  = y + id.y() * pool_stride_y - pool_pad_top;
-                    const T   data = (idx < 0 || idy < 0 || idx >= src_w || idy >= src_h) ? fill_value : *in_ptr;
-                    res            = std::max(res, data);
-                }
-            }
-        }
-        // Store result
-        res                                 = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo) : res;
-        *(reinterpret_cast<T *>(out.ptr())) = res;
-    },
-    in, out);
+            // Store result
+            res                                 = (src_qinfo != dst_qinfo) ? Qasymm8QuantizationHelper<T>::quantize(
+                                                                                 Qasymm8QuantizationHelper<T>::dequantize(res, src_qinfo), dst_qinfo)
+                                                                           : res;
+            *(reinterpret_cast<T *>(out.ptr())) = res;
+        },
+        in, out);
 }
 #endif /* defined(ENABLE_NCHW_KERNELS) */
 } // namespace cpu
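
Every kernel in this file uses the same traversal idiom: execute_window_loop walks the execution window, calls the lambda once per output coordinate, and advances the trailing Iterator arguments in lock-step, so in.ptr() and out.ptr() already point at the current elements inside the lambda. A stripped-down sketch of the call shape (arm_compute types assumed; this is not a redefinition of the library function):

    // Sketch only; Window, Iterator and Coordinates come from arm_compute.
    Iterator in(src, window_src);
    Iterator out(dst0, window);
    execute_window_loop(
        window,
        [&](const Coordinates &id)
        {
            // id is the current output coordinate; in/out have been advanced
            // to the matching input/output elements.
        },
        in, out);
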
diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h
index 013e255..ce89199 100644
--- a/src/cpu/kernels/pool3d/neon/impl.h
+++ b/src/cpu/kernels/pool3d/neon/impl.h
@@ -25,9 +25,10 @@
 #define SRC_CORE_POOLING_3D_LAYER_IMPL_H
 
 #include "arm_compute/core/Helpers.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include "src/core/helpers/PoolingHelpers.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 #include "src/cpu/kernels/pool3d/neon/quantized.h"
 
 namespace arm_compute
@@ -37,8 +38,13 @@
 namespace
 {
 template <typename T>
-void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
-                                    const int window_start_x, const int window_end_x, const int window_step_x)
+void max_poolingMxNxD_fp_neon_ndhwc(const ITensor      *src,
+                                    ITensor            *dst0,
+                                    Pooling3dLayerInfo &pool_info,
+                                    const Window       &window_out,
+                                    const int           window_start_x,
+                                    const int           window_end_x,
+                                    const int           window_step_x)
 
 {
     using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
@@ -71,80 +77,87 @@
     Iterator out(dst0, window_out);
 
     vector_type vres;
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        // Computing the theoretical input starting/ending points
-        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
-        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
-        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
-
-        const int pool_start_x = std::max(0, -in_idx_width);
-        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
-        const int pool_start_y = std::max(0, -in_idx_height);
-        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
-
-        const int pool_start_z = std::max(0, -in_idx_depth);
-        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
-
-        // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
-        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
-        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
-        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
-
-        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
-
-        int x_off = window_start_x;
-
-        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
         {
-            vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type());
-            for(int z = pool_start_z; z < pool_end_z; ++z)
+            // Computing the theoretical input starting/ending points
+            const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+            const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+            const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+            const int pool_start_x = std::max(0, -in_idx_width);
+            const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+            const int pool_start_y = std::max(0, -in_idx_height);
+            const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+            const int pool_start_z = std::max(0, -in_idx_depth);
+            const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+            // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+            const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+            const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+            const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+            const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+            int x_off = window_start_x;
+
+            for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
             {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type());
+                for (int z = pool_start_z; z < pool_end_z; ++z)
                 {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
                     {
-                        const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-                        vres                       = wrapper::vmax(vres, data);
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            vres                       = wrapper::vmax(vres, data);
+                        }
                     }
                 }
+                // Store result
+                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
             }
-            // Store result
-            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
-        }
 
-        // Left-overs loop
-        for(; x_off < window_end_x; ++x_off)
-        {
-            T res(0);
-            res = -std::numeric_limits<float>::infinity();
-            for(int z = pool_start_z; z < pool_end_z; ++z)
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
             {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                T res(0);
+                res = -std::numeric_limits<float>::infinity();
+                for (int z = pool_start_z; z < pool_end_z; ++z)
                 {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
                     {
-                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-                        res                     = std::max(res, data);
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            res                     = std::max(res, data);
+                        }
                     }
                 }
+                // Store result
+                *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
             }
-            // Store result
-            *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
-        }
-    },
-    out);
+        },
+        out);
 }
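
The index computations at the top of the lambda clamp the theoretical pooling window against the valid input range, one axis at a time, so padded taps are skipped instead of read. The same three-step pattern written out for a single axis (a scalar sketch with illustrative names):

    #include <algorithm>
    #include <utility>

    // Half-open [start, end) range of kernel taps that land inside the input,
    // mirroring the pool_start_x/pool_end_x_t/pool_end_x computation above.
    std::pair<int, int> clamp_pool_axis(int out_idx, int stride, int pad_before,
                                        int input_dim, int pool_size)
    {
        const int in_idx = out_idx * stride - pad_before; // theoretical window origin
        const int start  = std::max(0, -in_idx);          // drop taps before index 0
        const int end_t  = std::min(input_dim + pad_before - in_idx, pool_size);
        const int end    = std::min(end_t, input_dim - in_idx); // drop trailing padding
        return {start, end};
    }
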
 
 template <typename T>
-void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
-                                    const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
+void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor      *src,
+                                    ITensor            *dst0,
+                                    Pooling3dLayerInfo &pool_info,
+                                    const Window       &window_out,
+                                    const int           window_start_x,
+                                    const int           window_end_x,
+                                    const int           window_step_x)
 {
     using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
     using vector_type = typename vtype::type;
@@ -183,224 +196,235 @@
     Iterator out(dst0, window_out);
 
     vector_type vres;
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        // Computing the theoretical input starting/ending points
-        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
-        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
-        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
-
-        const int pool_start_x = std::max(0, -in_idx_width);
-        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
-        const int pool_start_y = std::max(0, -in_idx_height);
-        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
-
-        const int pool_start_z = std::max(0, -in_idx_depth);
-        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
-
-        // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
-        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
-        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
-        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
-
-        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
-
-        // Calculate scale
-        const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
-                                                       pool_pad_top, pool_pad_front, pool_stride_x,
-                                                       pool_stride_y, pool_stride_z);
-        const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
-
-        int x_off = window_start_x;
-
-        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
         {
-            // Perform pooling
-            vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
-            for(int z = pool_start_z; z < pool_end_z; ++z)
-            {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
-                {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
-                    {
-                        const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-                        vres                       = wrapper::vadd(vres, data);
-                    }
-                }
-            }
+            // Computing the theoretical input starting/ending points
+            const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+            const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+            const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
 
-            // Divide by scale
-            vres = wrapper::vmul(vres, scale_v);
+            const int pool_start_x = std::max(0, -in_idx_width);
+            const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+            const int pool_start_y = std::max(0, -in_idx_height);
+            const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
 
-            // Store result
-            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
-        }
+            const int pool_start_z = std::max(0, -in_idx_depth);
+            const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
 
-        // Left-overs loop
-        for(; x_off < window_end_x; ++x_off)
-        {
-            T res(0);
+            // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+            const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+            const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+            const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
 
-            for(int z = pool_start_z; z < pool_end_z; ++z)
-            {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
-                {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
-                    {
-                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-                        res += data;
-                    }
-                }
-            }
+            const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
 
-            // Divide by scale
-            res *= scale;
-
-            // Store result
-            *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
-        }
-    },
-    out);
-}
-
-template <typename T>
-void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
-                                   const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
-{
-    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
-    using vector_type = typename vtype::type;
-    using tag_type    = typename vtype::tag_type;
-
-    int pool_stride_x = static_cast<int>(pool_info.stride.width);
-    int pool_stride_y = static_cast<int>(pool_info.stride.height);
-    int pool_stride_z = static_cast<int>(pool_info.stride.depth);
-
-    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
-    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
-    const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
-
-    const int pool_pad_top    = static_cast<int>(pool_info.padding.top);
-    const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
-    const int pool_pad_left   = static_cast<int>(pool_info.padding.left);
-    const int pool_pad_right  = static_cast<int>(pool_info.padding.right);
-    const int pool_pad_front  = static_cast<int>(pool_info.padding.front);
-    const int pool_pad_back   = static_cast<int>(pool_info.padding.back);
-
-    const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
-    const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
-    const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
-
-    const int input_dim_w = src->info()->dimension(1);
-    const int input_dim_h = src->info()->dimension(2);
-    const int input_dim_d = src->info()->dimension(3);
-
-    const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
-    const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
-    const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
-    const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
-
-    const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
-
-    Iterator out(dst0, window_out);
-
-    vector_type vres;
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        // Computing the theoretical input starting/ending points
-        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
-        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
-        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
-
-        const int pool_start_x = std::max(0, -in_idx_width);
-        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
-        const int pool_start_y = std::max(0, -in_idx_height);
-        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
-
-        const int pool_start_z = std::max(0, -in_idx_depth);
-        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
-
-        // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
-        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
-        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
-        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
-
-        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
-
-        // Calculate scale
-        const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
-                                                       pool_pad_top, pool_pad_front, pool_stride_x,
-                                                       pool_stride_y, pool_stride_z);
-
-        int x_off = window_start_x;
-
-        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
-        {
-            // Perform pooling
-            vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
-            for(int z = pool_start_z; z < pool_end_z; ++z)
-            {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
-                {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
-                    {
-                        const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-                        vres                       = wrapper::vmla(vres, data, data);
-                    }
-                }
-            }
-
+            // Calculate scale
+            const float scale =
+                calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+                                           upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+                                           pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
             const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
 
-            // Divide by scale
-            vres = wrapper::vmul(vres, scale_v);
+            int x_off = window_start_x;
 
-            // Calculate square-root
-            vres = wrapper::vinv(wrapper::vinvsqrt(vres));
-
-            // Store result
-            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
-        }
-
-        // Left-overs loop
-        for(; x_off < window_end_x; ++x_off)
-        {
-            T res(0);
-
-            for(int z = pool_start_z; z < pool_end_z; ++z)
+            for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
             {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                // Perform pooling
+                vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+                for (int z = pool_start_z; z < pool_end_z; ++z)
                 {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
                     {
-                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-                        res += data * data;
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            vres                       = wrapper::vadd(vres, data);
+                        }
                     }
                 }
+
+                // Divide by scale
+                vres = wrapper::vmul(vres, scale_v);
+
+                // Store result
+                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
             }
 
-            // Divide by scale
-            res *= scale;
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
+            {
+                T res(0);
 
-            // Square root
-            res = std::sqrt(res);
+                for (int z = pool_start_z; z < pool_end_z; ++z)
+                {
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            res += data;
+                        }
+                    }
+                }
 
-            // Store result
-            *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
-        }
-    },
-    out);
+                // Divide by scale
+                res *= scale;
+
+                // Store result
+                *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+            }
+        },
+        out);
+}
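
calculate_avg_scale_pool3d returns the reciprocal of the averaging divisor: with exclude_padding only in-bounds taps are counted, otherwise the window is clamped to the padded upper bounds. A scalar sketch of the assumed semantics (not the library helper itself):

    #include <algorithm>

    float avg_scale_3d(bool exclude_padding,
                       int px, int py, int pz,                // output coordinate
                       int pool_w, int pool_h, int pool_d,
                       int upper_w, int upper_h, int upper_d, // (padded) upper bounds
                       int pad_l, int pad_t, int pad_f,
                       int stride_x, int stride_y, int stride_z)
    {
        int start_x = px * stride_x - pad_l;
        int start_y = py * stride_y - pad_t;
        int start_z = pz * stride_z - pad_f;
        const int end_x = std::min(start_x + pool_w, upper_w);
        const int end_y = std::min(start_y + pool_h, upper_h);
        const int end_z = std::min(start_z + pool_d, upper_d);
        if (exclude_padding)
        {
            start_x = std::max(0, start_x);
            start_y = std::max(0, start_y);
            start_z = std::max(0, start_z);
        }
        return 1.f / ((end_x - start_x) * (end_y - start_y) * (end_z - start_z));
    }
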
+
+template <typename T>
+void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor      *src,
+                                   ITensor            *dst0,
+                                   Pooling3dLayerInfo &pool_info,
+                                   const Window       &window_out,
+                                   const int           window_start_x,
+                                   const int           window_end_x,
+                                   const int           window_step_x)
+{
+    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+    using vector_type = typename vtype::type;
+    using tag_type    = typename vtype::tag_type;
+
+    int pool_stride_x = static_cast<int>(pool_info.stride.width);
+    int pool_stride_y = static_cast<int>(pool_info.stride.height);
+    int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+    const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+    const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+    const int pool_pad_top    = static_cast<int>(pool_info.padding.top);
+    const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
+    const int pool_pad_left   = static_cast<int>(pool_info.padding.left);
+    const int pool_pad_right  = static_cast<int>(pool_info.padding.right);
+    const int pool_pad_front  = static_cast<int>(pool_info.padding.front);
+    const int pool_pad_back   = static_cast<int>(pool_info.padding.back);
+
+    const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+    const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+    const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
+
+    const int input_dim_w = src->info()->dimension(1);
+    const int input_dim_h = src->info()->dimension(2);
+    const int input_dim_d = src->info()->dimension(3);
+
+    const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+    const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+    const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+    const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+    const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+    Iterator out(dst0, window_out);
+
+    vector_type vres;
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
+        {
+            // Computing the theoretical input starting/ending points
+            const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+            const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+            const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+            const int pool_start_x = std::max(0, -in_idx_width);
+            const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+            const int pool_start_y = std::max(0, -in_idx_height);
+            const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+            const int pool_start_z = std::max(0, -in_idx_depth);
+            const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+            // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+            const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+            const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+            const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+            const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+            // Calculate scale
+            const float scale =
+                calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+                                           upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+                                           pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
+
+            int x_off = window_start_x;
+
+            for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+            {
+                // Perform pooling
+                vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+                for (int z = pool_start_z; z < pool_end_z; ++z)
+                {
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t    *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const vector_type data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            vres                       = wrapper::vmla(vres, data, data);
+                        }
+                    }
+                }
+
+                const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
+
+                // Divide by scale
+                vres = wrapper::vmul(vres, scale_v);
+
+                // Calculate square-root
+                vres = wrapper::vinv(wrapper::vinvsqrt(vres));
+
+                // Store result
+                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+            }
+
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
+            {
+                T res(0);
+
+                for (int z = pool_start_z; z < pool_end_z; ++z)
+                {
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            res += data * data;
+                        }
+                    }
+                }
+
+                // Divide by scale
+                res *= scale;
+
+                // Square root
+                res = std::sqrt(res);
+
+                // Store result
+                *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+            }
+        },
+        out);
 }
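
The L2 kernel never takes a vector square root directly: it computes sqrt(x) as 1/rsqrt(x), since NEON provides reciprocal and reciprocal-square-root estimate instructions on all supported targets. A sketch of what the wrapper::vinv(wrapper::vinvsqrt(...)) pair plausibly lowers to, with one Newton refinement step per estimate (an assumption; the wrappers may use a different number of steps):

    #include <arm_neon.h> // assumes a NEON-enabled target

    float32x4_t sqrt_via_rsqrt(float32x4_t x)
    {
        float32x4_t r = vrsqrteq_f32(x);                    // rsqrt estimate
        r = vmulq_f32(r, vrsqrtsq_f32(vmulq_f32(x, r), r)); // Newton step
        float32x4_t inv = vrecpeq_f32(r);                   // reciprocal estimate
        inv = vmulq_f32(inv, vrecpsq_f32(r, inv));          // Newton step
        return inv;                                         // ~ sqrt(x)
    }
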
 } // namespace
 
@@ -415,16 +439,19 @@
     // Needed to handle loop left-over
     window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    switch(pool_info.pool_type)
+    switch (pool_info.pool_type)
     {
         case PoolingType::MAX:
-            max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+            max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+                                              window_step_x);
             break;
         case PoolingType::AVG:
-            avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+            avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+                                              window_step_x);
             break;
         case PoolingType::L2:
-            l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+            l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x,
+                                             window_step_x);
             break;
         default:
             ARM_COMPUTE_ERROR("Pool operation not supported");
@@ -440,7 +467,7 @@
     // Needed to handle loop left-over
     window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
 
-    switch(pool_info.pool_type)
+    switch (pool_info.pool_type)
     {
         case PoolingType::MAX:
             max_poolingMxNxD_q8_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_step_x);
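
All of the NDHWC kernels above split the channel dimension the same way: a main loop consumes window_step_x channels per iteration with full-width vectors, then a scalar left-overs loop finishes the remainder. The control-flow skeleton, reduced to its essentials (illustrative sketch, not library code):

    // Vector main loop plus scalar tail over channels [start, end).
    void per_channel(const float *in, float *out, int start, int end, int step)
    {
        int x = start;
        for (; x <= end - step; x += step) // main loop: `step` channels at a time
        {
            for (int i = 0; i < step; ++i) // stands in for a NEON load/op/store
            {
                out[x + i] = in[x + i];
            }
        }
        for (; x < end; ++x) // left-overs, one element at a time
        {
            out[x] = in[x];
        }
    }
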
diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h
index ac14f5e..8819907 100644
--- a/src/cpu/kernels/pool3d/neon/quantized.h
+++ b/src/cpu/kernels/pool3d/neon/quantized.h
@@ -26,17 +26,18 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/PoolingHelpers.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
 template <typename T>
-void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
-                                    const int window_step_x)
+void avg_poolingMxNxD_q8_neon_ndhwc(
+    const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x)
 
 {
     using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
@@ -89,144 +90,147 @@
     const float quant_rescale = dst_qinfo.scale / src_qinfo.scale;
     // "new_offset" doesn't have to consider the "half_scale_v" in its computation
     // With a requantization performed in a single step there won't be uncertainties introduced
-    const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
+    const int32_t new_offset =
+        dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale);
 
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        // Computing the theoretical input starting/ending points
-        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
-        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
-        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
-
-        const int pool_start_x = std::max(0, -in_idx_width);
-        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
-        const int pool_start_y = std::max(0, -in_idx_height);
-        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
-
-        const int pool_start_z = std::max(0, -in_idx_depth);
-        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
-
-        // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
-        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
-        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
-        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
-
-        // Calculate scale
-        const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
-                                                       pool_pad_top, pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
-
-        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
-
-        int x_off = window_start_x;
-
-        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
         {
-            q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
-            q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
-            q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
-            q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+            // Computing the theoretical input starting/ending points
+            const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+            const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+            const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
 
-            // Perform pooling
-            for(int z = pool_start_z; z < pool_end_z; ++z)
+            const int pool_start_x = std::max(0, -in_idx_width);
+            const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+            const int pool_start_y = std::max(0, -in_idx_height);
+            const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+            const int pool_start_z = std::max(0, -in_idx_depth);
+            const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+            // The region ends used in the calculation must exclude the padding (PAD_X, PAD_Y and PAD_Z)
+            const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+            const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+            const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+            // Calculate scale
+            const float scale =
+                calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z,
+                                           upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top,
+                                           pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z);
+
+            const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+            int x_off = window_start_x;
+
+            for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
             {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
-                {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
-                    {
-                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const q8x16_t  data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
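+                // Accumulate into four 32-bit vectors so the widened 8-bit inputs cannot overflow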
+                q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+                q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+                q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
+                q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{});
 
-                        const q16x8_t data_q16  = wrapper::vmovl(wrapper::vgetlow(data));
-                        const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
-                        vres1                   = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
-                        vres2                   = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
-                        vres3                   = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
-                        vres4                   = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+                // Perform pooling
+                for (int z = pool_start_z; z < pool_end_z; ++z)
+                {
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const q8x16_t  data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
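+                            // Widen the 8-bit lanes to 16-bit and then to 32-bit before accumulating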
+                            const q16x8_t data_q16  = wrapper::vmovl(wrapper::vgetlow(data));
+                            const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data));
+                            vres1                   = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16)));
+                            vres2                   = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16)));
+                            vres3                   = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16)));
+                            vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16)));
+                        }
                     }
                 }
-            }
 
-            if(src_qinfo != dst_qinfo)
-            {
-                const float32x4x4_t vres =
+                if (src_qinfo != dst_qinfo)
                 {
-                    {
+                    const float32x4x4_t vres = {{
                         vcvtq_f32_q32(vres1),
                         vcvtq_f32_q32(vres2),
                         vcvtq_f32_q32(vres3),
                         vcvtq_f32_q32(vres4),
-                    }
-                };
-                const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
-                // Store result
-                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
-                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
-            }
-            else
-            {
-                const float32x4_t scale_v = vdupq_n_f32(scale);
-                // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
-                vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
-                vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
-                vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
-                vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
-
-                const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
-                const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
-                // Store result
-                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
-                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
-            }
-        }
-
-        // Left-overs loop
-        for(; x_off < window_end_x; ++x_off)
-        {
-            q32_t res = static_cast<q32_t>(0.f);
-
-            // Perform pooling
-            for(int z = pool_start_z; z < pool_end_z; ++z)
-            {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
+                    }};
+                    const auto          requantized_dst =
+                        vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset);
+                    // Store result
+                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst));
+                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst));
+                }
+                else
                 {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
-                    {
-                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
-                        res += data;
-                    }
+                    const float32x4_t scale_v = vdupq_n_f32(scale);
+                    // Multiply by the averaging scale and add 0.5f to round to nearest instead of towards zero
+                    vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v));
+                    vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v));
+                    vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v));
+                    vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v));
+
+                    const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2)));
+                    const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4)));
+                    // Store result
+                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1);
+                    wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2);
                 }
             }
 
-            if(src_qinfo != dst_qinfo)
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
             {
-                const float res_f           = static_cast<float>(res);
-                const float new_scale       = quant_rescale / scale;
-                const auto  requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+                q32_t res = static_cast<q32_t>(0.f);
 
-                // Store result
-                *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
-            }
-            else
-            {
-                // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero
-                res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
+                // Perform pooling
+                for (int z = pool_start_z; z < pool_end_z; ++z)
+                {
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                            res += data;
+                        }
+                    }
+                }
 
-                // Store result
-                *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+                if (src_qinfo != dst_qinfo)
+                {
+                    const float res_f           = static_cast<float>(res);
+                    const float new_scale       = quant_rescale / scale;
+                    const auto  requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset));
+
+                    // Store result
+                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst;
+                }
+                else
+                {
+                    // Multiply by the averaging scale and add 0.5f to round to nearest instead of towards zero
+                    res = static_cast<T>(0.5f + static_cast<float>(res) * scale);
+
+                    // Store result
+                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+                }
             }
-        }
-    },
-    out);
+        },
+        out);
 }
 
 template <typename T>
-void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
-                                    const int window_step_x)
+void max_poolingMxNxD_q8_neon_ndhwc(
+    const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x)
 
 {
     using q8x8_t  = typename wrapper::traits::neon_vector<T, 8>::type;
@@ -266,125 +270,130 @@
     const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform();
     const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform();
 
-    const float                   requant_scale  = dst_qinfo.scale / src_qinfo.scale;
-    const int32_t                 requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
-    const UniformQuantizationInfo requant_qinfo  = UniformQuantizationInfo(requant_scale, requant_offset);
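+    // Fold the source and destination quantization into a single requantization step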
+    const float   requant_scale = dst_qinfo.scale / src_qinfo.scale;
+    const int32_t requant_offset =
+        dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale);
+    const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset);
 
-    execute_window_loop(window_out, [&](const Coordinates & id)
-    {
-        // Computing the theoretical input starting/ending points
-        const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
-        const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
-        const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
-
-        const int pool_start_x = std::max(0, -in_idx_width);
-        const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
-        const int pool_start_y = std::max(0, -in_idx_height);
-        const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
-
-        const int pool_start_z = std::max(0, -in_idx_depth);
-        const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
-
-        // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
-        const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
-        const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
-        const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
-
-        const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
-
-        int x_off = window_start_x;
-
-        for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+    execute_window_loop(
+        window_out,
+        [&](const Coordinates &id)
         {
-            q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
+            // Computing the theoretical input starting/ending points
+            const int in_idx_width  = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+            const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+            const int in_idx_depth  = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
 
-            // Perform pooling
-            for(int z = pool_start_z; z < pool_end_z; ++z)
+            const int pool_start_x = std::max(0, -in_idx_width);
+            const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+            const int pool_start_y = std::max(0, -in_idx_height);
+            const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+            const int pool_start_z = std::max(0, -in_idx_depth);
+            const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+            // The region ends used in the calculation must exclude the padding (PAD_X, PAD_Y and PAD_Z)
+            const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+            const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+            const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+            const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+            int x_off = window_start_x;
+
+            for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
             {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
-                {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
-                    {
-                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const q8x16_t  data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
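+                // Start from the lowest representable value so any loaded lane can raise the maximum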
+                q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{});
 
-                        vres = wrapper::vmax(vres, data);
+                // Perform pooling
+                for (int z = pool_start_z; z < pool_end_z; ++z)
+                {
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const q8x16_t  data     = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+                            vres = wrapper::vmax(vres, data);
+                        }
                     }
                 }
+
+                // Store result
+                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+                                (src_qinfo != dst_qinfo)
+                                    ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres),
+                                                                           wrapper::vgethigh(vres), requant_qinfo)
+                                    : vres);
             }
 
-            // Store result
-            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres),
-                            requant_qinfo) :
-                            vres);
-        }
-
-        // Leftovers using half the window step
-        for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
-        {
-            q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
-
-            // Perform pooling
-            for(int z = pool_start_z; z < pool_end_z; ++z)
+            // Leftovers using half the window step
+            for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x)
             {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
-                {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
-                    {
-                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const q8x8_t   data     = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{});
 
-                        vres = wrapper::vmax(vres, data);
+                // Perform pooling
+                for (int z = pool_start_z; z < pool_end_z; ++z)
+                {
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const q8x8_t   data     = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+                            vres = wrapper::vmax(vres, data);
+                        }
                     }
                 }
+
+                // Store result
+                wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
+                                (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
             }
 
-            // Store result
-            wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off,
-                            (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres);
-        }
-
-        // Left-overs loop
-        for(; x_off < window_end_x; ++x_off)
-        {
-            T res = std::numeric_limits<T>::min();
-
-            for(int z = pool_start_z; z < pool_end_z; ++z)
+            // Left-overs loop
+            for (; x_off < window_end_x; ++x_off)
             {
-                const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
-                for(int y = pool_start_y; y < pool_end_y; ++y)
-                {
-                    const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
-                    for(int x = pool_start_x; x < pool_end_x; ++x)
-                    {
-                        const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
-                        const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+                T res = std::numeric_limits<T>::min();
 
-                        res = std::max(res, data);
+                for (int z = pool_start_z; z < pool_end_z; ++z)
+                {
+                    const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+                    for (int y = pool_start_y; y < pool_end_y; ++y)
+                    {
+                        const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+                        for (int x = pool_start_x; x < pool_end_x; ++x)
+                        {
+                            const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+                            const T        data     = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+
+                            res = std::max(res, data);
+                        }
                     }
                 }
-            }
 
-            // Store result
-            if(src_qinfo != dst_qinfo)
-            {
-                const float res_f                           = static_cast<float>(res);
-                *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
+                // Store result
+                if (src_qinfo != dst_qinfo)
+                {
+                    const float res_f                           = static_cast<float>(res);
+                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo);
+                }
+                else
+                {
+                    *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+                }
             }
-            else
-            {
-                *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
-            }
-        }
-    },
-    out);
+        },
+        out);
 }
 
 } // namespace cpu
 } // namespace arm_compute
 
-#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
\ No newline at end of file
+#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
diff --git a/src/cpu/kernels/range/generic/neon/fp16.cpp b/src/cpu/kernels/range/generic/neon/fp16.cpp
index 5d50dce..505c18c 100644
--- a/src/cpu/kernels/range/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/range/generic/neon/fp16.cpp
@@ -23,10 +23,10 @@
  */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 
-#include "src/cpu/kernels/range/generic/neon/impl.h"
-
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/range/generic/neon/impl.h"
 
 namespace arm_compute
 {
diff --git a/src/cpu/kernels/range/generic/neon/fp32.cpp b/src/cpu/kernels/range/generic/neon/fp32.cpp
index 6044f0f..e5e472a 100644
--- a/src/cpu/kernels/range/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/range/generic/neon/fp32.cpp
@@ -22,10 +22,10 @@
  * SOFTWARE.
  */
 
-#include "src/cpu/kernels/range/generic/neon/impl.h"
-
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/range/generic/neon/impl.h"
 
 namespace arm_compute
 {
diff --git a/src/cpu/kernels/range/generic/neon/impl.h b/src/cpu/kernels/range/generic/neon/impl.h
index 62144e6..f8c30d5 100644
--- a/src/cpu/kernels/range/generic/neon/impl.h
+++ b/src/cpu/kernels/range/generic/neon/impl.h
@@ -26,8 +26,9 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/common/Registrars.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
@@ -47,35 +48,36 @@
     const auto window_end_x   = static_cast<int>(window.x().end());
     const int  window_step_x  = 16 / sizeof(T);
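+    // A 128-bit NEON register holds 16 / sizeof(T) elements, hence the vectorized step size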
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
     Iterator output_it(output, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        int        x       = window_start_x;
-        const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            for(int count = 0; count < window_step_x; ++count)
+            int        x       = window_start_x;
+            const auto out_ptr = reinterpret_cast<T *>(output_it.ptr());
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
             {
-                id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
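+                // Set each vector lane to its linear index so a single vmla produces window_step_x results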
+                for (int count = 0; count < window_step_x; ++count)
+                {
+                    id_vec = wrapper::vsetlane(static_cast<T>(x + count), id_vec, count);
+                }
+
+                // start + step * id
+                const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
+                wrapper::vstore(out_ptr + x, res_vec);
             }
 
-            // start + step * id
-            const auto res_vec = wrapper::vmla(start_vec, id_vec, step_vec);
-            wrapper::vstore(out_ptr + x, res_vec);
-        }
-
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            const auto res = start + x * step;
-            *(out_ptr + x) = res;
-        }
-
-    },
-    output_it);
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                const auto res = start + x * step;
+                *(out_ptr + x) = res;
+            }
+        },
+        output_it);
 }
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/range/list.h b/src/cpu/kernels/range/list.h
index 25d52bf..cade91e 100644
--- a/src/cpu/kernels/range/list.h
+++ b/src/cpu/kernels/range/list.h
@@ -28,8 +28,7 @@
 {
 namespace cpu
 {
-#define DECLARE_RANGE_KERNEL(func_name) \
-    void func_name(ITensor *output, float start, float step, const Window &window)
+#define DECLARE_RANGE_KERNEL(func_name) void func_name(ITensor *output, float start, float step, const Window &window)
 
 DECLARE_RANGE_KERNEL(fp16_neon_range_function);
 DECLARE_RANGE_KERNEL(fp32_neon_range_function);
diff --git a/src/cpu/kernels/roialign/generic/neon/fp16.cpp b/src/cpu/kernels/roialign/generic/neon/fp16.cpp
index c265d5d..cf99830 100644
--- a/src/cpu/kernels/roialign/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/roialign/generic/neon/fp16.cpp
@@ -29,7 +29,12 @@
 {
 namespace cpu
 {
-void neon_fp16_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void neon_fp16_roialign(const ITensor      *input,
+                        ITensor            *output,
+                        const ITensor      *rois,
+                        ROIPoolingLayerInfo pool_info,
+                        const Window       &window,
+                        const ThreadInfo   &info)
 {
     return roi_align<float16_t, float16_t>(input, output, rois, pool_info, window, info);
 }
diff --git a/src/cpu/kernels/roialign/generic/neon/fp32.cpp b/src/cpu/kernels/roialign/generic/neon/fp32.cpp
index 51355aa..c1dba99 100644
--- a/src/cpu/kernels/roialign/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/roialign/generic/neon/fp32.cpp
@@ -26,7 +26,12 @@
 {
 namespace cpu
 {
-void neon_fp32_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void neon_fp32_roialign(const ITensor      *input,
+                        ITensor            *output,
+                        const ITensor      *rois,
+                        ROIPoolingLayerInfo pool_info,
+                        const Window       &window,
+                        const ThreadInfo   &info)
 {
     return roi_align<float, float>(input, output, rois, pool_info, window, info);
 }
diff --git a/src/cpu/kernels/roialign/generic/neon/impl.h b/src/cpu/kernels/roialign/generic/neon/impl.h
index e5e6043..db2f677 100644
--- a/src/cpu/kernels/roialign/generic/neon/impl.h
+++ b/src/cpu/kernels/roialign/generic/neon/impl.h
@@ -46,7 +46,7 @@
                                      float          region_end_y,
                                      int            pz)
 {
-    if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+    if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
     {
         return input_data_type(0);
     }
@@ -55,9 +55,9 @@
         const DataLayout data_layout = input->info()->data_layout();
         float            avg         = 0;
         // Iterate through the aligned pooling region
-        for(int iy = 0; iy < grid_size_y; ++iy)
+        for (int iy = 0; iy < grid_size_y; ++iy)
         {
-            for(int ix = 0; ix < grid_size_x; ++ix)
+            for (int ix = 0; ix < grid_size_x; ++ix)
             {
                 // Align the window in the middle of every bin
                 float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
@@ -78,20 +78,28 @@
                 const float w2 = hy * lx;
                 const float w3 = ly * hx;
                 const float w4 = ly * lx;
-                if(data_layout == DataLayout::NCHW)
+                if (data_layout == DataLayout::NCHW)
                 {
-                    const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch)));
-                    const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch)));
-                    const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch)));
-                    const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch)));
+                    const auto data1 = *reinterpret_cast<const input_data_type *>(
+                        input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch)));
+                    const auto data2 = *reinterpret_cast<const input_data_type *>(
+                        input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch)));
+                    const auto data3 = *reinterpret_cast<const input_data_type *>(
+                        input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch)));
+                    const auto data4 = *reinterpret_cast<const input_data_type *>(
+                        input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch)));
                     avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
                 }
                 else
                 {
-                    const auto data1 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch)));
-                    const auto data2 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch)));
-                    const auto data3 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch)));
-                    const auto data4 = *reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch)));
+                    const auto data1 = *reinterpret_cast<const input_data_type *>(
+                        input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch)));
+                    const auto data2 = *reinterpret_cast<const input_data_type *>(
+                        input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch)));
+                    const auto data3 = *reinterpret_cast<const input_data_type *>(
+                        input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch)));
+                    const auto data4 = *reinterpret_cast<const input_data_type *>(
+                        input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch)));
                     avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
                 }
             }
@@ -117,21 +125,21 @@
                                              int                     pz,
                                              const QuantizationInfo &out_qinfo)
 {
-    if((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
+    if ((region_end_x <= region_start_x) || (region_end_y <= region_start_y))
     {
         return input_data_type(out_qinfo.uniform().offset);
     }
     else
     {
-        float                         avg              = 0;
-        const UniformQuantizationInfo input_qinfo      = input->info()->quantization_info().uniform();
-        const bool                    is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type());
-        const DataLayout              data_layout      = input->info()->data_layout();
+        float                         avg         = 0;
+        const UniformQuantizationInfo input_qinfo = input->info()->quantization_info().uniform();
+        const bool       is_qasymm_signed = is_data_type_quantized_asymmetric_signed(input->info()->data_type());
+        const DataLayout data_layout      = input->info()->data_layout();
 
         // Iterate through the aligned pooling region
-        for(int iy = 0; iy < grid_size_y; ++iy)
+        for (int iy = 0; iy < grid_size_y; ++iy)
         {
-            for(int ix = 0; ix < grid_size_x; ++ix)
+            for (int ix = 0; ix < grid_size_x; ++ix)
             {
                 // Align the window in the middle of every bin
                 float y = region_start_y + (iy + 0.5) * bin_size_y / float(grid_size_y);
@@ -153,41 +161,89 @@
                 const float w3 = ly * hx;
                 const float w4 = ly * lx;
 
-                if(data_layout == DataLayout::NCHW)
+                if (data_layout == DataLayout::NCHW)
                 {
-                    if(is_qasymm_signed)
+                    if (is_qasymm_signed)
                     {
-                        float data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
-                        float data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
-                        float data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
-                        float data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
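+                        // Dequantize the four neighbouring samples before applying the bilinear weights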
+                        float data1 =
+                            dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+                                                          Coordinates(x_low, y_low, pz, roi_batch))),
+                                                      input_qinfo);
+                        float data2 =
+                            dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+                                                          Coordinates(x_high, y_low, pz, roi_batch))),
+                                                      input_qinfo);
+                        float data3 =
+                            dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+                                                          Coordinates(x_low, y_high, pz, roi_batch))),
+                                                      input_qinfo);
+                        float data4 =
+                            dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+                                                          Coordinates(x_high, y_high, pz, roi_batch))),
+                                                      input_qinfo);
                         avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
                     }
                     else
                     {
-                        float data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))), input_qinfo);
-                        float data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))), input_qinfo);
-                        float data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))), input_qinfo);
-                        float data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))), input_qinfo);
+                        float data1 =
+                            dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+                                                   input->ptr_to_element(Coordinates(x_low, y_low, pz, roi_batch))),
+                                               input_qinfo);
+                        float data2 =
+                            dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+                                                   input->ptr_to_element(Coordinates(x_high, y_low, pz, roi_batch))),
+                                               input_qinfo);
+                        float data3 =
+                            dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+                                                   input->ptr_to_element(Coordinates(x_low, y_high, pz, roi_batch))),
+                                               input_qinfo);
+                        float data4 =
+                            dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+                                                   input->ptr_to_element(Coordinates(x_high, y_high, pz, roi_batch))),
+                                               input_qinfo);
                         avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
                     }
                 }
                 else
                 {
-                    if(is_qasymm_signed)
+                    if (is_qasymm_signed)
                     {
-                        const auto data1 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
-                        const auto data2 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
-                        const auto data3 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
-                        const auto data4 = dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
+                        const auto data1 =
+                            dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+                                                          Coordinates(pz, x_low, y_low, roi_batch))),
+                                                      input_qinfo);
+                        const auto data2 =
+                            dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+                                                          Coordinates(pz, x_high, y_low, roi_batch))),
+                                                      input_qinfo);
+                        const auto data3 =
+                            dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+                                                          Coordinates(pz, x_low, y_high, roi_batch))),
+                                                      input_qinfo);
+                        const auto data4 =
+                            dequantize_qasymm8_signed(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(
+                                                          Coordinates(pz, x_high, y_high, roi_batch))),
+                                                      input_qinfo);
                         avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
                     }
                     else
                     {
-                        const auto data1 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))), input_qinfo);
-                        const auto data2 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))), input_qinfo);
-                        const auto data3 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))), input_qinfo);
-                        const auto data4 = dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))), input_qinfo);
+                        const auto data1 =
+                            dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+                                                   input->ptr_to_element(Coordinates(pz, x_low, y_low, roi_batch))),
+                                               input_qinfo);
+                        const auto data2 =
+                            dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+                                                   input->ptr_to_element(Coordinates(pz, x_high, y_low, roi_batch))),
+                                               input_qinfo);
+                        const auto data3 =
+                            dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+                                                   input->ptr_to_element(Coordinates(pz, x_low, y_high, roi_batch))),
+                                               input_qinfo);
+                        const auto data4 =
+                            dequantize_qasymm8(*reinterpret_cast<const input_data_type *>(
+                                                   input->ptr_to_element(Coordinates(pz, x_high, y_high, roi_batch))),
+                                               input_qinfo);
                         avg += w1 * data1 + w2 * data2 + w3 * data3 + w4 * data4;
                     }
                 }
@@ -197,7 +253,7 @@
         avg /= grid_size_x * grid_size_y;
 
         input_data_type res = 0;
-        if(is_qasymm_signed)
+        if (is_qasymm_signed)
         {
             res = quantize_qasymm8_signed(avg, out_qinfo);
         }
@@ -215,7 +271,12 @@
 }
 
 template <typename input_data_type, typename roi_data_type>
-void roi_align(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void roi_align(const ITensor      *input,
+               ITensor            *output,
+               const ITensor      *rois,
+               ROIPoolingLayerInfo pool_info,
+               const Window       &window,
+               const ThreadInfo   &info)
 {
     ARM_COMPUTE_UNUSED(info);
 
@@ -240,7 +301,7 @@
 
     const auto             *rois_ptr   = reinterpret_cast<const roi_data_type *>(rois->buffer());
     const QuantizationInfo &rois_qinfo = rois->info()->quantization_info();
-    for(int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
+    for (int roi_indx = roi_list_start; roi_indx < roi_list_end; ++roi_indx)
     {
         const unsigned int roi_batch = rois_ptr[values_per_roi * roi_indx];
 
@@ -252,7 +313,7 @@
         float         x2(qx2);
         float         y1(qy1);
         float         y2(qy2);
-        if(is_qasymm)
+        if (is_qasymm)
         {
             x1 = dequantize_qasymm16(qx1, rois_qinfo);
             x2 = dequantize_qasymm16(qx2, rois_qinfo);
@@ -267,44 +328,47 @@
         float       bin_size_y   = roi_dims_y / pool_info.pooled_height();
 
         // Iterate through all feature maps
-        for(int ch = 0; ch < input_chanels; ++ch)
+        for (int ch = 0; ch < input_chanels; ++ch)
         {
             // Iterate through all output pixels
-            for(int py = 0; py < pooled_h; ++py)
+            for (int py = 0; py < pooled_h; ++py)
             {
-                for(int px = 0; px < pooled_w; ++px)
+                for (int px = 0; px < pooled_w; ++px)
                 {
-                    const float     region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width);
-                    const float     region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height);
-                    const float     region_end_x   = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width);
-                    const float     region_end_y   = compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height);
-                    const int       roi_bin_grid_x = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x));
-                    const int       roi_bin_grid_y = (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_y));
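+                    // Compute the input region covered by this output bin; the sampling grid
+                    // falls back to ceil(bin size) when no explicit sampling ratio is set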
+                    const float region_start_x = compute_region_coordinate(px, bin_size_x, roi_anchor_x, input_width);
+                    const float region_start_y = compute_region_coordinate(py, bin_size_y, roi_anchor_y, input_height);
+                    const float region_end_x = compute_region_coordinate(px + 1, bin_size_x, roi_anchor_x, input_width);
+                    const float region_end_y =
+                        compute_region_coordinate(py + 1, bin_size_y, roi_anchor_y, input_height);
+                    const int roi_bin_grid_x =
+                        (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_x));
+                    const int roi_bin_grid_y =
+                        (pool_info.sampling_ratio() > 0) ? pool_info.sampling_ratio() : int(ceil(bin_size_y));
                     input_data_type out_val(0);
-                    if(is_qasymm)
+                    if (is_qasymm)
                     {
                         out_val = roi_align_1x1_qasymm8<input_data_type>(
-                                      input, roi_batch, region_start_x, bin_size_x,
-                                      roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
-                                      roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info());
+                            input, roi_batch, region_start_x, bin_size_x, roi_bin_grid_x, region_end_x, region_start_y,
+                            bin_size_y, roi_bin_grid_y, region_end_y, ch, output->info()->quantization_info());
                     }
                     else
                     {
-                        out_val = roi_align_1x1<input_data_type>(
-                                      input, roi_batch, region_start_x, bin_size_x,
-                                      roi_bin_grid_x, region_end_x, region_start_y, bin_size_y,
-                                      roi_bin_grid_y, region_end_y, ch);
+                        out_val = roi_align_1x1<input_data_type>(input, roi_batch, region_start_x, bin_size_x,
+                                                                 roi_bin_grid_x, region_end_x, region_start_y,
+                                                                 bin_size_y, roi_bin_grid_y, region_end_y, ch);
                     }
 
-                    if(data_layout == DataLayout::NCHW)
+                    if (data_layout == DataLayout::NCHW)
                     {
-                        auto out_ptr = reinterpret_cast<input_data_type *>(output->ptr_to_element(Coordinates(px, py, ch, roi_indx)));
-                        *out_ptr     = out_val;
+                        auto out_ptr = reinterpret_cast<input_data_type *>(
+                            output->ptr_to_element(Coordinates(px, py, ch, roi_indx)));
+                        *out_ptr = out_val;
                     }
                     else
                     {
-                        auto out_ptr = reinterpret_cast<input_data_type *>(output->ptr_to_element(Coordinates(ch, px, py, roi_indx)));
-                        *out_ptr     = out_val;
+                        auto out_ptr = reinterpret_cast<input_data_type *>(
+                            output->ptr_to_element(Coordinates(ch, px, py, roi_indx)));
+                        *out_ptr = out_val;
                     }
                 }
             }
diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp
index d6bd9a9..11c5770 100644
--- a/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/roialign/generic/neon/qasymm8.cpp
@@ -26,7 +26,12 @@
 {
 namespace cpu
 {
-void neon_qu8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void neon_qu8_roialign(const ITensor      *input,
+                       ITensor            *output,
+                       const ITensor      *rois,
+                       ROIPoolingLayerInfo pool_info,
+                       const Window       &window,
+                       const ThreadInfo   &info)
 {
     return roi_align<uint8_t, uint16_t>(input, output, rois, pool_info, window, info);
 }
diff --git a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp
index a839581..7f93cc8 100644
--- a/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/roialign/generic/neon/qasymm8_signed.cpp
@@ -26,7 +26,12 @@
 {
 namespace cpu
 {
-void neon_qs8_roialign(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+void neon_qs8_roialign(const ITensor      *input,
+                       ITensor            *output,
+                       const ITensor      *rois,
+                       ROIPoolingLayerInfo pool_info,
+                       const Window       &window,
+                       const ThreadInfo   &info)
 {
     return roi_align<int8_t, uint16_t>(input, output, rois, pool_info, window, info);
 }
diff --git a/src/cpu/kernels/roialign/list.h b/src/cpu/kernels/roialign/list.h
index 1c71b02..fdb3c00 100644
--- a/src/cpu/kernels/roialign/list.h
+++ b/src/cpu/kernels/roialign/list.h
@@ -27,9 +27,9 @@
 {
 namespace cpu
 {
-#define DECLARE_ROIALIGN_KERNEL(func_name)                                     \
-    void func_name(const ITensor *input, ITensor *output, const ITensor *rois, \
-                   ROIPoolingLayerInfo pool_info, const Window &window, const ThreadInfo &info)
+#define DECLARE_ROIALIGN_KERNEL(func_name)                                                                    \
+    void func_name(const ITensor *input, ITensor *output, const ITensor *rois, ROIPoolingLayerInfo pool_info, \
+                   const Window &window, const ThreadInfo &info)
 DECLARE_ROIALIGN_KERNEL(neon_fp32_roialign);
 DECLARE_ROIALIGN_KERNEL(neon_fp16_roialign);
 DECLARE_ROIALIGN_KERNEL(neon_qu8_roialign);
diff --git a/src/cpu/kernels/scale/neon/fp16.cpp b/src/cpu/kernels/scale/neon/fp16.cpp
index 895f422..bd01569 100644
--- a/src/cpu/kernels/scale/neon/fp16.cpp
+++ b/src/cpu/kernels/scale/neon/fp16.cpp
@@ -27,9 +27,10 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
 
@@ -41,8 +42,12 @@
 {
 namespace
 {
-void fp16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
-                             float sampling_offset, bool align_corners, const Window &window)
+void fp16_neon_scale_nearest(const ITensor *src,
+                             ITensor       *dst,
+                             const ITensor *offsets,
+                             float          sampling_offset,
+                             bool           align_corners,
+                             const Window  &window)
 {
     const size_t in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
     const size_t in_stride_w  = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -62,33 +67,46 @@
     const uint8_t     *in_ptr_start        = src->buffer() + src->info()->offset_first_element_in_bytes();
     const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        const int32_t    offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
-        const auto       in_hi      = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
-        const int        offset_row = in_hi * in_stride_wc;
-        int32_t          x          = window_start_x;
-        const float16_t *in_ptr     = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
+        {
+            const int32_t offset =
+                *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+            const auto in_hi = static_cast<int>(
+                align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+                              : std::floor((id.z() + sampling_offset) * hr));
+            const int        offset_row = in_hi * in_stride_wc;
+            int32_t          x          = window_start_x;
+            const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
 
-        for(; x <= window_end_x - window_step_x; x += window_step_x)
-        {
-            wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x,
-                            wrapper::vloadq(in_ptr + offset + offset_row + x));
-        }
-        for(; x < window_end_x; ++x)
-        {
-            *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
-        }
-    },
-    out);
+            for (; x <= window_end_x - window_step_x; x += window_step_x)
+            {
+                wrapper::vstore(reinterpret_cast<float16_t *>(out.ptr()) + x,
+                                wrapper::vloadq(in_ptr + offset + offset_row + x));
+            }
+            for (; x < window_end_x; ++x)
+            {
+                *(reinterpret_cast<float16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+            }
+        },
+        out);
 }
 
-void fp16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                              BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                              bool align_corners, const Window &window)
+void fp16_neon_scale_bilinear(const ITensor *src,
+                              ITensor       *dst,
+                              const ITensor *offsets,
+                              const ITensor *dx,
+                              const ITensor *dy,
+                              BorderMode     border_mode,
+                              PixelValue     constant_border_value,
+                              float          sampling_offset,
+                              bool           align_corners,
+                              const Window  &window)
 {
     // Compute the ratio between source height and destination height
-    const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+    const auto hr =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
 
     Iterator  out(dst, window);
     const int in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
@@ -103,68 +121,97 @@
     win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
     Iterator in(src, win_in);
 
-    if(border_mode == BorderMode::CONSTANT)
+    if (border_mode == BorderMode::CONSTANT)
     {
         using ConstType = typename std::conditional<std::is_same<float16_t, float16_t>::value, half, float16_t>::type;
 
         const float16_t const_border_value = static_cast<float16_t>(constant_border_value.get<ConstType>());
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const auto       offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
-            const auto       dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
-            const auto       dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
-            const int32_t    in_hi  = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-            const float16_t *in_ptr = reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const auto offset =
+                    *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+                const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+                const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+                const int32_t    in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+                const float16_t *in_ptr =
+                    reinterpret_cast<const float16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
 
-            const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
-            const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
-            const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
-            const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
+                const auto a00 =
+                    (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+                const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h)
+                                     ? *(in_ptr + in_stride_c)
+                                     : const_border_value;
+                const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1)
+                                     ? *(in_ptr + in_stride_wc)
+                                     : const_border_value;
+                const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1)
+                                     ? *(in_ptr + in_stride_c + in_stride_wc)
+                                     : const_border_value;
 
-            *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
-        },
-        in, out);
+                *reinterpret_cast<float16_t *>(out.ptr()) =
+                    static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+            },
+            in, out);
     }
-    else if(border_mode == BorderMode::REPLICATE)
+    else if (border_mode == BorderMode::REPLICATE)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
-            const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
-            const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
-            const int  in_hi  = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const auto offset =
+                    *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+                const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+                const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+                const int  in_hi  = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
 
-            auto clamped_w  = utility::clamp<int>(offset, 0, in_dim_w - 1);
-            auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
-            auto clamped_h  = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
-            auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
+                auto clamped_w  = utility::clamp<int>(offset, 0, in_dim_w - 1);
+                auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
+                auto clamped_h  = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
+                auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
 
-            const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
-            const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
-            const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
-            const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
+                const auto a00 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c +
+                                   clamped_h * in_stride_wc);
+                const auto a01 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+                                   clamped_h * in_stride_wc);
+                const auto a10 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w * in_stride_c +
+                                   clamped_h1 * in_stride_wc);
+                const auto a11 = *(reinterpret_cast<const float16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+                                   clamped_h1 * in_stride_wc);
 
-            *reinterpret_cast<float16_t *>(out.ptr()) = static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
-        },
-        in, out);
+                *reinterpret_cast<float16_t *>(out.ptr()) =
+                    static_cast<float16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+            },
+            in, out);
     }
     else
     {
         ARM_COMPUTE_ERROR("Not implemented");
     }
 }
-}
+} // namespace
 namespace cpu
 {
-void fp16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                     InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                     bool align_corners, const Window &window)
+void fp16_neon_scale(const ITensor      *src,
+                     ITensor            *dst,
+                     const ITensor      *offsets,
+                     const ITensor      *dx,
+                     const ITensor      *dy,
+                     InterpolationPolicy policy,
+                     BorderMode          border_mode,
+                     PixelValue          constant_border_value,
+                     float               sampling_offset,
+                     bool                align_corners,
+                     const Window       &window)
 {
-    if(policy == InterpolationPolicy::BILINEAR)
+    if (policy == InterpolationPolicy::BILINEAR)
     {
-        fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+        fp16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+                                 align_corners, window);
     }
-    else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
         fp16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
     }
@@ -172,4 +219,4 @@
 } // namespace cpu
 } // namespace arm_compute
 
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/scale/neon/integer.cpp b/src/cpu/kernels/scale/neon/integer.cpp
index 2ab14cf..bbf92e0 100644
--- a/src/cpu/kernels/scale/neon/integer.cpp
+++ b/src/cpu/kernels/scale/neon/integer.cpp
@@ -22,8 +22,9 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
-#include "src/core/NEON/wrapper/wrapper.h"
+
 #include "src/core/helpers/ScaleHelpers.h"
+#include "src/core/NEON/wrapper/wrapper.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
 
@@ -33,8 +34,12 @@
 {
 namespace
 {
-void u8_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
-                           float sampling_offset, bool align_corners, const Window &window)
+void u8_neon_scale_nearest(const ITensor *src,
+                           ITensor       *dst,
+                           const ITensor *offsets,
+                           float          sampling_offset,
+                           bool           align_corners,
+                           const Window  &window)
 {
     const size_t in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
     const size_t in_stride_w  = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -54,43 +59,58 @@
     const uint8_t     *in_ptr_start        = src->buffer() + src->info()->offset_first_element_in_bytes();
     const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        const int32_t  offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
-        const auto     in_hi      = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
-        const int      offset_row = in_hi * in_stride_wc;
-        int32_t        x          = window_start_x;
-        const uint8_t *in_ptr     = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
+        {
+            const int32_t offset =
+                *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+            const auto in_hi = static_cast<int>(
+                align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+                              : std::floor((id.z() + sampling_offset) * hr));
+            const int      offset_row = in_hi * in_stride_wc;
+            int32_t        x          = window_start_x;
+            const uint8_t *in_ptr     = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
 
-        for(; x <= window_end_x - window_step_x; x += window_step_x)
-        {
-            wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x,
-                            wrapper::vloadq(in_ptr + offset + offset_row + x));
-        }
-        for(; x < window_end_x; ++x)
-        {
-            *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
-        }
-    },
-    out);
+            for (; x <= window_end_x - window_step_x; x += window_step_x)
+            {
+                wrapper::vstore(reinterpret_cast<uint8_t *>(out.ptr()) + x,
+                                wrapper::vloadq(in_ptr + offset + offset_row + x));
+            }
+            for (; x < window_end_x; ++x)
+            {
+                *(reinterpret_cast<uint8_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+            }
+        },
+        out);
 }
 
-void u8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                            BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                            bool align_corners, const Window &window)
+void u8_neon_scale_bilinear(const ITensor *src,
+                            ITensor       *dst,
+                            const ITensor *offsets,
+                            const ITensor *dx,
+                            const ITensor *dy,
+                            BorderMode     border_mode,
+                            PixelValue     constant_border_value,
+                            float          sampling_offset,
+                            bool           align_corners,
+                            const Window  &window)
 {
     // Compute the ratio between source and destination dimensions
-    const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
-    const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+    const float scale_x =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+    const float scale_y =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
 
     const int input_width  = src->info()->dimension(1);
     const int input_height = src->info()->dimension(2);
 
-    if(border_mode == BorderMode::CONSTANT)
+    if (border_mode == BorderMode::CONSTANT)
     {
         Iterator  out(dst, window);
-        const int in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
-        const int in_stride_wc = in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom);
+        const int in_stride_c = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
+        const int in_stride_wc =
+            in_stride_c * (input_width + src->info()->padding().top + src->info()->padding().bottom);
 
         // Don't increment in Y and Z direction for the input tensor
         // A pointer to the start of this plane is needed as base for the precomputed offsets
@@ -100,24 +120,37 @@
         Iterator in(src, win_in);
 
         const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const auto     offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
-            const auto     dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
-            const auto     dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
-            const int32_t  in_hi  = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset);
-            const uint8_t *in_ptr = reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const auto offset =
+                    *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+                const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+                const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+                const int32_t  in_hi = std::floor((id.z() + sampling_offset) * scale_y - sampling_offset);
+                const uint8_t *in_ptr =
+                    reinterpret_cast<const uint8_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
 
-            const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height) ? *in_ptr : const_border_value;
-            const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height) ? *(in_ptr + in_stride_c) : const_border_value;
-            const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
-            const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
+                const auto a00 = (0 <= offset && offset < input_width && 0 <= in_hi && in_hi < input_height)
+                                     ? *in_ptr
+                                     : const_border_value;
+                const auto a01 = (-1 <= offset && offset < input_width - 1 && 0 <= in_hi && in_hi < input_height)
+                                     ? *(in_ptr + in_stride_c)
+                                     : const_border_value;
+                const auto a10 = (0 <= offset && offset < input_width && -1 <= in_hi && in_hi < input_height - 1)
+                                     ? *(in_ptr + in_stride_wc)
+                                     : const_border_value;
+                const auto a11 = (-1 <= offset && offset < input_width - 1 && -1 <= in_hi && in_hi < input_height - 1)
+                                     ? *(in_ptr + in_stride_c + in_stride_wc)
+                                     : const_border_value;
 
-            *reinterpret_cast<uint8_t *>(out.ptr()) = static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
-        },
-        in, out);
+                *reinterpret_cast<uint8_t *>(out.ptr()) =
+                    static_cast<uint8_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+            },
+            in, out);
     }
-    else if(border_mode == BorderMode::REPLICATE)
+    else if (border_mode == BorderMode::REPLICATE)
     {
         using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
 
@@ -152,12 +185,12 @@
         const float fp_coord_offset_y = sampling_offset * (scale_y - 1);
         const float fp_coord_offset_x = sampling_offset * (scale_x - 1);
 
-        for(int bo = bo_start; bo < bo_end; bo += bo_step)
+        for (int bo = bo_start; bo < bo_end; bo += bo_step)
         {
             const uint8_t *in_ptr  = in.ptr() + bo * in_stride_b;
             uint8_t       *out_ptr = out.ptr() + bo * out_stride_b;
 
-            for(int yo = yo_start; yo < yo_end; yo += yo_step)
+            for (int yo = yo_start; yo < yo_end; yo += yo_step)
             {
                 // Floating-point coordinate
                 const float yi_f = yo * scale_y + fp_coord_offset_y;
@@ -174,7 +207,7 @@
                 const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
 
                 uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
-                for(int xo = xo_start; xo < xo_end; xo += xo_step)
+                for (int xo = xo_start; xo < xo_end; xo += xo_step)
                 {
                     // Floating-point coordinate
                     const float xi_f = xo * scale_x + fp_coord_offset_x;
@@ -205,7 +238,7 @@
                     uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
 
                     int cout = 0;
-                    for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+                    for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
                     {
                         const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
                         const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
@@ -270,19 +303,21 @@
                         const auto out_2_int = wrapper::vcvta<uint32_t>(out_2);
                         const auto out_3_int = wrapper::vcvta<uint32_t>(out_3);
 #else  // defined(__aarch64__) && !defined(BARE_METAL)
-                        const auto out_0_int = wrapper::vcvt<uint32_t>(out_0);
-                        const auto out_1_int = wrapper::vcvt<uint32_t>(out_1);
-                        const auto out_2_int = wrapper::vcvt<uint32_t>(out_2);
-                        const auto out_3_int = wrapper::vcvt<uint32_t>(out_3);
+                        const auto out_0_int                      = wrapper::vcvt<uint32_t>(out_0);
+                        const auto out_1_int                      = wrapper::vcvt<uint32_t>(out_1);
+                        const auto out_2_int                      = wrapper::vcvt<uint32_t>(out_2);
+                        const auto out_3_int                      = wrapper::vcvt<uint32_t>(out_3);
 #endif // defined(__aarch64__) && !defined(BARE_METAL)
-                        const auto low_part  = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
-                        const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
-                        const auto out       = wrapper::vcombine(low_part, high_part);
+                        const auto low_part =
+                            wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+                        const auto high_part =
+                            wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+                        const auto out = wrapper::vcombine(low_part, high_part);
 
                         wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out);
                     }
 
-                    for(; cout < out_dim_ch; ++cout)
+                    for (; cout < out_dim_ch; ++cout)
                     {
                         const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
                         const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
@@ -311,18 +346,27 @@
     }
 }
 
-void s8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                            BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                            bool align_corners, const Window &window)
+void s8_neon_scale_bilinear(const ITensor *src,
+                            ITensor       *dst,
+                            const ITensor *offsets,
+                            const ITensor *dx,
+                            const ITensor *dy,
+                            BorderMode     border_mode,
+                            PixelValue     constant_border_value,
+                            float          sampling_offset,
+                            bool           align_corners,
+                            const Window  &window)
 {
     ARM_COMPUTE_UNUSED(dx, dy, offsets, constant_border_value);
-    if(border_mode == BorderMode::REPLICATE)
+    if (border_mode == BorderMode::REPLICATE)
     {
         using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
 
         // Compute the ratio between source and destination dimensions
-        const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
-        const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+        const float scale_x =
+            scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+        const float scale_y =
+            scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
 
         const int     in_stride_x  = src->info()->strides_in_bytes()[1];
         const int     in_stride_y  = src->info()->strides_in_bytes()[2];
@@ -356,12 +400,12 @@
         const float fp_coord_offset_y = sampling_offset * (scale_y - 1);
         const float fp_coord_offset_x = sampling_offset * (scale_x - 1);
 
-        for(int bo = bo_start; bo < bo_end; bo += bo_step)
+        for (int bo = bo_start; bo < bo_end; bo += bo_step)
         {
             const int8_t *in_ptr  = reinterpret_cast<int8_t *>(in.ptr() + bo * in_stride_b);
             int8_t       *out_ptr = reinterpret_cast<int8_t *>(out.ptr() + bo * out_stride_b);
 
-            for(int yo = yo_start; yo < yo_end; yo += yo_step)
+            for (int yo = yo_start; yo < yo_end; yo += yo_step)
             {
                 // Floating-point coordinate
                 const float yi_f = yo * scale_y + fp_coord_offset_y;
@@ -378,7 +422,7 @@
                 const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
 
                 int8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
-                for(int xo = xo_start; xo < xo_end; xo += xo_step)
+                for (int xo = xo_start; xo < xo_end; xo += xo_step)
                 {
                     // Floating-point coordinate
                     const float xi_f = xo * scale_x + fp_coord_offset_x;
@@ -409,7 +453,7 @@
                     int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
 
                     int cout = 0;
-                    for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+                    for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
                     {
                         const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
                         const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
@@ -479,14 +523,16 @@
                         const auto out_2_int                      = wrapper::vcvt<int32_t>(out_2);
                         const auto out_3_int                      = wrapper::vcvt<int32_t>(out_3);
 #endif // defined(__aarch64__) && !defined(BARE_METAL)
-                        const auto low_part  = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
-                        const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
-                        const auto out       = wrapper::vcombine(low_part, high_part);
+                        const auto low_part =
+                            wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+                        const auto high_part =
+                            wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+                        const auto out = wrapper::vcombine(low_part, high_part);
 
                         wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out);
                     }
 
-                    for(; cout < out_dim_ch; ++cout)
+                    for (; cout < out_dim_ch; ++cout)
                     {
                         const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
                         const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
@@ -515,8 +561,12 @@
     }
 }
 
-void s16_neon_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
-                            float sampling_offset, bool align_corners, const Window &window)
+void s16_neon_scale_nearest(const ITensor *src,
+                            ITensor       *dst,
+                            const ITensor *offsets,
+                            float          sampling_offset,
+                            bool           align_corners,
+                            const Window  &window)
 {
     const size_t in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
     const size_t in_stride_w  = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -536,33 +586,46 @@
     const uint8_t     *in_ptr_start        = src->buffer() + src->info()->offset_first_element_in_bytes();
     const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        const int32_t  offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
-        const auto     in_hi      = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
-        const int      offset_row = in_hi * in_stride_wc;
-        int32_t        x          = window_start_x;
-        const int16_t *in_ptr     = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
+        {
+            const int32_t offset =
+                *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+            const auto in_hi = static_cast<int>(
+                align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+                              : std::floor((id.z() + sampling_offset) * hr));
+            const int      offset_row = in_hi * in_stride_wc;
+            int32_t        x          = window_start_x;
+            const int16_t *in_ptr     = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
 
-        for(; x <= window_end_x - window_step_x; x += window_step_x)
-        {
-            wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x,
-                            wrapper::vloadq(in_ptr + offset + offset_row + x));
-        }
-        for(; x < window_end_x; ++x)
-        {
-            *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
-        }
-    },
-    out);
+            for (; x <= window_end_x - window_step_x; x += window_step_x)
+            {
+                wrapper::vstore(reinterpret_cast<int16_t *>(out.ptr()) + x,
+                                wrapper::vloadq(in_ptr + offset + offset_row + x));
+            }
+            for (; x < window_end_x; ++x)
+            {
+                *(reinterpret_cast<int16_t *>(out.ptr()) + x) = *(in_ptr + offset + offset_row + x);
+            }
+        },
+        out);
 }
 
-void s16_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                             BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                             bool align_corners, const Window &window)
+void s16_neon_scale_bilinear(const ITensor *src,
+                             ITensor       *dst,
+                             const ITensor *offsets,
+                             const ITensor *dx,
+                             const ITensor *dy,
+                             BorderMode     border_mode,
+                             PixelValue     constant_border_value,
+                             float          sampling_offset,
+                             bool           align_corners,
+                             const Window  &window)
 {
     // Compute the ratio between source height and destination height
-    const auto hr = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+    const auto hr =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
 
     Iterator  out(dst, window);
     const int in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
@@ -577,64 +640,93 @@
     win_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
     Iterator in(src, win_in);
 
-    if(border_mode == BorderMode::CONSTANT)
+    if (border_mode == BorderMode::CONSTANT)
     {
         const int16_t const_border_value = static_cast<int16_t>(constant_border_value.get<int16_t>());
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const auto     offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
-            const auto     dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
-            const auto     dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
-            const int32_t  in_hi  = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
-            const int16_t *in_ptr = reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const auto offset =
+                    *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+                const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+                const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+                const int32_t  in_hi = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+                const int16_t *in_ptr =
+                    reinterpret_cast<const int16_t *>(in.ptr()) + offset * in_stride_c + in_hi * in_stride_wc;
 
-            const auto a00 = (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
-            const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h) ? *(in_ptr + in_stride_c) : const_border_value;
-            const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_wc) : const_border_value;
-            const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1) ? *(in_ptr + in_stride_c + in_stride_wc) : const_border_value;
+                const auto a00 =
+                    (0 <= offset && offset < in_dim_w && 0 <= in_hi && in_hi < in_dim_h) ? *in_ptr : const_border_value;
+                const auto a01 = (-1 <= offset && offset < in_dim_w - 1 && 0 <= in_hi && in_hi < in_dim_h)
+                                     ? *(in_ptr + in_stride_c)
+                                     : const_border_value;
+                const auto a10 = (0 <= offset && offset < in_dim_w && -1 <= in_hi && in_hi < in_dim_h - 1)
+                                     ? *(in_ptr + in_stride_wc)
+                                     : const_border_value;
+                const auto a11 = (-1 <= offset && offset < in_dim_w - 1 && -1 <= in_hi && in_hi < in_dim_h - 1)
+                                     ? *(in_ptr + in_stride_c + in_stride_wc)
+                                     : const_border_value;
 
-            *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
-        },
-        in, out);
+                *reinterpret_cast<int16_t *>(out.ptr()) =
+                    static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+            },
+            in, out);
     }
-    else if(border_mode == BorderMode::REPLICATE)
+    else if (border_mode == BorderMode::REPLICATE)
     {
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const auto offset = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
-            const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
-            const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
-            const int  in_hi  = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const auto offset =
+                    *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z())));
+                const auto dx_val = *reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id.y(), id.z())));
+                const auto dy_val = *reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id.y(), id.z())));
+                const int  in_hi  = std::floor((id.z() + sampling_offset) * hr - sampling_offset);
 
-            const auto clamped_w  = utility::clamp<int>(offset, 0, in_dim_w - 1);
-            const auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
-            const auto clamped_h  = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
-            const auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
+                const auto clamped_w  = utility::clamp<int>(offset, 0, in_dim_w - 1);
+                const auto clamped_w1 = utility::clamp<int>(offset + 1, 0, in_dim_w - 1);
+                const auto clamped_h  = utility::clamp<int>(in_hi, 0, in_dim_h - 1);
+                const auto clamped_h1 = utility::clamp<int>(in_hi + 1, 0, in_dim_h - 1);
 
-            const auto a00 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
-            const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h * in_stride_wc);
-            const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h1 * in_stride_wc);
-            const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c + clamped_h1 * in_stride_wc);
+                const auto a00 =
+                    *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c + clamped_h * in_stride_wc);
+                const auto a01 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+                                   clamped_h * in_stride_wc);
+                const auto a10 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w * in_stride_c +
+                                   clamped_h1 * in_stride_wc);
+                const auto a11 = *(reinterpret_cast<const int16_t *>(in.ptr()) + clamped_w1 * in_stride_c +
+                                   clamped_h1 * in_stride_wc);
 
-            *reinterpret_cast<int16_t *>(out.ptr()) = static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
-        },
-        in, out);
+                *reinterpret_cast<int16_t *>(out.ptr()) =
+                    static_cast<int16_t>(scale_helpers::delta_bilinear(a00, a01, a10, a11, dx_val, dy_val));
+            },
+            in, out);
     }
     else
     {
         ARM_COMPUTE_ERROR("Not implemented");
     }
 }
-}
+} // namespace
 namespace cpu
 {
-void s8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                   InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                   bool align_corners, const Window &window)
+void s8_neon_scale(const ITensor      *src,
+                   ITensor            *dst,
+                   const ITensor      *offsets,
+                   const ITensor      *dx,
+                   const ITensor      *dy,
+                   InterpolationPolicy policy,
+                   BorderMode          border_mode,
+                   PixelValue          constant_border_value,
+                   float               sampling_offset,
+                   bool                align_corners,
+                   const Window       &window)
 {
-    if(policy == InterpolationPolicy::BILINEAR)
+    if (policy == InterpolationPolicy::BILINEAR)
     {
-        s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+        s8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+                               align_corners, window);
     }
     else
     {
@@ -642,32 +734,50 @@
     }
 }
 
-void u8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                   InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                   bool align_corners, const Window &window)
+void u8_neon_scale(const ITensor      *src,
+                   ITensor            *dst,
+                   const ITensor      *offsets,
+                   const ITensor      *dx,
+                   const ITensor      *dy,
+                   InterpolationPolicy policy,
+                   BorderMode          border_mode,
+                   PixelValue          constant_border_value,
+                   float               sampling_offset,
+                   bool                align_corners,
+                   const Window       &window)
 {
-    if(policy == InterpolationPolicy::BILINEAR)
+    if (policy == InterpolationPolicy::BILINEAR)
     {
-        u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+        u8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+                               align_corners, window);
     }
-    else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
         u8_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
     }
 }
 
-void s16_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                    InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                    bool align_corners, const Window &window)
+void s16_neon_scale(const ITensor      *src,
+                    ITensor            *dst,
+                    const ITensor      *offsets,
+                    const ITensor      *dx,
+                    const ITensor      *dy,
+                    InterpolationPolicy policy,
+                    BorderMode          border_mode,
+                    PixelValue          constant_border_value,
+                    float               sampling_offset,
+                    bool                align_corners,
+                    const Window       &window)
 {
-    if(policy == InterpolationPolicy::BILINEAR)
+    if (policy == InterpolationPolicy::BILINEAR)
     {
-        s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+        s16_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+                                align_corners, window);
     }
-    else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
         s16_neon_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
     }
 }
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
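
Beyond line wrapping, integer.cpp and fp16.cpp show three non-wrapping changes: include blocks are merged and re-sorted case-insensitively (moving src/core/helpers/ScaleHelpers.h ahead of src/core/NEON/... and separating the arm_compute/ group with a blank line), bare closing braces of anonymous namespaces gain a '// namespace' trailer, and both files now end with a final newline, which clang-format emits whenever it rewrites a file. Again a hedged sketch of plausible clang-format 14 options, with the include-grouping regexes assumed rather than known:

# Hypothetical continuation of the reconstructed .clang-format above.
FixNamespaceComments: true      # '}' -> '} // namespace', also for anonymous namespaces
SortIncludes: CaseInsensitive   # 'helpers/ScaleHelpers.h' sorts before 'NEON/NEMath.h'
IncludeBlocks: Regroup          # merge adjacent include blocks, then regroup
IncludeCategories:              # assumed grouping; the exact regexes are not in this commit
  - Regex: '^"arm_compute/'
    Priority: 1
  - Regex: '^"(src|support)/'
    Priority: 2
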
diff --git a/src/cpu/kernels/scale/neon/list.h b/src/cpu/kernels/scale/neon/list.h
index 28a1087..0fe87d1 100644
--- a/src/cpu/kernels/scale/neon/list.h
+++ b/src/cpu/kernels/scale/neon/list.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
@@ -34,10 +35,10 @@
 {
 namespace cpu
 {
-#define DECLARE_SCALE_KERNEL(func_name)                                                                                         \
-    void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,              \
-                   InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \
-                   bool align_corners, const Window &window)
+#define DECLARE_SCALE_KERNEL(func_name)                                                                            \
+    void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
+                   InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value,           \
+                   float sampling_offset, bool align_corners, const Window &window)
 
 DECLARE_SCALE_KERNEL(s16_neon_scale);
 DECLARE_SCALE_KERNEL(u8_neon_scale);
@@ -48,14 +49,20 @@
 #undef DECLARE_SCALE_KERNEL
 
 template <typename T>
-void nearest_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, float sampling_offset,
-                        bool align_corners, const Window &window)
+void nearest_neon_scale(const ITensor *src,
+                        ITensor       *dst,
+                        const ITensor *offsets,
+                        float          sampling_offset,
+                        bool           align_corners,
+                        const Window  &window)
 {
     ARM_COMPUTE_UNUSED(offsets);
 
     // Compute the ratio between source and destination dimensions
-    const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
-    const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+    const float scale_x =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+    const float scale_y =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
 
     const int in_stride_y  = src->info()->strides_in_bytes()[1];
     const int in_stride_z  = src->info()->strides_in_bytes()[2];
@@ -84,17 +91,17 @@
     const int bo_end   = window_execution[3].end();
     const int bo_step  = window_execution[3].step();
 
-    for(int bo = bo_start; bo < bo_end; bo += bo_step)
+    for (int bo = bo_start; bo < bo_end; bo += bo_step)
     {
         const uint8_t *in_ptr_base  = in.ptr() + bo * in_stride_w;
         uint8_t       *out_ptr_base = out.ptr() + bo * out_stride_w;
 
-        for(int yo = yo_start; yo < yo_end; yo += yo_step)
+        for (int yo = yo_start; yo < yo_end; yo += yo_step)
         {
             // Floating-point coordinate
             float yi_f = ((yo + sampling_offset) * scale_y);
             int   yi   = 0;
-            if(align_corners)
+            if (align_corners)
             {
                 yi = utils::rounding::round_half_away_from_zero(yi_f);
             }
@@ -103,12 +110,12 @@
                 yi = static_cast<int>(std::floor(yi_f));
             }
 
-            for(int xo = xo_start; xo < xo_end; xo += xo_step)
+            for (int xo = xo_start; xo < xo_end; xo += xo_step)
             {
                 // Floating-point coordinate
                 float xi_f = ((xo + sampling_offset) * scale_x);
                 int   xi   = 0;
-                if(align_corners)
+                if (align_corners)
                 {
                     xi = utils::rounding::round_half_away_from_zero(xi_f);
                 }
@@ -121,15 +128,15 @@
                 uint8_t       *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
 
                 int cout = 0;
-                for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+                for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
                 {
                     auto out0 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
                     wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0);
                 }
 
-                for(; cout < out_dim_ch; ++cout)
+                for (; cout < out_dim_ch; ++cout)
                 {
-                    auto out0                                            = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
+                    auto out0 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
                     *(reinterpret_cast<T *>(out_ptr + cout * sizeof(T))) = out0;
                 }
             }
@@ -138,9 +145,16 @@
 }
 
 template <typename T>
-void bilinear_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                         BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                         bool align_corners, const Window &window)
+void bilinear_neon_scale(const ITensor *src,
+                         ITensor       *dst,
+                         const ITensor *offsets,
+                         const ITensor *dx,
+                         const ITensor *dy,
+                         BorderMode     border_mode,
+                         PixelValue     constant_border_value,
+                         float          sampling_offset,
+                         bool           align_corners,
+                         const Window  &window)
 {
     ARM_COMPUTE_UNUSED(offsets);
     ARM_COMPUTE_UNUSED(dx);
@@ -148,8 +162,10 @@
     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
 
     // Compute the ratio between source and destination dimensions
-    const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
-    const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+    const float scale_x =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+    const float scale_y =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
 
     const int in_stride_y  = src->info()->strides_in_bytes()[1];
     const int in_stride_z  = src->info()->strides_in_bytes()[2];
@@ -180,7 +196,7 @@
     const int bo_end   = window_execution[3].end();
     const int bo_step  = window_execution[3].step();
 
-    if(border_mode == BorderMode::CONSTANT)
+    if (border_mode == BorderMode::CONSTANT)
     {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         using ConstType = typename std::conditional<std::is_same<T, float16_t>::value, half, T>::type;
@@ -189,12 +205,12 @@
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         const T const_border_value = static_cast<T>(constant_border_value.get<ConstType>());
 
-        for(int bo = bo_start; bo < bo_end; bo += bo_step)
+        for (int bo = bo_start; bo < bo_end; bo += bo_step)
         {
             const uint8_t *in_ptr_base  = in.ptr() + bo * in_stride_w;
             uint8_t       *out_ptr_base = out.ptr() + bo * out_stride_w;
 
-            for(int yo = yo_start; yo < yo_end; yo += yo_step)
+            for (int yo = yo_start; yo < yo_end; yo += yo_step)
             {
                 // Floating-point coordinate
                 const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
@@ -204,7 +220,7 @@
                 const auto a1 = (yi_f - static_cast<float>(yi));
                 const auto b1 = (1.f - a1);
 
-                for(int xo = xo_start; xo < xo_end; xo += xo_step)
+                for (int xo = xo_start; xo < xo_end; xo += xo_step)
                 {
                     // Floating-point coordinate
                     const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
@@ -223,32 +239,35 @@
                     uint8_t       *out_ptr = out_ptr_base + xo * out_stride_y + yo * out_stride_z;
 
                     int cout = 0;
-                    for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+                    for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
                     {
                         auto in00 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
                         auto in01 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
                         auto in10 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
                         auto in11 = wrapper::vdup_n(static_cast<T>(const_border_value), ExactTagType{});
-                        if((yi >= 0) && (yi < in_dim_h))
+                        if ((yi >= 0) && (yi < in_dim_h))
                         {
-                            if((xi >= 0) && (xi < in_dim_w))
+                            if ((xi >= 0) && (xi < in_dim_w))
                             {
                                 in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
                             }
-                            if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+                            if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
                             {
-                                in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
+                                in01 = wrapper::vloadq(
+                                    reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
                             }
                         }
-                        if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
+                        if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
                         {
-                            if((xi >= 0) && (xi < in_dim_w))
+                            if ((xi >= 0) && (xi < in_dim_w))
                             {
-                                in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
+                                in10 = wrapper::vloadq(
+                                    reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
                             }
-                            if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+                            if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
                             {
-                                in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
+                                in11 = wrapper::vloadq(
+                                    reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
                             }
                         }
 
@@ -264,32 +283,33 @@
                         wrapper::vstore(reinterpret_cast<T *>(out_ptr + cout * sizeof(T)), out0);
                     }
 
-                    for(; cout < out_dim_ch; ++cout)
+                    for (; cout < out_dim_ch; ++cout)
                     {
                         auto in00 = static_cast<T>(const_border_value);
                         auto in01 = static_cast<T>(const_border_value);
                         auto in10 = static_cast<T>(const_border_value);
                         auto in11 = static_cast<T>(const_border_value);
-                        if((yi >= 0) && (yi < in_dim_h))
+                        if ((yi >= 0) && (yi < in_dim_h))
                         {
-                            if((xi >= 0) && (xi < in_dim_w))
+                            if ((xi >= 0) && (xi < in_dim_w))
                             {
                                 in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T)));
                             }
-                            if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+                            if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
                             {
                                 in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y));
                             }
                         }
-                        if(((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
+                        if (((yi + 1) >= 0) && ((yi + 1) < in_dim_h))
                         {
-                            if((xi >= 0) && (xi < in_dim_w))
+                            if ((xi >= 0) && (xi < in_dim_w))
                             {
                                 in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_z));
                             }
-                            if(((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
+                            if (((xi + 1) >= 0) && ((xi + 1) < in_dim_w))
                             {
-                                in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
+                                in11 = *(
+                                    reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + in_stride_y + in_stride_z));
                             }
                         }
                         auto out0 = static_cast<T>(0);
@@ -303,14 +323,14 @@
             }
         }
     }
-    else if(border_mode == BorderMode::REPLICATE)
+    else if (border_mode == BorderMode::REPLICATE)
     {
-        for(int bo = bo_start; bo < bo_end; bo += bo_step)
+        for (int bo = bo_start; bo < bo_end; bo += bo_step)
         {
             const uint8_t *in_ptr  = in.ptr() + bo * in_stride_w;
             uint8_t       *out_ptr = out.ptr() + bo * out_stride_w;
 
-            for(int yo = yo_start; yo < yo_end; yo += yo_step)
+            for (int yo = yo_start; yo < yo_end; yo += yo_step)
             {
                 // Floating-point coordinate
                 const float yi_f = ((yo + sampling_offset) * scale_y - sampling_offset);
@@ -327,7 +347,7 @@
                 const int yi1_offset = yi1 * in_stride_z;
 
                 const int y_offset = yo * out_stride_z;
-                for(int xo = xo_start; xo < xo_end; xo += xo_step)
+                for (int xo = xo_start; xo < xo_end; xo += xo_step)
                 {
                     // Floating-point coordinate
                     const float xi_f = ((xo + sampling_offset) * scale_x - sampling_offset);
@@ -356,12 +376,16 @@
                     const int offset = xo * out_stride_y + y_offset;
 
                     int cout = 0;
-                    for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+                    for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
                     {
-                        const auto in00 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
-                        const auto in01 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
-                        const auto in10 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
-                        const auto in11 = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
+                        const auto in00 = wrapper::vloadq(
+                            reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
+                        const auto in01 = wrapper::vloadq(
+                            reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
+                        const auto in10 = wrapper::vloadq(
+                            reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
+                        const auto in11 = wrapper::vloadq(
+                            reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
 
                         auto out0 = wrapper::vmul(in00, s00);
                         out0      = wrapper::vmla(out0, in01, s01);
@@ -370,12 +394,16 @@
                         wrapper::vstore(reinterpret_cast<T *>(out_ptr + offset + cout * sizeof(T)), out0);
                     }
 
-                    for(; cout < out_dim_ch; ++cout)
+                    for (; cout < out_dim_ch; ++cout)
                     {
-                        const T in00 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
-                        const T in01 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
-                        const T in10 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
-                        const T in11 = *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
+                        const T in00 =
+                            *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi0_offset));
+                        const T in01 =
+                            *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi0_offset));
+                        const T in10 =
+                            *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi0_offset + yi1_offset));
+                        const T in11 =
+                            *(reinterpret_cast<const T *>(in_ptr + cout * sizeof(T) + xi1_offset + yi1_offset));
 
                         T out0 = in00 * s00_s;
                         out0 += in01 * s01_s;
@@ -394,15 +422,24 @@
 }
 
 template <typename T>
-void common_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                       InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                       bool align_corners, const Window &window)
+void common_neon_scale(const ITensor      *src,
+                       ITensor            *dst,
+                       const ITensor      *offsets,
+                       const ITensor      *dx,
+                       const ITensor      *dy,
+                       InterpolationPolicy policy,
+                       BorderMode          border_mode,
+                       PixelValue          constant_border_value,
+                       float               sampling_offset,
+                       bool                align_corners,
+                       const Window       &window)
 {
-    if(policy == InterpolationPolicy::BILINEAR)
+    if (policy == InterpolationPolicy::BILINEAR)
     {
-        bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+        bilinear_neon_scale<T>(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+                               align_corners, window);
     }
-    else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
         nearest_neon_scale<T>(src, dst, offsets, sampling_offset, align_corners, window);
     }
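
For reference, the bilinear kernels reformatted above compute per-pixel fractional weights (a1/b1 for the vertical axis) and blend a 2x2 input neighbourhood. Below is a minimal scalar sketch of that arithmetic, assuming a single-channel float image with REPLICATE border handling; it is not part of the patch and every name in it is illustrative:

#include <algorithm>
#include <cmath>

// Scalar bilinear sample with REPLICATE borders (the CONSTANT branch in the
// kernel instead substitutes const_border_value for out-of-range taps).
float bilinear_sample(const float *in, int w, int h, float xi_f, float yi_f)
{
    const int   xi = static_cast<int>(std::floor(xi_f));
    const int   yi = static_cast<int>(std::floor(yi_f));
    const float ax = xi_f - xi; // horizontal fraction
    const float ay = yi_f - yi; // vertical fraction (a1 in the kernel; b1 = 1 - a1)
    auto tap = [&](int x, int y)
    {
        x = std::min(std::max(x, 0), w - 1); // clamp models BorderMode::REPLICATE
        y = std::min(std::max(y, 0), h - 1);
        return in[y * w + x];
    };
    return tap(xi, yi) * (1 - ax) * (1 - ay) + tap(xi + 1, yi) * ax * (1 - ay) +
           tap(xi, yi + 1) * (1 - ax) * ay + tap(xi + 1, yi + 1) * ax * ay;
}

The NHWC kernels vectorise this along the channel dimension, which is why each inner loop first steps cout by a full vector width (step_cout) and then finishes the remaining channels with a scalar tail loop.
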
diff --git a/src/cpu/kernels/scale/neon/qasymm8.cpp b/src/cpu/kernels/scale/neon/qasymm8.cpp
index 778459a..62a821d 100644
--- a/src/cpu/kernels/scale/neon/qasymm8.cpp
+++ b/src/cpu/kernels/scale/neon/qasymm8.cpp
@@ -28,9 +28,16 @@
 {
 namespace
 {
-void qasymm8_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                                 BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                                 bool align_corners, const Window &window)
+void qasymm8_neon_scale_bilinear(const ITensor *src,
+                                 ITensor       *dst,
+                                 const ITensor *offsets,
+                                 const ITensor *dx,
+                                 const ITensor *dy,
+                                 BorderMode     border_mode,
+                                 PixelValue     constant_border_value,
+                                 float          sampling_offset,
+                                 bool           align_corners,
+                                 const Window  &window)
 {
     // Data layout is NHWC
     const int32_t input_width  = src->info()->dimension(1);
@@ -40,10 +47,12 @@
     const UniformQuantizationInfo oq_info = dst->info()->quantization_info().uniform();
 
     // Compute the ratio between source and destination dimensions
-    const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
-    const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+    const float scale_x =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+    const float scale_y =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
 
-    if(border_mode == BorderMode::CONSTANT)
+    if (border_mode == BorderMode::CONSTANT)
     {
         const int32_t in_stride_y = src->info()->strides_in_bytes()[1];
         const int32_t in_stride_z = src->info()->strides_in_bytes()[2];
@@ -59,7 +68,7 @@
         win_in.set(1, Window::Dimension(0, 0, 0));
         win_in.set(2, Window::Dimension(0, 0, 0));
 
-        for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+        for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
         {
             win_off.set(d, Window::Dimension(0, 0, 0));
         }
@@ -68,36 +77,41 @@
         Iterator out(dst, window);
 
         const uint8_t const_border_value = static_cast<uint8_t>(constant_border_value.get<uint8_t>());
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const int32_t index_h       = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
-            const int32_t index_w       = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
-            const auto    dx_val        = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
-            const auto    dy_val        = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
-            const auto    pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
+                const int32_t index_w =
+                    *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
+                const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
+                const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
+                const auto pixel_row_ptr = reinterpret_cast<const uint8_t *>(in.ptr());
 
-            const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) ?
-                             (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) :
-                             const_border_value;
-            const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) ?
-                             (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) :
-                             const_border_value;
-            const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) ?
-                             (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) :
-                             const_border_value;
-            const auto a11 = (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) ?
-                             (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) :
-                             const_border_value;
+                const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height)
+                                     ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z))
+                                     : const_border_value;
+                const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height)
+                                     ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z))
+                                     : const_border_value;
+                const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1)
+                                     ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z))
+                                     : const_border_value;
+                const auto a11 =
+                    (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1)
+                        ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z))
+                        : const_border_value;
 
-            const float inp00                       = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
-            const float inp01                       = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
-            const float inp10                       = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
-            const float inp11                       = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
-            *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
-        },
-        in, out);
+                const float inp00                       = Qasymm8QuantizationHelper<uint8_t>::dequantize(a00, iq_info);
+                const float inp01                       = Qasymm8QuantizationHelper<uint8_t>::dequantize(a01, iq_info);
+                const float inp10                       = Qasymm8QuantizationHelper<uint8_t>::dequantize(a10, iq_info);
+                const float inp11                       = Qasymm8QuantizationHelper<uint8_t>::dequantize(a11, iq_info);
+                *reinterpret_cast<uint8_t *>(out.ptr()) = Qasymm8QuantizationHelper<uint8_t>::quantize(
+                    scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+            },
+            in, out);
     }
-    else if(border_mode == BorderMode::REPLICATE)
+    else if (border_mode == BorderMode::REPLICATE)
     {
         using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
         using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
@@ -141,12 +155,12 @@
         const float fp_coord_offset_y = sampling_offset * (scale_y - 1);
         const float fp_coord_offset_x = sampling_offset * (scale_x - 1);
 
-        for(int bo = bo_start; bo < bo_end; bo += bo_step)
+        for (int bo = bo_start; bo < bo_end; bo += bo_step)
         {
             const uint8_t *in_ptr  = in.ptr() + bo * in_stride_b;
             uint8_t       *out_ptr = out.ptr() + bo * out_stride_b;
 
-            for(int yo = yo_start; yo < yo_end; yo += yo_step)
+            for (int yo = yo_start; yo < yo_end; yo += yo_step)
             {
                 // Floating-point coordinate
                 const float yi_f = yo * scale_y + fp_coord_offset_y;
@@ -163,7 +177,7 @@
                 const uint8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
 
                 uint8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
-                for(int xo = xo_start; xo < xo_end; xo += xo_step)
+                for (int xo = xo_start; xo < xo_end; xo += xo_step)
                 {
                     // Floating-point coordinate
                     const float xi_f = xo * scale_x + fp_coord_offset_x;
@@ -194,7 +208,7 @@
                     uint8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
 
                     int cout = 0;
-                    for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+                    for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
                     {
                         const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
                         const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
@@ -204,34 +218,82 @@
                         const uint16x8_t in00_low  = wrapper::vmovl(wrapper::vgetlow(in00));
                         const uint16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00));
 
-                        const auto in00_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)), vscale_in);
-                        const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)), vscale_in);
-                        const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)), vscale_in);
-                        const auto in00_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)), vscale_in);
+                        const auto in00_0 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_low))), voffset_in)),
+                            vscale_in);
+                        const auto in00_1 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_low))), voffset_in)),
+                            vscale_in);
+                        const auto in00_2 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in00_high))), voffset_in)),
+                            vscale_in);
+                        const auto in00_3 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in00_high))), voffset_in)),
+                            vscale_in);
 
                         const uint16x8_t in01_low  = wrapper::vmovl(wrapper::vgetlow(in01));
                         const uint16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01));
 
-                        const auto in01_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)), vscale_in);
-                        const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)), vscale_in);
-                        const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)), vscale_in);
-                        const auto in01_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)), vscale_in);
+                        const auto in01_0 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_low))), voffset_in)),
+                            vscale_in);
+                        const auto in01_1 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_low))), voffset_in)),
+                            vscale_in);
+                        const auto in01_2 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in01_high))), voffset_in)),
+                            vscale_in);
+                        const auto in01_3 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in01_high))), voffset_in)),
+                            vscale_in);
 
                         const uint16x8_t in10_low  = wrapper::vmovl(wrapper::vgetlow(in10));
                         const uint16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10));
 
-                        const auto in10_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), voffset_in)), vscale_in);
-                        const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)), vscale_in);
-                        const auto in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)), vscale_in);
-                        const auto in10_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)), vscale_in);
+                        const auto in10_0 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_low))), voffset_in)),
+                            vscale_in);
+                        const auto in10_1 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_low))), voffset_in)),
+                            vscale_in);
+                        const auto in10_2 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in10_high))), voffset_in)),
+                            vscale_in);
+                        const auto in10_3 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in10_high))), voffset_in)),
+                            vscale_in);
 
                         const uint16x8_t in11_low  = wrapper::vmovl(wrapper::vgetlow(in11));
                         const uint16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11));
 
-                        const auto in11_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)), vscale_in);
-                        const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)), vscale_in);
-                        const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)), vscale_in);
-                        const auto in11_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)), vscale_in);
+                        const auto in11_0 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_low))), voffset_in)),
+                            vscale_in);
+                        const auto in11_1 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_low))), voffset_in)),
+                            vscale_in);
+                        const auto in11_2 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgetlow(in11_high))), voffset_in)),
+                            vscale_in);
+                        const auto in11_3 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(
+                                wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(in11_high))), voffset_in)),
+                            vscale_in);
 
                         auto out_0 = wrapper::vmul(in00_0, s00);
                         out_0      = wrapper::vmla(out_0, in01_0, s01);
@@ -264,14 +326,16 @@
                         const auto out_2_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o));
                         const auto out_3_int = wrapper::vcvt<uint32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o));
 #endif // defined(__aarch64__) && !defined(BARE_METAL)
-                        const auto low_part  = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
-                        const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
-                        const auto out       = wrapper::vcombine(low_part, high_part);
+                        const auto low_part =
+                            wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+                        const auto high_part =
+                            wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+                        const auto out = wrapper::vcombine(low_part, high_part);
 
                         wrapper::vstore(out_ptr_xo_yo + cout * sizeof(uint8_t), out);
                     }
 
-                    for(; cout < out_dim_ch; ++cout)
+                    for (; cout < out_dim_ch; ++cout)
                     {
                         const uint8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(uint8_t));
                         const uint8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(uint8_t));
@@ -292,7 +356,8 @@
 #if defined(__aarch64__) && !defined(BARE_METAL)
                         *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info);
 #else  // defined(__aarch64__) && !defined(BARE_METAL)
-                        *(out_ptr_xo_yo + cout * sizeof(uint8_t)) = quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO);
+                        *(out_ptr_xo_yo + cout * sizeof(uint8_t)) =
+                            quantize_qasymm8(out, oq_info, RoundingPolicy::TO_ZERO);
 #endif // defined(__aarch64__) && !defined(BARE_METAL)
                     }
                 }
@@ -304,28 +369,38 @@
         ARM_COMPUTE_ERROR("Not implemented");
     }
 }
-}
+} // namespace
 namespace cpu
 {
-void qasymm8_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                        InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                        bool align_corners, const Window &window)
+void qasymm8_neon_scale(const ITensor      *src,
+                        ITensor            *dst,
+                        const ITensor      *offsets,
+                        const ITensor      *dx,
+                        const ITensor      *dy,
+                        InterpolationPolicy policy,
+                        BorderMode          border_mode,
+                        PixelValue          constant_border_value,
+                        float               sampling_offset,
+                        bool                align_corners,
+                        const Window       &window)
 {
-    if(policy == InterpolationPolicy::BILINEAR)
+    if (policy == InterpolationPolicy::BILINEAR)
     {
-        if(src->info()->quantization_info() == dst->info()->quantization_info())
+        if (src->info()->quantization_info() == dst->info()->quantization_info())
         {
-            u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+            u8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset,
+                          align_corners, window);
         }
         else
         {
-            qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+            qasymm8_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset,
+                                        align_corners, window);
         }
     }
-    else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
         nearest_neon_scale<uint8_t>(src, dst, offsets, sampling_offset, align_corners, window);
     }
 }
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
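
For reference, a scalar sketch of what the QASYMM8 bilinear path above does per output pixel when the input and output quantization infos differ: dequantize the four taps, blend, requantize. Round-to-nearest is assumed below for simplicity, whereas the kernel itself falls back to RoundingPolicy::TO_ZERO off aarch64; the struct and function names are illustrative, not taken from the patch:

#include <algorithm>
#include <cmath>
#include <cstdint>

struct QInfo
{
    float   scale;
    int32_t offset; // zero point
};

inline float dequant(uint8_t q, QInfo qi)
{
    return (static_cast<int32_t>(q) - qi.offset) * qi.scale;
}

inline uint8_t requant(float v, QInfo qi) // round-to-nearest, clamped to [0, 255]
{
    const int32_t q = static_cast<int32_t>(std::lround(v / qi.scale)) + qi.offset;
    return static_cast<uint8_t>(std::min<int32_t>(std::max<int32_t>(q, 0), 255));
}

uint8_t bilinear_q8(uint8_t a00, uint8_t a01, uint8_t a10, uint8_t a11,
                    float dx, float dy, QInfo in_q, QInfo out_q)
{
    const float v = dequant(a00, in_q) * (1 - dx) * (1 - dy) + dequant(a01, in_q) * dx * (1 - dy) +
                    dequant(a10, in_q) * (1 - dx) * dy + dequant(a11, in_q) * dx * dy;
    return requant(v, out_q);
}

This also shows why qasymm8_neon_scale dispatches to u8_neon_scale when the two quantization infos match: the dequantize/requantize pair cancels, so a plain uint8_t blend suffices.
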
diff --git a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp
index cd63dfb..5a88517 100644
--- a/src/cpu/kernels/scale/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/scale/neon/qasymm8_signed.cpp
@@ -28,9 +28,16 @@
 {
 namespace
 {
-void qasymm8_signed_neon_scale_bilinear(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                                        BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                                        bool align_corners, const Window &window)
+void qasymm8_signed_neon_scale_bilinear(const ITensor *src,
+                                        ITensor       *dst,
+                                        const ITensor *offsets,
+                                        const ITensor *dx,
+                                        const ITensor *dy,
+                                        BorderMode     border_mode,
+                                        PixelValue     constant_border_value,
+                                        float          sampling_offset,
+                                        bool           align_corners,
+                                        const Window  &window)
 {
     // Data layout is NHWC
     const UniformQuantizationInfo iq_info = src->info()->quantization_info().uniform();
@@ -40,10 +47,12 @@
     const int32_t input_height = src->info()->dimension(2);
 
     // Compute the ratio between source and destination dimensions
-    const float scale_x = scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
-    const float scale_y = scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
+    const float scale_x =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(1), dst->info()->dimension(1), align_corners);
+    const float scale_y =
+        scale_utils::calculate_resize_ratio(src->info()->dimension(2), dst->info()->dimension(2), align_corners);
 
-    if(border_mode == BorderMode::CONSTANT)
+    if (border_mode == BorderMode::CONSTANT)
     {
         const int32_t in_stride_y = src->info()->strides_in_bytes()[1];
         const int32_t in_stride_z = src->info()->strides_in_bytes()[2];
@@ -58,7 +67,7 @@
         win_in.set(1, Window::Dimension(0, 0, 0));
         win_in.set(2, Window::Dimension(0, 0, 0));
 
-        for(size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
+        for (size_t d = Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
         {
             win_off.set(d, Window::Dimension(0, 0, 0));
         }
@@ -67,36 +76,41 @@
         Iterator out(dst, window);
 
         const int8_t const_border_value = static_cast<int8_t>(constant_border_value.get<int8_t>());
-        execute_window_loop(window, [&](const Coordinates & id)
-        {
-            const int32_t index_h       = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
-            const int32_t index_w       = *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
-            const auto    dx_val        = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
-            const auto    dy_val        = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
-            const auto    pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
+        execute_window_loop(
+            window,
+            [&](const Coordinates &id)
+            {
+                const int32_t index_h = std::floor((id[2] + sampling_offset) * scale_y - sampling_offset);
+                const int32_t index_w =
+                    *(reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id[1], id[2]))));
+                const auto dx_val = *(reinterpret_cast<const float *>(dx->ptr_to_element(Coordinates(id[1], id[2]))));
+                const auto dy_val = *(reinterpret_cast<const float *>(dy->ptr_to_element(Coordinates(id[1], id[2]))));
+                const auto pixel_row_ptr = reinterpret_cast<const int8_t *>(in.ptr());
 
-            const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height) ?
-                             (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z)) :
-                             const_border_value;
-            const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height) ?
-                             (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z)) :
-                             const_border_value;
-            const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1) ?
-                             (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z)) :
-                             const_border_value;
-            const auto a11 = (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1) ?
-                             (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z)) :
-                             const_border_value;
+                const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height)
+                                     ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z))
+                                     : const_border_value;
+                const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height)
+                                     ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z))
+                                     : const_border_value;
+                const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1)
+                                     ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z))
+                                     : const_border_value;
+                const auto a11 =
+                    (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1)
+                        ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z))
+                        : const_border_value;
 
-            const float inp00                      = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
-            const float inp01                      = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
-            const float inp10                      = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
-            const float inp11                      = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
-            *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
-        },
-        in, out);
+                const float inp00                      = Qasymm8QuantizationHelper<int8_t>::dequantize(a00, iq_info);
+                const float inp01                      = Qasymm8QuantizationHelper<int8_t>::dequantize(a01, iq_info);
+                const float inp10                      = Qasymm8QuantizationHelper<int8_t>::dequantize(a10, iq_info);
+                const float inp11                      = Qasymm8QuantizationHelper<int8_t>::dequantize(a11, iq_info);
+                *reinterpret_cast<int8_t *>(out.ptr()) = Qasymm8QuantizationHelper<int8_t>::quantize(
+                    scale_helpers::delta_bilinear(inp00, inp01, inp10, inp11, dx_val, dy_val), oq_info);
+            },
+            in, out);
     }
-    else if(border_mode == BorderMode::REPLICATE)
+    else if (border_mode == BorderMode::REPLICATE)
     {
         using FloatTagType = typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
         using Int32TagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
@@ -140,12 +154,12 @@
         const float32x4_t invvscale_o = wrapper::vdup_n(1.f / oq_info.scale, FloatTagType{});
         const float32x4_t voffset_o   = vdupq_n_f32(oq_info.offset);
 
-        for(int bo = bo_start; bo < bo_end; bo += bo_step)
+        for (int bo = bo_start; bo < bo_end; bo += bo_step)
         {
             const int8_t *in_ptr  = reinterpret_cast<int8_t *>(in.ptr() + bo * in_stride_b);
             int8_t       *out_ptr = reinterpret_cast<int8_t *>(out.ptr() + bo * out_stride_b);
 
-            for(int yo = yo_start; yo < yo_end; yo += yo_step)
+            for (int yo = yo_start; yo < yo_end; yo += yo_step)
             {
                 // Floating-point coordinate
                 const float yi_f = yo * scale_y + fp_coord_offset_y;
@@ -162,7 +176,7 @@
                 const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
 
                 int8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
-                for(int xo = xo_start; xo < xo_end; xo += xo_step)
+                for (int xo = xo_start; xo < xo_end; xo += xo_step)
                 {
                     // Floating-point coordinate
                     const float xi_f = xo * scale_x + fp_coord_offset_x;
@@ -193,7 +207,7 @@
                     int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
 
                     int cout = 0;
-                    for(; cout <= (out_dim_ch - step_cout); cout += step_cout)
+                    for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
                     {
                         const auto in00 = wrapper::vloadq(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
                         const auto in01 = wrapper::vloadq(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
@@ -203,34 +217,70 @@
                         const int16x8_t in00_low  = wrapper::vmovl(wrapper::vgetlow(in00));
                         const int16x8_t in00_high = wrapper::vmovl(wrapper::vgethigh(in00));
 
-                        const auto in00_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)), vscale_in);
-                        const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)), vscale_in);
-                        const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)), vscale_in);
-                        const auto in00_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)), vscale_in);
+                        const auto in00_0 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in00_low)), voffset_in)),
+                            vscale_in);
+                        const auto in00_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+                                                              wrapper::vmovl(wrapper::vgethigh(in00_low)), voffset_in)),
+                                                          vscale_in);
+                        const auto in00_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+                                                              wrapper::vmovl(wrapper::vgetlow(in00_high)), voffset_in)),
+                                                          vscale_in);
+                        const auto in00_3 =
+                            wrapper::vmul(wrapper::vcvt<float>(
+                                              wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in00_high)), voffset_in)),
+                                          vscale_in);
 
                         const int16x8_t in01_low  = wrapper::vmovl(wrapper::vgetlow(in01));
                         const int16x8_t in01_high = wrapper::vmovl(wrapper::vgethigh(in01));
 
-                        const auto in01_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)), vscale_in);
-                        const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)), vscale_in);
-                        const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)), vscale_in);
-                        const auto in01_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)), vscale_in);
+                        const auto in01_0 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in01_low)), voffset_in)),
+                            vscale_in);
+                        const auto in01_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+                                                              wrapper::vmovl(wrapper::vgethigh(in01_low)), voffset_in)),
+                                                          vscale_in);
+                        const auto in01_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+                                                              wrapper::vmovl(wrapper::vgetlow(in01_high)), voffset_in)),
+                                                          vscale_in);
+                        const auto in01_3 =
+                            wrapper::vmul(wrapper::vcvt<float>(
+                                              wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in01_high)), voffset_in)),
+                                          vscale_in);
 
                         const int16x8_t in10_low  = wrapper::vmovl(wrapper::vgetlow(in10));
                         const int16x8_t in10_high = wrapper::vmovl(wrapper::vgethigh(in10));
 
-                        const auto in10_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)), vscale_in);
-                        const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)), vscale_in);
-                        const auto in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)), vscale_in);
-                        const auto in10_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)), vscale_in);
+                        const auto in10_0 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in10_low)), voffset_in)),
+                            vscale_in);
+                        const auto in10_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+                                                              wrapper::vmovl(wrapper::vgethigh(in10_low)), voffset_in)),
+                                                          vscale_in);
+                        const auto in10_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+                                                              wrapper::vmovl(wrapper::vgetlow(in10_high)), voffset_in)),
+                                                          vscale_in);
+                        const auto in10_3 =
+                            wrapper::vmul(wrapper::vcvt<float>(
+                                              wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in10_high)), voffset_in)),
+                                          vscale_in);
 
                         const int16x8_t in11_low  = wrapper::vmovl(wrapper::vgetlow(in11));
                         const int16x8_t in11_high = wrapper::vmovl(wrapper::vgethigh(in11));
 
-                        const auto in11_0 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)), vscale_in);
-                        const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)), vscale_in);
-                        const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)), vscale_in);
-                        const auto in11_3 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)), vscale_in);
+                        const auto in11_0 = wrapper::vmul(
+                            wrapper::vcvt<float>(wrapper::vsub(wrapper::vmovl(wrapper::vgetlow(in11_low)), voffset_in)),
+                            vscale_in);
+                        const auto in11_1 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+                                                              wrapper::vmovl(wrapper::vgethigh(in11_low)), voffset_in)),
+                                                          vscale_in);
+                        const auto in11_2 = wrapper::vmul(wrapper::vcvt<float>(wrapper::vsub(
+                                                              wrapper::vmovl(wrapper::vgetlow(in11_high)), voffset_in)),
+                                                          vscale_in);
+                        const auto in11_3 =
+                            wrapper::vmul(wrapper::vcvt<float>(
+                                              wrapper::vsub(wrapper::vmovl(wrapper::vgethigh(in11_high)), voffset_in)),
+                                          vscale_in);
 
                         auto out_0 = wrapper::vmul(in00_0, s00);
                         out_0      = wrapper::vmla(out_0, in01_0, s01);
@@ -263,14 +313,16 @@
                         const auto out_2_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_2, invvscale_o));
                         const auto out_3_int = wrapper::vcvt<int32_t>(wrapper::vmla(voffset_o, out_3, invvscale_o));
 #endif // defined(__aarch64__) && !defined(BARE_METAL)
-                        const auto low_part  = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
-                        const auto high_part = wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
-                        const auto out       = wrapper::vcombine(low_part, high_part);
+                        const auto low_part =
+                            wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_0_int), wrapper::vqmovn(out_1_int)));
+                        const auto high_part =
+                            wrapper::vqmovn(wrapper::vcombine(wrapper::vqmovn(out_2_int), wrapper::vqmovn(out_3_int)));
+                        const auto out = wrapper::vcombine(low_part, high_part);
 
                         wrapper::vstore(out_ptr_xo_yo + cout * sizeof(int8_t), out);
                     }
 
-                    for(; cout < out_dim_ch; ++cout)
+                    for (; cout < out_dim_ch; ++cout)
                     {
                         const int8_t in00 = *(in_ptr_xi0_yi0 + cout * sizeof(int8_t));
                         const int8_t in01 = *(in_ptr_xi1_yi0 + cout * sizeof(int8_t));
@@ -291,7 +343,8 @@
 #if defined(__aarch64__) && !defined(BARE_METAL)
                         *(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info);
 #else  // defined(__aarch64__) && !defined(BARE_METAL)
-                        *(out_ptr_xo_yo + cout * sizeof(int8_t)) = quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO);
+                        *(out_ptr_xo_yo + cout * sizeof(int8_t)) =
+                            quantize_qasymm8_signed(out, oq_info, RoundingPolicy::TO_ZERO);
 #endif // defined(__aarch64__) && !defined(BARE_METAL)
                     }
                 }
@@ -303,28 +356,39 @@
         ARM_COMPUTE_ERROR("Not implemented");
     }
 }
-}
+} // namespace
 namespace cpu
 {
-void qasymm8_signed_neon_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                               InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                               bool align_corners, const Window &window)
+void qasymm8_signed_neon_scale(const ITensor      *src,
+                               ITensor            *dst,
+                               const ITensor      *offsets,
+                               const ITensor      *dx,
+                               const ITensor      *dy,
+                               InterpolationPolicy policy,
+                               BorderMode          border_mode,
+                               PixelValue          constant_border_value,
+                               float               sampling_offset,
+                               bool                align_corners,
+                               const Window       &window)
 {
-    if(policy == InterpolationPolicy::BILINEAR)
+    if (policy == InterpolationPolicy::BILINEAR)
     {
-        if(src->info()->quantization_info() == dst->info()->quantization_info() && border_mode == BorderMode::REPLICATE)
+        if (src->info()->quantization_info() == dst->info()->quantization_info() &&
+            border_mode == BorderMode::REPLICATE)
         {
-            s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+            s8_neon_scale(src, dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset,
+                          align_corners, window);
         }
         else
         {
-            qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value, sampling_offset, align_corners, window);
+            qasymm8_signed_neon_scale_bilinear(src, dst, offsets, dx, dy, border_mode, constant_border_value,
+                                               sampling_offset, align_corners, window);
         }
     }
-    else if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    else if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
         nearest_neon_scale<int8_t>(src, dst, offsets, sampling_offset, align_corners, window);
     }
 }
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
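
For reference, the only substantive difference between the unsigned and signed widening ladders reformatted in the two files above is one reinterpret: u8 lanes widen through unsigned types and must be viewed as signed before the zero-point subtraction, while s8 lanes widen directly. A raw-NEON sketch of the first four lanes, assuming an Arm target with <arm_neon.h>; the function names are illustrative and the wrapper:: calls in the patch abstract exactly these intrinsics:

#include <arm_neon.h>

// First four lanes of a u8 vector: widen u8 -> u16 -> u32, view the result as
// s32, subtract the zero point, then convert to float and scale.
float32x4_t dequant_low4_u8(uint8x16_t v, int32x4_t voffset, float32x4_t vscale)
{
    const uint16x8_t w16 = vmovl_u8(vget_low_u8(v));
    const int32x4_t  w32 = vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(w16)));
    return vmulq_f32(vcvtq_f32_s32(vsubq_s32(w32, voffset)), vscale);
}

// Signed counterpart: s8 -> s16 -> s32, no reinterpret needed anywhere.
float32x4_t dequant_low4_s8(int8x16_t v, int32x4_t voffset, float32x4_t vscale)
{
    const int16x8_t w16 = vmovl_s8(vget_low_s8(v));
    const int32x4_t w32 = vmovl_s16(vget_low_s16(w16));
    return vmulq_f32(vcvtq_f32_s32(vsubq_s32(w32, voffset)), vscale);
}
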
diff --git a/src/cpu/kernels/scale/sve/fp16.cpp b/src/cpu/kernels/scale/sve/fp16.cpp
index ceda19f..cb28f4c 100644
--- a/src/cpu/kernels/scale/sve/fp16.cpp
+++ b/src/cpu/kernels/scale/sve/fp16.cpp
@@ -27,9 +27,10 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
 
@@ -41,8 +42,12 @@
 {
 namespace
 {
-void fp16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
-                            float sampling_offset, bool align_corners, const Window &window)
+void fp16_sve_scale_nearest(const ITensor *src,
+                            ITensor       *dst,
+                            const ITensor *offsets,
+                            float          sampling_offset,
+                            bool           align_corners,
+                            const Window  &window)
 {
     const size_t in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
     const size_t in_stride_w  = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -61,38 +66,50 @@
     const uint8_t     *in_ptr_start        = src->buffer() + src->info()->offset_first_element_in_bytes();
     const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        const int32_t offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
-        const auto    in_hi      = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
-        const int     offset_row = in_hi * in_stride_wc;
-        const auto    in_ptr     = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-        const auto    out_ptr    = reinterpret_cast<float16_t *>(out.ptr());
-
-        // Compute S elements per iteration
-        int      x  = window_start_x;
-        svbool_t pg = svwhilelt_b16(x, window_end_x);
-        do
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            // Store results
-            svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x));
+            const int32_t offset =
+                *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+            const auto in_hi = static_cast<int>(
+                align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+                              : std::floor((id.z() + sampling_offset) * hr));
+            const int  offset_row = in_hi * in_stride_wc;
+            const auto in_ptr     = reinterpret_cast<const float16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+            const auto out_ptr    = reinterpret_cast<float16_t *>(out.ptr());
 
-            x += svcntw();
-            pg = svwhilelt_b16(x, window_end_x);
-        }
-        while(svptest_any(svptrue_b16(), pg));
-    },
-    out);
+            // Compute S elements per iteration
+            int      x  = window_start_x;
+            svbool_t pg = svwhilelt_b16(x, window_end_x);
+            do
+            {
+                // Store results
+                svst1_f16(pg, out_ptr + x, svld1_f16(pg, in_ptr + offset + offset_row + x));
+
+                x += svcntw();
+                pg = svwhilelt_b16(x, window_end_x);
+            } while (svptest_any(svptrue_b16(), pg));
+        },
+        out);
 }
-}
+} // namespace
 namespace cpu
 {
-void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                    InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                    bool align_corners, const Window &window)
+void fp16_sve_scale(const ITensor      *src,
+                    ITensor            *dst,
+                    const ITensor      *offsets,
+                    const ITensor      *dx,
+                    const ITensor      *dy,
+                    InterpolationPolicy policy,
+                    BorderMode          border_mode,
+                    PixelValue          constant_border_value,
+                    float               sampling_offset,
+                    bool                align_corners,
+                    const Window       &window)
 {
     ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
-    if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
         fp16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
     }
@@ -103,4 +120,4 @@
 }
 } // namespace cpu
 } // namespace arm_compute
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/scale/sve/fp32.cpp b/src/cpu/kernels/scale/sve/fp32.cpp
index f3472f1..cbb345e 100644
--- a/src/cpu/kernels/scale/sve/fp32.cpp
+++ b/src/cpu/kernels/scale/sve/fp32.cpp
@@ -25,23 +25,27 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
 
+#include <arm_sve.h>
 #include <cmath>
 #include <cstddef>
 
-#include <arm_sve.h>
-
 namespace arm_compute
 {
 namespace
 {
-void fp32_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
-                            float sampling_offset, bool align_corners, const Window &window)
+void fp32_sve_scale_nearest(const ITensor *src,
+                            ITensor       *dst,
+                            const ITensor *offsets,
+                            float          sampling_offset,
+                            bool           align_corners,
+                            const Window  &window)
 {
     const size_t in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
     const size_t in_stride_w  = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -60,38 +64,50 @@
     const uint8_t     *in_ptr_start        = src->buffer() + src->info()->offset_first_element_in_bytes();
     const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        const int32_t offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
-        const auto    in_hi      = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
-        const int     offset_row = in_hi * in_stride_wc;
-        const auto    in_ptr     = reinterpret_cast<const float *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-        const auto    out_ptr    = reinterpret_cast<float *>(out.ptr());
-
-        // Compute S elements per iteration
-        int      x  = window_start_x;
-        svbool_t pg = svwhilelt_b32(x, window_end_x);
-        do
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            // Store results
-            svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x));
+            const int32_t offset =
+                *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+            const auto in_hi = static_cast<int>(
+                align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+                              : std::floor((id.z() + sampling_offset) * hr));
+            const int  offset_row = in_hi * in_stride_wc;
+            const auto in_ptr     = reinterpret_cast<const float *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+            const auto out_ptr    = reinterpret_cast<float *>(out.ptr());
 
-            x += svcntw();
-            pg = svwhilelt_b32(x, window_end_x);
-        }
-        while(svptest_any(svptrue_b32(), pg));
-    },
-    out);
+            // Compute S elements per iteration
+            int      x  = window_start_x;
+            svbool_t pg = svwhilelt_b32(x, window_end_x);
+            do
+            {
+                // Store results
+                svst1_f32(pg, out_ptr + x, svld1_f32(pg, in_ptr + offset + offset_row + x));
+
+                x += svcntw();
+                pg = svwhilelt_b32(x, window_end_x);
+            } while (svptest_any(svptrue_b32(), pg));
+        },
+        out);
 }
-}
+} // namespace
 namespace cpu
 {
-void fp32_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                    InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                    bool align_corners, const Window &window)
+void fp32_sve_scale(const ITensor      *src,
+                    ITensor            *dst,
+                    const ITensor      *offsets,
+                    const ITensor      *dx,
+                    const ITensor      *dy,
+                    InterpolationPolicy policy,
+                    BorderMode          border_mode,
+                    PixelValue          constant_border_value,
+                    float               sampling_offset,
+                    bool                align_corners,
+                    const Window       &window)
 {
     ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
-    if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
         fp32_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
     }
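
All of the *_sve_scale_nearest kernels in these files share the same predicated loop shape seen in the hunks above. A minimal standalone sketch of that pattern, copying one row of floats — a hypothetical function, not part of this commit, assuming an SVE-enabled toolchain:

#include <arm_sve.h>

void sve_copy_row_f32(const float *in, float *out, int len)
{
    int      x  = 0;
    svbool_t pg = svwhilelt_b32(x, len); // lanes [x, min(x + vl, len)) are active
    do
    {
        svst1_f32(pg, out + x, svld1_f32(pg, in + x)); // predicated load/store
        x += svcntw();                                 // advance by the 32-bit lane count
        pg = svwhilelt_b32(x, len);
    } while (svptest_any(svptrue_b32(), pg)); // continue while any lane is active
}

Because the predicate masks off lanes past `len`, the loop needs no scalar tail; the last iteration simply runs partially active.
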
diff --git a/src/cpu/kernels/scale/sve/integer.cpp b/src/cpu/kernels/scale/sve/integer.cpp
index 82c70ee..df950b17 100644
--- a/src/cpu/kernels/scale/sve/integer.cpp
+++ b/src/cpu/kernels/scale/sve/integer.cpp
@@ -25,9 +25,10 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
 
@@ -39,8 +40,12 @@
 {
 namespace
 {
-void u8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
-                          float sampling_offset, bool align_corners, const Window &window)
+void u8_sve_scale_nearest(const ITensor *src,
+                          ITensor       *dst,
+                          const ITensor *offsets,
+                          float          sampling_offset,
+                          bool           align_corners,
+                          const Window  &window)
 {
     const size_t in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
     const size_t in_stride_w  = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -59,32 +64,40 @@
     const uint8_t     *in_ptr_start        = src->buffer() + src->info()->offset_first_element_in_bytes();
     const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        const int32_t offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
-        const auto    in_hi      = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
-        const int     offset_row = in_hi * in_stride_wc;
-        const auto    in_ptr     = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-        const auto    out_ptr    = reinterpret_cast<uint8_t *>(out.ptr());
-
-        // Compute S elements per iteration
-        int      x  = window_start_x;
-        svbool_t pg = svwhilelt_b8(x, window_end_x);
-        do
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            // Store results
-            svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
+            const int32_t offset =
+                *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+            const auto in_hi = static_cast<int>(
+                align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+                              : std::floor((id.z() + sampling_offset) * hr));
+            const int  offset_row = in_hi * in_stride_wc;
+            const auto in_ptr     = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+            const auto out_ptr    = reinterpret_cast<uint8_t *>(out.ptr());
 
-            x += svcntw();
-            pg = svwhilelt_b8(x, window_end_x);
-        }
-        while(svptest_any(svptrue_b8(), pg));
-    },
-    out);
+            // Compute S elements per iteration
+            int      x  = window_start_x;
+            svbool_t pg = svwhilelt_b8(x, window_end_x);
+            do
+            {
+                // Store results
+                svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
+
+                x += svcntw();
+                pg = svwhilelt_b8(x, window_end_x);
+            } while (svptest_any(svptrue_b8(), pg));
+        },
+        out);
 }
 
-void s16_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
-                           float sampling_offset, bool align_corners, const Window &window)
+void s16_sve_scale_nearest(const ITensor *src,
+                           ITensor       *dst,
+                           const ITensor *offsets,
+                           float          sampling_offset,
+                           bool           align_corners,
+                           const Window  &window)
 {
     const size_t in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
     const size_t in_stride_w  = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -103,38 +116,50 @@
     const uint8_t     *in_ptr_start        = src->buffer() + src->info()->offset_first_element_in_bytes();
     const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        const int32_t offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
-        const auto    in_hi      = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
-        const int     offset_row = in_hi * in_stride_wc;
-        const auto    in_ptr     = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-        const auto    out_ptr    = reinterpret_cast<int16_t *>(out.ptr());
-
-        // Compute S elements per iteration
-        int      x  = window_start_x;
-        svbool_t pg = svwhilelt_b16(x, window_end_x);
-        do
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            // Store results
-            svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x));
+            const int32_t offset =
+                *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+            const auto in_hi = static_cast<int>(
+                align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+                              : std::floor((id.z() + sampling_offset) * hr));
+            const int  offset_row = in_hi * in_stride_wc;
+            const auto in_ptr     = reinterpret_cast<const int16_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+            const auto out_ptr    = reinterpret_cast<int16_t *>(out.ptr());
 
-            x += svcntw();
-            pg = svwhilelt_b16(x, window_end_x);
-        }
-        while(svptest_any(svptrue_b16(), pg));
-    },
-    out);
+            // Compute S elements per iteration
+            int      x  = window_start_x;
+            svbool_t pg = svwhilelt_b16(x, window_end_x);
+            do
+            {
+                // Store results
+                svst1_s16(pg, out_ptr + x, svld1_s16(pg, in_ptr + offset + offset_row + x));
+
+                x += svcntw();
+                pg = svwhilelt_b16(x, window_end_x);
+            } while (svptest_any(svptrue_b16(), pg));
+        },
+        out);
 }
-}
+} // namespace
 namespace cpu
 {
-void u8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                  InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                  bool align_corners, const Window &window)
+void u8_sve_scale(const ITensor      *src,
+                  ITensor            *dst,
+                  const ITensor      *offsets,
+                  const ITensor      *dx,
+                  const ITensor      *dy,
+                  InterpolationPolicy policy,
+                  BorderMode          border_mode,
+                  PixelValue          constant_border_value,
+                  float               sampling_offset,
+                  bool                align_corners,
+                  const Window       &window)
 {
     ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
-    if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
         u8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
     }
@@ -144,12 +169,20 @@
     }
 }
 
-void s16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                   InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                   bool align_corners, const Window &window)
+void s16_sve_scale(const ITensor      *src,
+                   ITensor            *dst,
+                   const ITensor      *offsets,
+                   const ITensor      *dx,
+                   const ITensor      *dy,
+                   InterpolationPolicy policy,
+                   BorderMode          border_mode,
+                   PixelValue          constant_border_value,
+                   float               sampling_offset,
+                   bool                align_corners,
+                   const Window       &window)
 {
     ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
-    if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
         s16_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
     }
diff --git a/src/cpu/kernels/scale/sve/list.h b/src/cpu/kernels/scale/sve/list.h
index b9c3a10..aff741a 100644
--- a/src/cpu/kernels/scale/sve/list.h
+++ b/src/cpu/kernels/scale/sve/list.h
@@ -28,10 +28,10 @@
 {
 namespace cpu
 {
-#define DECLARE_SCALE_KERNEL(func_name)                                                                                         \
-    void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,              \
-                   InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset, \
-                   bool align_corners, const Window &window)
+#define DECLARE_SCALE_KERNEL(func_name)                                                                            \
+    void func_name(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy, \
+                   InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value,           \
+                   float sampling_offset, bool align_corners, const Window &window)
 
 DECLARE_SCALE_KERNEL(fp16_sve_scale);
 DECLARE_SCALE_KERNEL(fp32_sve_scale);
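
For reference, expanding the macro above — e.g. DECLARE_SCALE_KERNEL(fp16_sve_scale) — yields this declaration:

void fp16_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
                    InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value,
                    float sampling_offset, bool align_corners, const Window &window);

The out-of-line definitions in the .cpp files above carry the same signature, just laid out one parameter per line with aligned types.
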
diff --git a/src/cpu/kernels/scale/sve/qasymm8.cpp b/src/cpu/kernels/scale/sve/qasymm8.cpp
index d45a69e..0fc794c 100644
--- a/src/cpu/kernels/scale/sve/qasymm8.cpp
+++ b/src/cpu/kernels/scale/sve/qasymm8.cpp
@@ -25,10 +25,10 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
 
@@ -40,8 +40,12 @@
 {
 namespace
 {
-void qasymm8_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
-                               float sampling_offset, bool align_corners, const Window &window)
+void qasymm8_sve_scale_nearest(const ITensor *src,
+                               ITensor       *dst,
+                               const ITensor *offsets,
+                               float          sampling_offset,
+                               bool           align_corners,
+                               const Window  &window)
 {
     const size_t in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
     const size_t in_stride_w  = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -60,38 +64,50 @@
     const uint8_t     *in_ptr_start        = src->buffer() + src->info()->offset_first_element_in_bytes();
     const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        const int32_t offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
-        const auto    in_hi      = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
-        const int     offset_row = in_hi * in_stride_wc;
-        const auto    in_ptr     = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-        const auto    out_ptr    = reinterpret_cast<uint8_t *>(out.ptr());
-
-        // Compute S elements per iteration
-        int      x  = window_start_x;
-        svbool_t pg = svwhilelt_b8(x, window_end_x);
-        do
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            // Store results
-            svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
+            const int32_t offset =
+                *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+            const auto in_hi = static_cast<int>(
+                align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+                              : std::floor((id.z() + sampling_offset) * hr));
+            const int  offset_row = in_hi * in_stride_wc;
+            const auto in_ptr     = reinterpret_cast<const uint8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+            const auto out_ptr    = reinterpret_cast<uint8_t *>(out.ptr());
 
-            x += svcntw();
-            pg = svwhilelt_b8(x, window_end_x);
-        }
-        while(svptest_any(svptrue_b8(), pg));
-    },
-    out);
+            // Compute S elements per iteration
+            int      x  = window_start_x;
+            svbool_t pg = svwhilelt_b8(x, window_end_x);
+            do
+            {
+                // Store results
+                svst1_u8(pg, out_ptr + x, svld1_u8(pg, in_ptr + offset + offset_row + x));
+
+                x += svcntw();
+                pg = svwhilelt_b8(x, window_end_x);
+            } while (svptest_any(svptrue_b8(), pg));
+        },
+        out);
 }
-}
+} // namespace
 namespace cpu
 {
-void qasymm8_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                       InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                       bool align_corners, const Window &window)
+void qasymm8_sve_scale(const ITensor      *src,
+                       ITensor            *dst,
+                       const ITensor      *offsets,
+                       const ITensor      *dx,
+                       const ITensor      *dy,
+                       InterpolationPolicy policy,
+                       BorderMode          border_mode,
+                       PixelValue          constant_border_value,
+                       float               sampling_offset,
+                       bool                align_corners,
+                       const Window       &window)
 {
     ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
-    if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
         qasymm8_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
     }
diff --git a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp
index 67bca65..68ea01e 100644
--- a/src/cpu/kernels/scale/sve/qasymm8_signed.cpp
+++ b/src/cpu/kernels/scale/sve/qasymm8_signed.cpp
@@ -25,10 +25,10 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
-#include "src/core/helpers/ScaleHelpers.h"
-#include "src/core/helpers/ScaleHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "support/Rounding.h"
 
@@ -40,8 +40,12 @@
 {
 namespace
 {
-void qasymm8_signed_sve_scale_nearest(const ITensor *src, ITensor *dst, const ITensor *offsets,
-                                      float sampling_offset, bool align_corners, const Window &window)
+void qasymm8_signed_sve_scale_nearest(const ITensor *src,
+                                      ITensor       *dst,
+                                      const ITensor *offsets,
+                                      float          sampling_offset,
+                                      bool           align_corners,
+                                      const Window  &window)
 {
     const size_t in_stride_c  = src->info()->dimension(0) + src->info()->padding().left + src->info()->padding().right;
     const size_t in_stride_w  = src->info()->dimension(1) + src->info()->padding().top + src->info()->padding().bottom;
@@ -60,38 +64,50 @@
     const uint8_t     *in_ptr_start        = src->buffer() + src->info()->offset_first_element_in_bytes();
     const unsigned int in_stride_bytes_hwc = src->info()->strides_in_bytes()[3];
 
-    execute_window_loop(win, [&](const Coordinates & id)
-    {
-        const int32_t offset     = *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
-        const auto    in_hi      = static_cast<int>(align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr) : std::floor((id.z() + sampling_offset) * hr));
-        const int     offset_row = in_hi * in_stride_wc;
-        const auto    in_ptr     = reinterpret_cast<const int8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
-        const auto    out_ptr    = reinterpret_cast<int8_t *>(out.ptr());
-
-        // Compute S elements per iteration
-        int      x  = window_start_x;
-        svbool_t pg = svwhilelt_b8(x, window_end_x);
-        do
+    execute_window_loop(
+        win,
+        [&](const Coordinates &id)
         {
-            // Store results
-            svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x));
+            const int32_t offset =
+                *reinterpret_cast<const int32_t *>(offsets->ptr_to_element(Coordinates(id.y(), id.z()))) * in_stride_c;
+            const auto in_hi = static_cast<int>(
+                align_corners ? utils::rounding::round_half_away_from_zero((id.z() + sampling_offset) * hr)
+                              : std::floor((id.z() + sampling_offset) * hr));
+            const int  offset_row = in_hi * in_stride_wc;
+            const auto in_ptr     = reinterpret_cast<const int8_t *>(in_ptr_start + in_stride_bytes_hwc * id[3]);
+            const auto out_ptr    = reinterpret_cast<int8_t *>(out.ptr());
 
-            x += svcntw();
-            pg = svwhilelt_b8(x, window_end_x);
-        }
-        while(svptest_any(svptrue_b8(), pg));
-    },
-    out);
+            // Compute S elements per iteration
+            int      x  = window_start_x;
+            svbool_t pg = svwhilelt_b8(x, window_end_x);
+            do
+            {
+                // Store results
+                svst1_s8(pg, out_ptr + x, svld1_s8(pg, in_ptr + offset + offset_row + x));
+
+                x += svcntw();
+                pg = svwhilelt_b8(x, window_end_x);
+            } while (svptest_any(svptrue_b8(), pg));
+        },
+        out);
 }
-}
+} // namespace
 namespace cpu
 {
-void qasymm8_signed_sve_scale(const ITensor *src, ITensor *dst, const ITensor *offsets, const ITensor *dx, const ITensor *dy,
-                              InterpolationPolicy policy, BorderMode border_mode, PixelValue constant_border_value, float sampling_offset,
-                              bool align_corners, const Window &window)
+void qasymm8_signed_sve_scale(const ITensor      *src,
+                              ITensor            *dst,
+                              const ITensor      *offsets,
+                              const ITensor      *dx,
+                              const ITensor      *dy,
+                              InterpolationPolicy policy,
+                              BorderMode          border_mode,
+                              PixelValue          constant_border_value,
+                              float               sampling_offset,
+                              bool                align_corners,
+                              const Window       &window)
 {
     ARM_COMPUTE_UNUSED(dx, dy, border_mode, constant_border_value);
-    if(policy == InterpolationPolicy::NEAREST_NEIGHBOR)
+    if (policy == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
         qasymm8_signed_sve_scale_nearest(src, dst, offsets, sampling_offset, align_corners, window);
     }
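
A scalar sketch of the source-row selection done by the *_scale_nearest kernels above — a hypothetical helper, not part of this commit, mirroring the in_hi computation in the hunks:

#include <cmath>

// Map the destination coordinate through the scale ratio `hr`, then round half
// away from zero (align_corners) or take the floor, as the kernels above do.
int nearest_src_index(int dst_idx, float hr, float sampling_offset, bool align_corners)
{
    const float in = (dst_idx + sampling_offset) * hr;
    return align_corners ? static_cast<int>(std::lround(in)) : static_cast<int>(std::floor(in));
}
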
diff --git a/src/cpu/kernels/select/generic/neon/fp16.cpp b/src/cpu/kernels/select/generic/neon/fp16.cpp
index b460213..38a5809 100644
--- a/src/cpu/kernels/select/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/select/generic/neon/fp16.cpp
@@ -23,20 +23,22 @@
  */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 
-#include "src/cpu/kernels/select/generic/neon/impl.h"
-
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/select/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void neon_f16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_f16_select_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_16<float16_t, uint16x8_t>(c, x, y, output, window);
 }
-void neon_f16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_f16_select_not_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_not_same_rank<float16_t>(c, x, y, output, window);
 }
@@ -45,4 +47,4 @@
 
 } // namespace arm_compute
 
-#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)  && defined(ENABLE_FP16_KERNELS) */
\ No newline at end of file
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)  && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/select/generic/neon/fp32.cpp b/src/cpu/kernels/select/generic/neon/fp32.cpp
index 63fd594..50a80cb 100644
--- a/src/cpu/kernels/select/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/select/generic/neon/fp32.cpp
@@ -22,20 +22,22 @@
  * SOFTWARE.
  */
 
-#include "src/cpu/kernels/select/generic/neon/impl.h"
-
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
+#include "src/cpu/kernels/select/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void neon_f32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_f32_select_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_32<float, uint32x4_t>(c, x, y, output, window);
 }
-void neon_f32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_f32_select_not_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_not_same_rank<float>(c, x, y, output, window);
 }
diff --git a/src/cpu/kernels/select/generic/neon/impl.h b/src/cpu/kernels/select/generic/neon/impl.h
index 6a6d996..7ce640b 100644
--- a/src/cpu/kernels/select/generic/neon/impl.h
+++ b/src/cpu/kernels/select/generic/neon/impl.h
@@ -25,6 +25,7 @@
 #define ACL_SRC_CPU_KERNELS_SELECT_GENERIC_NEON_IMPL_H
 
 #include "arm_compute/core/TensorInfo.h"
+
 #include "src/core/NEON/NEAsymm.h"
 #include "src/cpu/kernels/select/generic/neon/impl.h"
 
@@ -37,8 +38,16 @@
 namespace cpu
 {
 template <typename ScalarType, typename VectorType>
-void select_op(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window,
-               const int window_step_x, const int window_start_x, const int window_end_x, const int limit, VectorType (*condition_conversion)(const uint8_t *))
+void select_op(const ITensor *cond,
+               const ITensor *in1,
+               const ITensor *in2,
+               ITensor       *out,
+               const Window  &window,
+               const int      window_step_x,
+               const int      window_start_x,
+               const int      window_end_x,
+               const int      limit,
+               VectorType (*condition_conversion)(const uint8_t *))
 {
     Window win = window;
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -48,30 +57,32 @@
     Iterator input2(in2, win);
     Iterator output(out, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        auto       output_ptr    = reinterpret_cast<ScalarType *>(output.ptr());
-        const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
-        const auto input1_ptr    = reinterpret_cast<const ScalarType *>(input1.ptr());
-        const auto input2_ptr    = reinterpret_cast<const ScalarType *>(input2.ptr());
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
+        {
+            auto       output_ptr    = reinterpret_cast<ScalarType *>(output.ptr());
+            const auto condition_ptr = reinterpret_cast<const uint8_t *>(condition.ptr());
+            const auto input1_ptr    = reinterpret_cast<const ScalarType *>(input1.ptr());
+            const auto input2_ptr    = reinterpret_cast<const ScalarType *>(input2.ptr());
 
-        int x = window_start_x;
-        for(; x <= limit; x += window_step_x)
-        {
-            const auto c = (*condition_conversion)(condition_ptr + x);
-            const auto a = wrapper::vloadq(input1_ptr + x);
-            const auto b = wrapper::vloadq(input2_ptr + x);
-            wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
-        }
-        for(; x < window_end_x; ++x)
-        {
-            const auto c      = *(condition_ptr + x);
-            const auto a      = *(input1_ptr + x);
-            const auto b      = *(input2_ptr + x);
-            *(output_ptr + x) = static_cast<bool>(c) ? a : b;
-        }
-    },
-    condition, input1, input2, output);
+            int x = window_start_x;
+            for (; x <= limit; x += window_step_x)
+            {
+                const auto c = (*condition_conversion)(condition_ptr + x);
+                const auto a = wrapper::vloadq(input1_ptr + x);
+                const auto b = wrapper::vloadq(input2_ptr + x);
+                wrapper::vstore(output_ptr + x, wrapper::vbsl(c, a, b));
+            }
+            for (; x < window_end_x; ++x)
+            {
+                const auto c      = *(condition_ptr + x);
+                const auto a      = *(input1_ptr + x);
+                const auto b      = *(input2_ptr + x);
+                *(output_ptr + x) = static_cast<bool>(c) ? a : b;
+            }
+        },
+        condition, input1, input2, output);
 }
 
 template <typename ScalarType, typename VectorType>
@@ -81,11 +92,14 @@
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
 
-    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
-    {
-        static const auto zero = wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
-        return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
-    });
+    select_op<ScalarType, VectorType>(
+        cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x,
+        [](const uint8_t *condition_ptr) -> VectorType
+        {
+            static const auto zero =
+                wrapper::vdup_n(static_cast<uint8_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+            return wrapper::vcgt(wrapper::vloadq(condition_ptr), zero);
+        });
 }
 
 template <typename ScalarType, typename VectorType>
@@ -95,11 +109,14 @@
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
 
-    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
-    {
-        static const auto zero = wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
-        return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
-    });
+    select_op<ScalarType, VectorType>(
+        cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x,
+        [](const uint8_t *condition_ptr) -> VectorType
+        {
+            static const auto zero =
+                wrapper::vdup_n(static_cast<uint16_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+            return wrapper::vcgt(wrapper::vmovl(wrapper::vload(condition_ptr)), zero);
+        });
 }
 
 template <typename ScalarType, typename VectorType>
@@ -109,15 +126,19 @@
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
 
-    select_op<ScalarType, VectorType>(cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x, [](const uint8_t *condition_ptr) -> VectorType
-    {
-        static const auto zero = wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
-        return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
-    });
+    select_op<ScalarType, VectorType>(
+        cond, in1, in2, out, window, window_step_x, window_start_x, window_end_x, window_end_x - window_step_x,
+        [](const uint8_t *condition_ptr) -> VectorType
+        {
+            static const auto zero =
+                wrapper::vdup_n(static_cast<uint32_t>(0), arm_compute::wrapper::traits::vector_128_tag());
+            return wrapper::vcgt(wrapper::vmovl(wrapper::vgetlow(wrapper::vmovl(wrapper::vload(condition_ptr)))), zero);
+        });
 }
 
 template <typename ScalarType>
-void select_op_not_same_rank(const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+void select_op_not_same_rank(
+    const ITensor *cond, const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
 {
     ARM_COMPUTE_UNUSED(window);
 
@@ -131,20 +152,20 @@
     int       offset     = 0;
     const int step       = 16 / in1->info()->element_size();
 
-    for(int i = 0; i < outer_size; ++i)
+    for (int i = 0; i < outer_size; ++i)
     {
         int        x         = offset;
         const auto input_ptr = static_cast<bool>(*(condition_ptr + i)) ? input1_ptr : input2_ptr;
-        for(; x <= offset + inner_size - step; x += step)
+        for (; x <= offset + inner_size - step; x += step)
         {
             wrapper::vstore(output_ptr + x, wrapper::vloadq(input_ptr + x));
         }
-        if(x <= offset + inner_size - (step / 2))
+        if (x <= offset + inner_size - (step / 2))
         {
             wrapper::vstore(output_ptr + x, wrapper::vload(input_ptr + x));
             x += step / 2;
         }
-        for(; x < offset + inner_size; ++x)
+        for (; x < offset + inner_size; ++x)
         {
             *(output_ptr + x) = *(input_ptr + x);
         }
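
The select_op template above ultimately relies on NEON bit-select (wrapper::vbsl). A minimal sketch of that idiom on one vector — a hypothetical function, not from this commit:

#include <arm_neon.h>

// out[i] = mask[i] ? a[i] : b[i], where each mask lane is all-ones or all-zeros.
void neon_select4_f32(const uint32_t *mask, const float *a, const float *b, float *out)
{
    const uint32x4_t  c  = vld1q_u32(mask);
    const float32x4_t va = vld1q_f32(a);
    const float32x4_t vb = vld1q_f32(b);
    vst1q_f32(out, vbslq_f32(c, va, vb)); // bit-select: c ? va : vb, per bit
}
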
diff --git a/src/cpu/kernels/select/generic/neon/integer.cpp b/src/cpu/kernels/select/generic/neon/integer.cpp
index 71b2f0b..135087c 100644
--- a/src/cpu/kernels/select/generic/neon/integer.cpp
+++ b/src/cpu/kernels/select/generic/neon/integer.cpp
@@ -25,59 +25,71 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 
-#include <arm_neon.h>
-
 #include "src/cpu/kernels/select/generic/neon/impl.h"
 
+#include <arm_neon.h>
+
 namespace arm_compute
 {
 namespace cpu
 {
-void neon_s8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s8_select_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_8<int8_t, uint8x16_t>(c, x, y, output, window);
 }
-void neon_s16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s16_select_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_16<int16_t, uint16x8_t>(c, x, y, output, window);
 }
-void neon_s32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s32_select_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_32<int32_t, uint32x4_t>(c, x, y, output, window);
 }
-void neon_s8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s8_select_not_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_not_same_rank<int8_t>(c, x, y, output, window);
 }
-void neon_s16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s16_select_not_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_not_same_rank<int16_t>(c, x, y, output, window);
 }
-void neon_s32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_s32_select_not_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_not_same_rank<int32_t>(c, x, y, output, window);
 }
-void neon_u8_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u8_select_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_8<uint8_t, uint8x16_t>(c, x, y, output, window);
 }
-void neon_u16_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u16_select_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_16<uint16_t, uint16x8_t>(c, x, y, output, window);
 }
-void neon_u32_select_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u32_select_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_32<uint32_t, uint32x4_t>(c, x, y, output, window);
 }
-void neon_u8_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u8_select_not_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_not_same_rank<uint8_t>(c, x, y, output, window);
 }
-void neon_u16_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u16_select_not_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_not_same_rank<uint16_t>(c, x, y, output, window);
 }
-void neon_u32_select_not_same_rank(const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
+void neon_u32_select_not_same_rank(
+    const ITensor *c, const ITensor *x, const ITensor *y, ITensor *output, const Window &window)
 {
     return select_op_not_same_rank<uint32_t>(c, x, y, output, window);
 }
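
The condition_conversion lambdas in impl.h widen the uint8_t condition buffer to match the element width before the bit-select. A standalone sketch of the 16-bit case — a hypothetical function, not part of this commit:

#include <arm_neon.h>

// Load 8 condition bytes, widen to 16-bit lanes, then compare against zero to
// produce the all-ones/all-zeros mask that vbsl expects.
uint16x8_t widen_condition_to_u16(const uint8_t *condition_ptr)
{
    return vcgtq_u16(vmovl_u8(vld1_u8(condition_ptr)), vdupq_n_u16(0));
}

select_op_32 reaches 32-bit lanes the same way, chaining a second vmovl through vgetlow, as visible in the hunk above.
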
diff --git a/src/cpu/kernels/softmax/generic/neon/fp16.cpp b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
index f655669..2e2adf3 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp16.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp16.cpp
@@ -23,6 +23,7 @@
  */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/CpuTypes.h"
 #include "src/cpu/kernels/softmax/generic/neon/impl.h"
 
@@ -30,8 +31,13 @@
 {
 namespace cpu
 {
-void neon_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp,
-                       ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_fp16_softmax(const ITensor *in,
+                       const ITensor *max,
+                       void *const    tmp,
+                       ITensor       *out,
+                       const float    beta,
+                       bool           is_log,
+                       const Window  &window)
 {
     return neon_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
 }
@@ -40,6 +46,6 @@
 {
     return neon_logits_1d_max<float16_t>(in, out, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
 #endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
diff --git a/src/cpu/kernels/softmax/generic/neon/fp32.cpp b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
index ddd270a..61df40c 100644
--- a/src/cpu/kernels/softmax/generic/neon/fp32.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/fp32.cpp
@@ -22,14 +22,20 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/softmax/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void neon_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp,
-                       ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_fp32_softmax(const ITensor *in,
+                       const ITensor *max,
+                       void *const    tmp,
+                       ITensor       *out,
+                       const float    beta,
+                       bool           is_log,
+                       const Window  &window)
 {
     return neon_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
 }
@@ -38,5 +44,5 @@
 {
     return neon_logits_1d_max<float>(in, out, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
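
neon_fp16_softmax and neon_fp32_softmax above both forward to neon_softmax_logits_1d_float. As a scalar reference for the max-subtraction scheme it (and the quantized variant in the next file) vectorizes — a sketch, not taken from this commit:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> softmax_ref(const std::vector<float> &x, float beta)
{
    // Subtracting the row maximum keeps every exponent <= 0, avoiding overflow.
    const float max_val = *std::max_element(x.begin(), x.end());

    std::vector<float> y(x.size());
    float              sum = 0.f;
    for (std::size_t i = 0; i < x.size(); ++i)
    {
        y[i] = std::exp((x[i] - max_val) * beta);
        sum += y[i];
    }
    for (float &v : y)
        v /= sum; // normalize so the outputs sum to 1
    return y;
}
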
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.cpp b/src/cpu/kernels/softmax/generic/neon/impl.cpp
index f07fd2f..5d6e6a4 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/impl.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "src/cpu/kernels/softmax/generic/neon/impl.h"
+
 #include "support/SaturateCast.h"
 
 namespace arm_compute
@@ -32,11 +33,10 @@
 template void neon_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
 
 template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
-                                      ITensor *out, float beta, bool is_log, const Window &window)
+void neon_softmax_logits_1d_quantized(
+    const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
 {
-    static_assert(std::is_same<T, qasymm8_t>::value
-                  || std::is_same<T, qasymm8_signed_t>::value,
+    static_assert(std::is_same<T, qasymm8_t>::value || std::is_same<T, qasymm8_signed_t>::value,
                   "quantized type should be either qasymm8_t or qasymm8_signed_t.");
 
     const int start_x     = in->info()->valid_region().anchor.x();
@@ -50,163 +50,174 @@
     Iterator      out_it(out, window);
     constexpr int vec_size = 16;
 
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        /* Get pointers */
-        const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
-        const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
-        const auto tmp_ptr = reinterpret_cast<float *>(tmp);
-
-        float sum{};
-        float sum_inversed{};
-
-        /* Compute exponentials and sum */
+    execute_window_loop(
+        window,
+        [&](const Coordinates &)
         {
-            /* Get max value */
-            const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
-            const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
+            /* Get pointers */
+            const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
+            const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
+            const auto tmp_ptr = reinterpret_cast<float *>(tmp);
 
-            /* Init sum to zero */
-            float32x4x4_t vec_sum =
+            float sum{};
+            float sum_inversed{};
+
+            /* Compute exponentials and sum */
             {
-                vdupq_n_f32(0.f),
-                vdupq_n_f32(0.f),
-                vdupq_n_f32(0.f),
-                vdupq_n_f32(0.f),
-            };
+                /* Get max value */
+                const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
+                const auto vec_max = wrapper::vdup_n(max_val, wrapper::traits::vector_128_tag{});
 
-            /* Loop over row and compute exponentials and sum */
-            int x = 0;
-            for(; x <= (input_width - vec_size); x += vec_size)
-            {
-                auto vec_elements     = wrapper::vloadq(in_ptr + x);
-                vec_elements          = wrapper::vqsub(vec_max, vec_elements);
-                auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
+                /* Init sum to zero */
+                float32x4x4_t vec_sum = {
+                    vdupq_n_f32(0.f),
+                    vdupq_n_f32(0.f),
+                    vdupq_n_f32(0.f),
+                    vdupq_n_f32(0.f),
+                };
 
-                if(is_log)
+                /* Loop over row and compute exponentials and sum */
+                int x = 0;
+                for (; x <= (input_width - vec_size); x += vec_size)
                 {
-                    vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
-                    vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
-                    vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
-                    vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
-                    vec_sum.val[0]          = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
-                    vec_sum.val[1]          = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
-                    vec_sum.val[2]          = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
-                    vec_sum.val[3]          = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
-                }
-                else
-                {
-                    vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
-                    vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
-                    vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
-                    vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
-                    vec_sum.val[0]          = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
-                    vec_sum.val[1]          = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
-                    vec_sum.val[2]          = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
-                    vec_sum.val[3]          = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
-                }
+                    auto vec_elements     = wrapper::vloadq(in_ptr + x);
+                    vec_elements          = wrapper::vqsub(vec_max, vec_elements);
+                    auto vec_elements_flt = convert_int_to_float<float32x4x4_t>(vec_elements);
 
-                vst4q_f32(tmp_ptr + x, vec_elements_flt);
-            }
-
-            /* Reduce sum */
-            const auto sum_16_byte = vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
-            auto       sum_res     = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
-            sum_res                = vpadd_f32(sum_res, sum_res);
-            sum                    = wrapper::vgetlane(sum_res, 0);
-
-            /* Run remaining elements */
-            for(; x < input_width; ++x)
-            {
-                float element{};
-                if(is_log)
-                {
-                    element = (max_val - in_ptr[x]) * scale_beta;
-                    sum += std::exp(element);
-                }
-                else
-                {
-                    element = std::exp((max_val - in_ptr[x]) * scale_beta);
-                    sum += element;
-                }
-
-                tmp_ptr[x] = element;
-            }
-
-            if(!is_log)
-            {
-                sum_inversed = 256.f / sum;
-            }
-            else
-            {
-                sum = std::log(sum);
-            }
-        }
-
-        /* Normalize exponentials */
-        {
-            constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
-            /* Loop over row and compute softmax */
-            int x = 0;
-            for(; x <= (input_width - vec_size); x += vec_size)
-            {
-                using int_vec_type   = wrapper::traits::neon_vector_t<T, 16>;
-                float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
-                int_vec_type  normalized_value{};
-                if(is_log)
-                {
-                    const float32x4x4_t sub =
+                    if (is_log)
                     {
-                        vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
-                        vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
-                        vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
-                        vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
-                    };
-                    normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
-                }
-                else
-                {
-                    float32x4x4_t mul =
+                        vec_elements_flt.val[0] = vmulq_f32(vec_elements_flt.val[0], scale_beta_vec);
+                        vec_elements_flt.val[1] = vmulq_f32(vec_elements_flt.val[1], scale_beta_vec);
+                        vec_elements_flt.val[2] = vmulq_f32(vec_elements_flt.val[2], scale_beta_vec);
+                        vec_elements_flt.val[3] = vmulq_f32(vec_elements_flt.val[3], scale_beta_vec);
+                        vec_sum.val[0]          = vaddq_f32(vec_sum.val[0], vexpq_f32(vec_elements_flt.val[0]));
+                        vec_sum.val[1]          = vaddq_f32(vec_sum.val[1], vexpq_f32(vec_elements_flt.val[1]));
+                        vec_sum.val[2]          = vaddq_f32(vec_sum.val[2], vexpq_f32(vec_elements_flt.val[2]));
+                        vec_sum.val[3]          = vaddq_f32(vec_sum.val[3], vexpq_f32(vec_elements_flt.val[3]));
+                    }
+                    else
                     {
-                        vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
-                        vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
-                        vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
-                        vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
-                    };
-
-                    if(is_qasymm8_signed)
-                    {
-                        const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
-                        mul.val[0]            = wrapper::vsub(mul.val[0], offset_vec);
-                        mul.val[1]            = wrapper::vsub(mul.val[1], offset_vec);
-                        mul.val[2]            = wrapper::vsub(mul.val[2], offset_vec);
-                        mul.val[3]            = wrapper::vsub(mul.val[3], offset_vec);
+                        vec_elements_flt.val[0] = vexpq_f32(vmulq_f32(vec_elements_flt.val[0], scale_beta_vec));
+                        vec_elements_flt.val[1] = vexpq_f32(vmulq_f32(vec_elements_flt.val[1], scale_beta_vec));
+                        vec_elements_flt.val[2] = vexpq_f32(vmulq_f32(vec_elements_flt.val[2], scale_beta_vec));
+                        vec_elements_flt.val[3] = vexpq_f32(vmulq_f32(vec_elements_flt.val[3], scale_beta_vec));
+                        vec_sum.val[0]          = vaddq_f32(vec_sum.val[0], vec_elements_flt.val[0]);
+                        vec_sum.val[1]          = vaddq_f32(vec_sum.val[1], vec_elements_flt.val[1]);
+                        vec_sum.val[2]          = vaddq_f32(vec_sum.val[2], vec_elements_flt.val[2]);
+                        vec_sum.val[3]          = vaddq_f32(vec_sum.val[3], vec_elements_flt.val[3]);
                     }
 
-                    normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
+                    vst4q_f32(tmp_ptr + x, vec_elements_flt);
                 }
-                wrapper::vstore(out_ptr + x, normalized_value);
-            }
-            /* Run remaining elements */
-            for(; x < input_width; ++x)
-            {
-                if(is_log)
+
+                /* Reduce sum */
+                const auto sum_16_byte =
+                    vaddq_f32(vaddq_f32(vec_sum.val[0], vec_sum.val[1]), vaddq_f32(vec_sum.val[2], vec_sum.val[3]));
+                auto sum_res = vpadd_f32(vget_high_f32(sum_16_byte), vget_low_f32(sum_16_byte));
+                sum_res      = vpadd_f32(sum_res, sum_res);
+                sum          = wrapper::vgetlane(sum_res, 0);
+
+                /* Run remaining elements */
+                for (; x < input_width; ++x)
                 {
-                    out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
+                    float element{};
+                    if (is_log)
+                    {
+                        element = (max_val - in_ptr[x]) * scale_beta;
+                        sum += std::exp(element);
+                    }
+                    else
+                    {
+                        element = std::exp((max_val - in_ptr[x]) * scale_beta);
+                        sum += element;
+                    }
+
+                    tmp_ptr[x] = element;
+                }
+
+                if (!is_log)
+                {
+                    sum_inversed = 256.f / sum;
                 }
                 else
                 {
-                    out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) - (is_qasymm8_signed ? 128.f : 0));
+                    sum = std::log(sum);
                 }
             }
-        }
-    },
-    in_it, max_it, out_it);
+
+            /* Normalize exponentials */
+            {
+                constexpr bool is_qasymm8_signed = std::is_same<T, qasymm8_signed_t>::value;
+                /* Loop over row and compute softmax */
+                int x = 0;
+                for (; x <= (input_width - vec_size); x += vec_size)
+                {
+                    using int_vec_type   = wrapper::traits::neon_vector_t<T, 16>;
+                    float32x4x4_t vec_in = vld4q_f32(tmp_ptr + x);
+                    int_vec_type  normalized_value{};
+                    if (is_log)
+                    {
+                        const float32x4x4_t sub = {
+                            vsubq_f32(vec_in.val[0], vdupq_n_f32(sum)),
+                            vsubq_f32(vec_in.val[1], vdupq_n_f32(sum)),
+                            vsubq_f32(vec_in.val[2], vdupq_n_f32(sum)),
+                            vsubq_f32(vec_in.val[3], vdupq_n_f32(sum)),
+                        };
+                        normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(sub);
+                    }
+                    else
+                    {
+                        float32x4x4_t mul = {
+                            vmulq_f32(vec_in.val[0], vdupq_n_f32(sum_inversed)),
+                            vmulq_f32(vec_in.val[1], vdupq_n_f32(sum_inversed)),
+                            vmulq_f32(vec_in.val[2], vdupq_n_f32(sum_inversed)),
+                            vmulq_f32(vec_in.val[3], vdupq_n_f32(sum_inversed)),
+                        };
+
+                        if (is_qasymm8_signed)
+                        {
+                            const auto offset_vec = wrapper::vdup_n(128.f, wrapper::traits::vector_128_tag{});
+                            mul.val[0]            = wrapper::vsub(mul.val[0], offset_vec);
+                            mul.val[1]            = wrapper::vsub(mul.val[1], offset_vec);
+                            mul.val[2]            = wrapper::vsub(mul.val[2], offset_vec);
+                            mul.val[3]            = wrapper::vsub(mul.val[3], offset_vec);
+                        }
+
+                        normalized_value = convert_float_to_int<float32x4x4_t, int_vec_type>(mul);
+                    }
+                    wrapper::vstore(out_ptr + x, normalized_value);
+                }
+                /* Run remaining elements */
+                for (; x < input_width; ++x)
+                {
+                    if (is_log)
+                    {
+                        out_ptr[x] = utils::cast::saturate_cast<T>(tmp_ptr[x] - sum);
+                    }
+                    else
+                    {
+                        out_ptr[x] = utils::cast::saturate_cast<T>((tmp_ptr[x] * sum_inversed) -
+                                                                   (is_qasymm8_signed ? 128.f : 0));
+                    }
+                }
+            }
+        },
+        in_it, max_it, out_it);
 }
 
-template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp,
-                                                                 ITensor *out, float beta, bool is_log, const Window &window);
-template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp,
-                                                          ITensor *out, float beta, bool is_log, const Window &window);
+template void neon_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
+                                                                 const ITensor *max,
+                                                                 void *const    tmp,
+                                                                 ITensor       *out,
+                                                                 float          beta,
+                                                                 bool           is_log,
+                                                                 const Window  &window);
+template void neon_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
+                                                          const ITensor *max,
+                                                          void *const    tmp,
+                                                          ITensor       *out,
+                                                          float          beta,
+                                                          bool           is_log,
+                                                          const Window  &window);
 } // namespace cpu
 } // namespace arm_compute
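
Note for reviewers auditing that the reformat above is behaviour-preserving: the following is a minimal scalar sketch (hypothetical, not part of the library) of the per-row computation that neon_softmax_logits_1d_quantized vectorizes. It mirrors the kernel's own scalar tail loop, assumes max_val and scale_beta are precomputed exactly as in the kernel, and stands in for utils::cast::saturate_cast with a plain std::clamp.

    #include <algorithm>
    #include <cmath>
    #include <cstdint>
    #include <limits>
    #include <type_traits>

    template <typename T> // uint8_t (QASYMM8) or int8_t (QASYMM8_SIGNED)
    void softmax_row_quantized_ref(
        const T *in, T *out, float *tmp, int width, float max_val, float scale_beta, bool is_log)
    {
        float sum = 0.f;
        for (int x = 0; x < width; ++x)
        {
            // Same as the kernel's remainder loop: exponentials of (max - x) * scale_beta.
            const float element =
                is_log ? (max_val - in[x]) * scale_beta : std::exp((max_val - in[x]) * scale_beta);
            sum += is_log ? std::exp(element) : element;
            tmp[x] = element;
        }
        const float norm  = is_log ? std::log(sum) : 256.f / sum;  // 256 = uint8 value range
        const bool  shift = std::is_same<T, int8_t>::value;        // re-centre signed output
        for (int x = 0; x < width; ++x)
        {
            const float v = is_log ? tmp[x] - norm : tmp[x] * norm - (shift ? 128.f : 0.f);
            out[x]        = static_cast<T>(std::clamp(v, float(std::numeric_limits<T>::lowest()),
                                                         float(std::numeric_limits<T>::max())));
        }
    }
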
diff --git a/src/cpu/kernels/softmax/generic/neon/impl.h b/src/cpu/kernels/softmax/generic/neon/impl.h
index 206d36a..4d9b789 100644
--- a/src/cpu/kernels/softmax/generic/neon/impl.h
+++ b/src/cpu/kernels/softmax/generic/neon/impl.h
@@ -25,6 +25,7 @@
 #define SRC_CORE_NEON_KERNELS_SOFTMAX_IMPL_H
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/core/NEON/NEMath.h"
 #include "src/core/NEON/wrapper/wrapper.h"
 
@@ -42,53 +43,65 @@
     const auto    window_start_x = static_cast<int>(window.x().start());
     const auto    window_end_x   = static_cast<int>(window.x().end());
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
     Iterator input(in, win);
     Iterator output(out, win);
 
     const int sum_stages = log2(window_step_x / 2);
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        // Get pointers
-        const auto in_ptr  = reinterpret_cast<const T *>(input.ptr());
-        const auto out_ptr = reinterpret_cast<T *>(output.ptr());
-
-        // Init max value
-        auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
-        int  x       = window_start_x;
-
-        for(; x <= (window_end_x - window_step_x); x += window_step_x)
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            const auto current_value = wrapper::vloadq(in_ptr + x);
-            vec_max                  = wrapper::vmax(vec_max, current_value);
-        }
-        auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
+            // Get pointers
+            const auto in_ptr  = reinterpret_cast<const T *>(input.ptr());
+            const auto out_ptr = reinterpret_cast<T *>(output.ptr());
 
-        for(int i = 0; i < sum_stages; ++i)
-        {
-            carry_max = wrapper::vpmax(carry_max, carry_max);
-        }
-        T max_val = wrapper::vgetlane(carry_max, 0);
+            // Init max value
+            auto vec_max = wrapper::vdup_n(support::cpp11::lowest<T>(), ExactTagType{});
+            int  x       = window_start_x;
 
-        // Compute left-over elements
-        for(; x < window_end_x; ++x)
-        {
-            max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
-        }
+            for (; x <= (window_end_x - window_step_x); x += window_step_x)
+            {
+                const auto current_value = wrapper::vloadq(in_ptr + x);
+                vec_max                  = wrapper::vmax(vec_max, current_value);
+            }
+            auto carry_max = wrapper::vpmax(wrapper::vgethigh(vec_max), wrapper::vgetlow(vec_max));
 
-        *out_ptr = max_val;
-    },
-    input, output);
+            for (int i = 0; i < sum_stages; ++i)
+            {
+                carry_max = wrapper::vpmax(carry_max, carry_max);
+            }
+            T max_val = wrapper::vgetlane(carry_max, 0);
+
+            // Compute left-over elements
+            for (; x < window_end_x; ++x)
+            {
+                max_val = *(in_ptr + x) > max_val ? *(in_ptr + x) : max_val;
+            }
+
+            *out_ptr = max_val;
+        },
+        input, output);
 }
 
 template <typename T>
-void neon_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
-                                      ITensor *out, float beta, bool is_log, const Window &window);
+void neon_softmax_logits_1d_quantized(const ITensor *in,
+                                      const ITensor *max,
+                                      void *const    tmp,
+                                      ITensor       *out,
+                                      float          beta,
+                                      bool           is_log,
+                                      const Window  &window);
 
 template <typename T>
-void neon_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
-                                  ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_softmax_logits_1d_float(const ITensor *in,
+                                  const ITensor *max,
+                                  void *const    tmp,
+                                  ITensor       *out,
+                                  const float    beta,
+                                  bool           is_log,
+                                  const Window  &window)
 {
     const int start_x     = in->info()->valid_region().anchor.x();
     const int input_width = in->info()->valid_region().shape.x();
@@ -103,113 +116,118 @@
     constexpr int vec_size   = 16 / sizeof(T);
     const int     sum_stages = log2(vec_size / 2);
 
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        /* Get pointers */
-        const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
-        const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
-        const auto tmp_ptr = reinterpret_cast<T *>(tmp);
-
-        T sum{};
-        T sum_inversed{};
-
-        /* Compute exponentials and sum */
+    execute_window_loop(
+        window,
+        [&](const Coordinates &)
         {
-            /* Get max value */
-            const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
-            const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
+            /* Get pointers */
+            const auto in_ptr  = reinterpret_cast<const T *>(in_it.ptr()) + start_x;
+            const auto out_ptr = reinterpret_cast<T *>(out_it.ptr()) + start_x;
+            const auto tmp_ptr = reinterpret_cast<T *>(tmp);
 
-            /* Init sum to zero */
-            auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+            T sum{};
+            T sum_inversed{};
 
-            /* Loop over row and compute exponentials and sum */
-            int x = 0;
-            for(; x <= (input_width - vec_size); x += vec_size)
+            /* Compute exponentials and sum */
             {
-                auto vec_elements = wrapper::vloadq(in_ptr + x);
-                vec_elements      = wrapper::vsub(vec_elements, vec_max);
-                if(is_log)
+                /* Get max value */
+                const auto max_val = *reinterpret_cast<const T *>(max_it.ptr());
+                const auto vec_max = wrapper::vdup_n(max_val, ExactTagType{});
+
+                /* Init sum to zero */
+                auto vec_sum = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+
+                /* Loop over row and compute exponentials and sum */
+                int x = 0;
+                for (; x <= (input_width - vec_size); x += vec_size)
                 {
-                    vec_elements = wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
-                    vec_sum      = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
+                    auto vec_elements = wrapper::vloadq(in_ptr + x);
+                    vec_elements      = wrapper::vsub(vec_elements, vec_max);
+                    if (is_log)
+                    {
+                        vec_elements =
+                            wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}));
+                        vec_sum = wrapper::vadd(vec_sum, wrapper::vexpq(vec_elements));
+                    }
+                    else
+                    {
+                        vec_elements = wrapper::vexpq(
+                            wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
+                        vec_sum = wrapper::vadd(vec_sum, vec_elements);
+                    }
+                    wrapper::vstore(tmp_ptr + x, vec_elements);
+                }
+
+                /* Reduce sum */
+                auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
+                for (int i = 0; i < sum_stages; ++i)
+                {
+                    sum_res = wrapper::vpadd(sum_res, sum_res);
+                }
+                sum = wrapper::vgetlane(sum_res, 0);
+
+                /* Run remaining elements */
+                for (; x < input_width; ++x)
+                {
+                    T element{};
+
+                    if (is_log)
+                    {
+                        element = (in_ptr[x] - max_val) * beta;
+                        sum += std::exp(element);
+                    }
+                    else
+                    {
+                        element = std::exp((in_ptr[x] - max_val) * beta);
+                        sum += element;
+                    }
+                    tmp_ptr[x] = element;
+                }
+
+                if (!is_log)
+                {
+                    sum_inversed = T(1) / sum;
                 }
                 else
                 {
-                    vec_elements = wrapper::vexpq(wrapper::vmul(vec_elements, wrapper::vdup_n(static_cast<T>(beta), ExactTagType{})));
-                    vec_sum      = wrapper::vadd(vec_sum, vec_elements);
+                    sum = static_cast<T>(std::log(sum));
                 }
-                wrapper::vstore(tmp_ptr + x, vec_elements);
             }
 
-            /* Reduce sum */
-            auto sum_res = wrapper::vpadd(wrapper::vgethigh(vec_sum), wrapper::vgetlow(vec_sum));
-            for(int i = 0; i < sum_stages; ++i)
+            /* Normalize exponentials */
             {
-                sum_res = wrapper::vpadd(sum_res, sum_res);
-            }
-            sum = wrapper::vgetlane(sum_res, 0);
-
-            /* Run remaining elements */
-            for(; x < input_width; ++x)
-            {
-                T element{};
-
-                if(is_log)
+                /* Loop over row and compute softmax */
+                int x = 0;
+                for (; x <= (input_width - vec_size); x += vec_size)
                 {
-                    element = (in_ptr[x] - max_val) * beta;
-                    sum += std::exp(element);
+                    auto vec_in           = wrapper::vloadq(tmp_ptr + x);
+                    auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
+                    if (is_log)
+                    {
+                        normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
+                    }
+                    else
+                    {
+                        normalized_value =
+                            wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
+                    }
+                    wrapper::vstore(out_ptr + x, normalized_value);
                 }
-                else
+                /* Run remaining elements */
+                for (; x < input_width; ++x)
                 {
-                    element = std::exp((in_ptr[x] - max_val) * beta);
-                    sum += element;
-                }
-                tmp_ptr[x] = element;
-            }
-
-            if(!is_log)
-            {
-                sum_inversed = T(1) / sum;
-            }
-            else
-            {
-                sum = static_cast<T>(std::log(sum));
-            }
-        }
-
-        /* Normalize exponentials */
-        {
-            /* Loop over row and compute softmax */
-            int x = 0;
-            for(; x <= (input_width - vec_size); x += vec_size)
-            {
-                auto vec_in           = wrapper::vloadq(tmp_ptr + x);
-                auto normalized_value = wrapper::vdup_n(static_cast<T>(0), ExactTagType{});
-                if(is_log)
-                {
-                    normalized_value = wrapper::vsub(vec_in, wrapper::vdup_n(static_cast<T>(sum), ExactTagType{}));
-                }
-                else
-                {
-                    normalized_value = wrapper::vmul(vec_in, wrapper::vdup_n(static_cast<T>(sum_inversed), ExactTagType{}));
-                }
-                wrapper::vstore(out_ptr + x, normalized_value);
-            }
-            /* Run remaining elements */
-            for(; x < input_width; ++x)
-            {
-                if(is_log)
-                {
-                    out_ptr[x] = tmp_ptr[x] - sum;
-                }
-                else
-                {
-                    out_ptr[x] = tmp_ptr[x] * sum_inversed;
+                    if (is_log)
+                    {
+                        out_ptr[x] = tmp_ptr[x] - sum;
+                    }
+                    else
+                    {
+                        out_ptr[x] = tmp_ptr[x] * sum_inversed;
+                    }
                 }
             }
-        }
-    },
-    in_it, max_it, out_it);
+        },
+        in_it, max_it, out_it);
 }
 } // namespace cpu
 } // namespace arm_compute
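
The float path reformatted above follows the classic numerically stable softmax: the row maximum is subtracted before exponentiation so large logits cannot overflow. A minimal scalar sketch of that computation (a hypothetical reference only, assuming max_val is the precomputed row maximum as in the kernel):

    #include <cmath>

    template <typename T> // float or float16_t
    void softmax_row_float_ref(const T *in, T *out, T *tmp, int width, T max_val, T beta, bool is_log)
    {
        T sum{};
        for (int x = 0; x < width; ++x)
        {
            const T shifted = (in[x] - max_val) * beta; // <= 0 for beta > 0, so exp() stays finite
            tmp[x]          = is_log ? shifted : static_cast<T>(std::exp(shifted));
            sum += is_log ? static_cast<T>(std::exp(shifted)) : tmp[x];
        }
        const T norm = is_log ? static_cast<T>(std::log(sum)) : T(1) / sum;
        for (int x = 0; x < width; ++x)
        {
            out[x] = is_log ? tmp[x] - norm  // log-softmax: (x - max) * beta - log(sum)
                            : tmp[x] * norm; // softmax: exp((x - max) * beta) / sum
        }
    }
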
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
index a572891..40713dc 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8.cpp
@@ -22,14 +22,20 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/softmax/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void neon_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp,
-                          ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_qasymm8_softmax(const ITensor *in,
+                          const ITensor *max,
+                          void *const    tmp,
+                          ITensor       *out,
+                          const float    beta,
+                          bool           is_log,
+                          const Window  &window)
 {
     return neon_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
 }
@@ -38,5 +44,5 @@
 {
     return neon_logits_1d_max<qasymm8_t>(in, out, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
index 7d3fe6e..2c5e284 100644
--- a/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/neon/qasymm8_signed.cpp
@@ -22,14 +22,20 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/softmax/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void neon_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp,
-                                 ITensor *out, const float beta, bool is_log, const Window &window)
+void neon_qasymm8_signed_softmax(const ITensor *in,
+                                 const ITensor *max,
+                                 void *const    tmp,
+                                 ITensor       *out,
+                                 const float    beta,
+                                 bool           is_log,
+                                 const Window  &window)
 {
     return neon_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
 }
@@ -38,5 +44,5 @@
 {
     return neon_logits_1d_max<qasymm8_signed_t>(in, out, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/fp16.cpp b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
index 15a523b..5e94f72 100644
--- a/src/cpu/kernels/softmax/generic/sve/fp16.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/fp16.cpp
@@ -23,14 +23,20 @@
  */
 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/CpuTypes.h"
 #include "src/cpu/kernels/softmax/generic/sve/impl.h"
 namespace arm_compute
 {
 namespace cpu
 {
-void sve_fp16_softmax(const ITensor *in, const ITensor *max, void *const tmp,
-                      ITensor *out, const float beta, bool is_log, const Window &window)
+void sve_fp16_softmax(const ITensor *in,
+                      const ITensor *max,
+                      void *const    tmp,
+                      ITensor       *out,
+                      const float    beta,
+                      bool           is_log,
+                      const Window  &window)
 {
     return sve_softmax_logits_1d_float<float16_t>(in, max, tmp, out, beta, is_log, window);
 }
@@ -39,6 +45,6 @@
 {
     return sve_logits_1d_max<float16_t>(in, out, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
 #endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
diff --git a/src/cpu/kernels/softmax/generic/sve/fp32.cpp b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
index 55c4aee..d692cc2 100644
--- a/src/cpu/kernels/softmax/generic/sve/fp32.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/fp32.cpp
@@ -23,14 +23,20 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/softmax/generic/sve/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void sve_fp32_softmax(const ITensor *in, const ITensor *max, void *const tmp,
-                      ITensor *out, const float beta, bool is_log, const Window &window)
+void sve_fp32_softmax(const ITensor *in,
+                      const ITensor *max,
+                      void *const    tmp,
+                      ITensor       *out,
+                      const float    beta,
+                      bool           is_log,
+                      const Window  &window)
 {
     return sve_softmax_logits_1d_float<float>(in, max, tmp, out, beta, is_log, window);
 }
@@ -39,5 +45,5 @@
 {
     return sve_logits_1d_max<float>(in, out, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.cpp b/src/cpu/kernels/softmax/generic/sve/impl.cpp
index 2340a31..24f1bb8 100644
--- a/src/cpu/kernels/softmax/generic/sve/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/impl.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "src/cpu/kernels/softmax/generic/sve/impl.h"
+
 #include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 
 namespace arm_compute
@@ -36,42 +37,48 @@
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
 
-    Window win{ window };
+    Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
     Iterator input(in, win);
     Iterator output(out, win);
 
-    execute_window_loop(win, [&](const Coordinates &)
-    {
-        // Get pointers
-        const auto in_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
-        const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
-
-        // Init max value
-        auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>());
-
-        int      x  = window_start_x;
-        svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-        do
+    execute_window_loop(
+        win,
+        [&](const Coordinates &)
         {
-            const auto current_value = svld1(pg, in_ptr + x);
-            vec_max                  = svmax_m(pg, vec_max, current_value);
+            // Get pointers
+            const auto in_ptr  = reinterpret_cast<const ScalarType *>(input.ptr());
+            const auto out_ptr = reinterpret_cast<ScalarType *>(output.ptr());
 
-            x += wrapper::svcnt<ScalarType>();
-            pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
-        }
-        while(svptest_any(all_true_pg, pg));
+            // Init max value
+            auto vec_max = wrapper::svdup_n(support::cpp11::lowest<ScalarType>());
 
-        auto max_val = svmaxv(all_true_pg, vec_max);
+            int      x  = window_start_x;
+            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+            do
+            {
+                const auto current_value = svld1(pg, in_ptr + x);
+                vec_max                  = svmax_m(pg, vec_max, current_value);
 
-        *out_ptr = max_val;
-    },
-    input, output);
+                x += wrapper::svcnt<ScalarType>();
+                pg = wrapper::svwhilelt<ScalarType>(x, window_end_x);
+            } while (svptest_any(all_true_pg, pg));
+
+            auto max_val = svmaxv(all_true_pg, vec_max);
+
+            *out_ptr = max_val;
+        },
+        input, output);
 }
 
 template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
-                                 ITensor *out, const float beta, bool is_log, const Window &window)
+void sve_softmax_logits_1d_float(const ITensor *in,
+                                 const ITensor *max,
+                                 void *const    tmp,
+                                 ITensor       *out,
+                                 const float    beta,
+                                 bool           is_log,
+                                 const Window  &window)
 {
     const int start_x     = in->info()->valid_region().anchor.x();
     const int input_width = in->info()->valid_region().shape.x();
@@ -82,88 +89,88 @@
 
     const auto all_true_pg = wrapper::svptrue<ScalarType>();
 
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        /* Get pointers */
-        const auto in_ptr  = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
-        const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
-        const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp);
-
-        ScalarType sum{ 0 };
-
-        /* Compute exponentials and sum */
+    execute_window_loop(
+        window,
+        [&](const Coordinates &)
         {
-            /* Get max value */
-            const auto max_val  = *reinterpret_cast<const ScalarType *>(max_it.ptr());
-            const auto vec_max  = wrapper::svdup_n(max_val);
-            const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta));
+            /* Get pointers */
+            const auto in_ptr  = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
+            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
+            const auto tmp_ptr = reinterpret_cast<ScalarType *>(tmp);
 
-            /* Init sum to zero */
-            auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0));
+            ScalarType sum{0};
 
-            /* Loop over row and compute exponentials and sum */
-            int      x  = 0;
-            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
-            do
+            /* Compute exponentials and sum */
             {
-                auto vec_elements = svld1(pg, in_ptr + x);
-                vec_elements      = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta);
-                if(!is_log)
+                /* Get max value */
+                const auto max_val  = *reinterpret_cast<const ScalarType *>(max_it.ptr());
+                const auto vec_max  = wrapper::svdup_n(max_val);
+                const auto vec_beta = wrapper::svdup_n(static_cast<ScalarType>(beta));
+
+                /* Init sum to zero */
+                auto vec_sum = wrapper::svdup_n(static_cast<ScalarType>(0));
+
+                /* Loop over row and compute exponentials and sum */
+                int      x  = 0;
+                svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+                do
                 {
-                    vec_elements = wrapper::svexp_z(pg, vec_elements);
-                    vec_sum      = svadd_m(pg, vec_sum, vec_elements);
-                }
-                svst1(pg, tmp_ptr + x, vec_elements);
+                    auto vec_elements = svld1(pg, in_ptr + x);
+                    vec_elements      = svmul_z(pg, svsub_z(pg, vec_elements, vec_max), vec_beta);
+                    if (!is_log)
+                    {
+                        vec_elements = wrapper::svexp_z(pg, vec_elements);
+                        vec_sum      = svadd_m(pg, vec_sum, vec_elements);
+                    }
+                    svst1(pg, tmp_ptr + x, vec_elements);
 
-                if(is_log)
+                    if (is_log)
+                    {
+                        vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements));
+                    }
+
+                    x += wrapper::svcnt<ScalarType>();
+                    pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+                } while (svptest_any(all_true_pg, pg));
+
+                /* Reduce sum */
+                sum = svaddv(all_true_pg, vec_sum);
+
+                if (is_log)
                 {
-                    vec_sum = svadd_m(pg, vec_sum, wrapper::svexp_z(pg, vec_elements));
-                }
-
-                x += wrapper::svcnt<ScalarType>();
-                pg = wrapper::svwhilelt<ScalarType>(x, input_width);
-            }
-            while(svptest_any(all_true_pg, pg));
-
-            /* Reduce sum */
-            sum = svaddv(all_true_pg, vec_sum);
-
-            if(is_log)
-            {
-                sum = static_cast<ScalarType>(std::log(sum));
-            }
-            else
-            {
-                sum = ScalarType(1) / sum;
-            }
-        }
-
-        /* Normalize exponentials */
-        {
-            /* Loop over row and compute softmax */
-            int      x  = 0;
-            svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
-            do
-            {
-                auto vec_in           = svld1(pg, tmp_ptr + x);
-                auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0));
-                if(is_log)
-                {
-                    normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+                    sum = static_cast<ScalarType>(std::log(sum));
                 }
                 else
                 {
-                    normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+                    sum = ScalarType(1) / sum;
                 }
-                svst1(pg, out_ptr + x, normalized_value);
-
-                x += wrapper::svcnt<ScalarType>();
-                pg = wrapper::svwhilelt<ScalarType>(x, input_width);
             }
-            while(svptest_any(all_true_pg, pg));
-        }
-    },
-    in_it, max_it, out_it);
+
+            /* Normalize exponentials */
+            {
+                /* Loop over row and compute softmax */
+                int      x  = 0;
+                svbool_t pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+                do
+                {
+                    auto vec_in           = svld1(pg, tmp_ptr + x);
+                    auto normalized_value = wrapper::svdup_n(static_cast<ScalarType>(0));
+                    if (is_log)
+                    {
+                        normalized_value = svsub_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+                    }
+                    else
+                    {
+                        normalized_value = svmul_z(pg, vec_in, wrapper::svdup_n(static_cast<ScalarType>(sum)));
+                    }
+                    svst1(pg, out_ptr + x, normalized_value);
+
+                    x += wrapper::svcnt<ScalarType>();
+                    pg = wrapper::svwhilelt<ScalarType>(x, input_width);
+                } while (svptest_any(all_true_pg, pg));
+            }
+        },
+        in_it, max_it, out_it);
 }
 
 template void sve_logits_1d_max<float>(const ITensor *in, ITensor *out, const Window &window);
@@ -171,9 +178,19 @@
 template void sve_logits_1d_max<qasymm8_t>(const ITensor *in, ITensor *out, const Window &window);
 template void sve_logits_1d_max<qasymm8_signed_t>(const ITensor *in, ITensor *out, const Window &window);
 
-template void sve_softmax_logits_1d_float<float>(const ITensor *in, const ITensor *max, void *const tmp,
-                                                 ITensor *out, const float beta, bool is_log, const Window &window);
-template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in, const ITensor *max, void *const tmp,
-                                                     ITensor *out, const float beta, bool is_log, const Window &window);
+template void sve_softmax_logits_1d_float<float>(const ITensor *in,
+                                                 const ITensor *max,
+                                                 void *const    tmp,
+                                                 ITensor       *out,
+                                                 const float    beta,
+                                                 bool           is_log,
+                                                 const Window  &window);
+template void sve_softmax_logits_1d_float<float16_t>(const ITensor *in,
+                                                     const ITensor *max,
+                                                     void *const    tmp,
+                                                     ITensor       *out,
+                                                     const float    beta,
+                                                     bool           is_log,
+                                                     const Window  &window);
 } // namespace cpu
 } // namespace arm_compute
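
The SVE kernels above all share one loop shape, which the reformat turns into an explicit do/while: svwhilelt builds a predicate covering only the lanes that remain, so the final partial vector goes through the same body and no scalar remainder loop is needed. A minimal sketch of the idiom (SVE-enabled toolchain assumed; scale_f32 is a made-up example, not a library function):

    #include <arm_sve.h>

    void scale_f32(const float *in, float *out, int n, float beta)
    {
        int      x  = 0;
        svbool_t pg = svwhilelt_b32(x, n); // active for lanes x .. min(x + VL, n) - 1
        do
        {
            const svfloat32_t v = svld1_f32(pg, in + x);        // inactive lanes load as zero
            svst1_f32(pg, out + x, svmul_n_f32_z(pg, v, beta)); // inactive lanes are not stored
            x += static_cast<int>(svcntw());
            pg = svwhilelt_b32(x, n);
        } while (svptest_any(svptrue_b32(), pg)); // stop once no lane is active
    }
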
diff --git a/src/cpu/kernels/softmax/generic/sve/impl.h b/src/cpu/kernels/softmax/generic/sve/impl.h
index 4f76ec6..89a30d0 100644
--- a/src/cpu/kernels/softmax/generic/sve/impl.h
+++ b/src/cpu/kernels/softmax/generic/sve/impl.h
@@ -33,8 +33,13 @@
 void sve_logits_1d_max(const ITensor *in, ITensor *out, const Window &window);
 
 template <typename ScalarType>
-void sve_softmax_logits_1d_float(const ITensor *in, const ITensor *max, void *const tmp,
-                                 ITensor *out, const float beta, bool is_log, const Window &window);
+void sve_softmax_logits_1d_float(const ITensor *in,
+                                 const ITensor *max,
+                                 void *const    tmp,
+                                 ITensor       *out,
+                                 const float    beta,
+                                 bool           is_log,
+                                 const Window  &window);
 } // namespace cpu
 } // namespace arm_compute
 
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
index e9044d5..85e5ccf 100644
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/qasymm8.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/softmax/generic/sve/impl.h"
 
 namespace arm_compute
@@ -33,5 +34,5 @@
 {
     return sve_logits_1d_max<qasymm8_t>(in, out, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
index ab45ce5..4be2e2e 100644
--- a/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/sve/qasymm8_signed.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/softmax/generic/sve/impl.h"
 
 namespace arm_compute
@@ -33,5 +34,5 @@
 {
     return sve_logits_1d_max<qasymm8_signed_t>(in, out, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.cpp b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
index 8f677c6..98b2f51 100644
--- a/src/cpu/kernels/softmax/generic/sve2/impl.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.cpp
@@ -23,7 +23,9 @@
  */
 
 #include "src/cpu/kernels/softmax/generic/sve2/impl.h"
+
 #include "arm_compute/core/Types.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
@@ -31,8 +33,8 @@
 namespace cpu
 {
 template <typename ScalarType>
-void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
-                                      ITensor *out, float beta, bool is_log, const Window &window)
+void sve2_softmax_logits_1d_quantized(
+    const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, float beta, bool is_log, const Window &window)
 {
     const int start_x     = in->info()->valid_region().anchor.x();
     const int input_width = in->info()->valid_region().shape.x();
@@ -50,162 +52,173 @@
     const int inc_2 = static_cast<int>(2 * svcntw());
     const int inc_3 = static_cast<int>(3 * svcntw());
 
-    execute_window_loop(window, [&](const Coordinates &)
-    {
-        /* Get pointers */
-        const auto in_ptr  = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
-        const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
-        const auto tmp_ptr = reinterpret_cast<float *>(tmp);
-
-        float sum{};
-
-        /* Compute exponentials and sum */
+    execute_window_loop(
+        window,
+        [&](const Coordinates &)
         {
-            /* Get max value */
-            const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
-            const auto vec_max = wrapper::svdup_n(max_val);
+            /* Get pointers */
+            const auto in_ptr  = reinterpret_cast<const ScalarType *>(in_it.ptr()) + start_x;
+            const auto out_ptr = reinterpret_cast<ScalarType *>(out_it.ptr()) + start_x;
+            const auto tmp_ptr = reinterpret_cast<float *>(tmp);
 
-            /* Init sum to zero */
-            auto vec_sum_0 = svdup_n_f32(0.f);
-            auto vec_sum_1 = svdup_n_f32(0.f);
-            auto vec_sum_2 = svdup_n_f32(0.f);
-            auto vec_sum_3 = svdup_n_f32(0.f);
+            float sum{};
 
-            /* Loop over row and compute exponentials and sum */
-            int      x    = 0;
-            svbool_t pg   = wrapper::svwhilelt<ScalarType>(x, input_width);
-            svbool_t pg_0 = svunpklo(svunpklo(pg));
-            svbool_t pg_1 = svunpkhi(svunpklo(pg));
-            svbool_t pg_2 = svunpklo(svunpkhi(pg));
-            svbool_t pg_3 = svunpkhi(svunpkhi(pg));
-            do
+            /* Compute exponentials and sum */
             {
-                const auto vec_elements     = svld1(pg, in_ptr + x);
-                const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements));
+                /* Get max value */
+                const auto max_val = *reinterpret_cast<const ScalarType *>(max_it.ptr());
+                const auto vec_max = wrapper::svdup_n(max_val);
 
-                auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub)));
-                auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub)));
-                auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub)));
-                auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub)));
+                /* Init sum to zero */
+                auto vec_sum_0 = svdup_n_f32(0.f);
+                auto vec_sum_1 = svdup_n_f32(0.f);
+                auto vec_sum_2 = svdup_n_f32(0.f);
+                auto vec_sum_3 = svdup_n_f32(0.f);
 
-                if(is_log)
+                /* Loop over row and compute exponentials and sum */
+                int      x    = 0;
+                svbool_t pg   = wrapper::svwhilelt<ScalarType>(x, input_width);
+                svbool_t pg_0 = svunpklo(svunpklo(pg));
+                svbool_t pg_1 = svunpkhi(svunpklo(pg));
+                svbool_t pg_2 = svunpklo(svunpkhi(pg));
+                svbool_t pg_3 = svunpkhi(svunpkhi(pg));
+                do
                 {
-                    vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec);
-                    vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec);
-                    vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec);
-                    vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec);
-                    vec_sum_0          = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0));
-                    vec_sum_1          = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1));
-                    vec_sum_2          = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2));
-                    vec_sum_3          = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3));
-                }
-                else
-                {
-                    vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec));
-                    vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec));
-                    vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec));
-                    vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec));
-                    vec_sum_0          = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0);
-                    vec_sum_1          = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1);
-                    vec_sum_2          = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2);
-                    vec_sum_3          = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3);
-                }
+                    const auto vec_elements     = svld1(pg, in_ptr + x);
+                    const auto vec_elements_sub = svreinterpret_u8(svsub_z(pg, vec_max, vec_elements));
 
-                svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0);
-                svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1);
-                svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2);
-                svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3);
+                    auto vec_elements_flt_0 = svcvt_f32_z(pg_0, svunpklo(svunpklo(vec_elements_sub)));
+                    auto vec_elements_flt_1 = svcvt_f32_z(pg_1, svunpkhi(svunpklo(vec_elements_sub)));
+                    auto vec_elements_flt_2 = svcvt_f32_z(pg_2, svunpklo(svunpkhi(vec_elements_sub)));
+                    auto vec_elements_flt_3 = svcvt_f32_z(pg_3, svunpkhi(svunpkhi(vec_elements_sub)));
 
-                x += wrapper::svcnt<ScalarType>();
-                pg   = wrapper::svwhilelt<ScalarType>(x, input_width);
-                pg_0 = svunpklo(svunpklo(pg));
-                pg_1 = svunpkhi(svunpklo(pg));
-                pg_2 = svunpklo(svunpkhi(pg));
-                pg_3 = svunpkhi(svunpkhi(pg));
-            }
-            while(svptest_any(all_true_pg, pg));
-
-            /* Reduce sum */
-            const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1), svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3));
-            sum                = svaddv_f32(all_true_pg, vec_sum);
-
-            /* Run remaining elements */
-            x = 0;
-            if(is_log)
-            {
-                sum = std::log(sum);
-            }
-            else
-            {
-                sum = 256.f / sum;
-            }
-        }
-
-        /* Normalize exponentials */
-        {
-            constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value;
-            /* Loop over row and compute softmax */
-            int      x    = 0;
-            svbool_t pg   = wrapper::svwhilelt<ScalarType>(x, input_width);
-            svbool_t pg_0 = svunpklo(svunpklo(pg));
-            svbool_t pg_1 = svunpkhi(svunpklo(pg));
-            svbool_t pg_2 = svunpklo(svunpkhi(pg));
-            svbool_t pg_3 = svunpkhi(svunpkhi(pg));
-            do
-            {
-                auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x);
-                auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1);
-                auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2);
-                auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3);
-
-                svfloat32_t res_0{};
-                svfloat32_t res_1{};
-                svfloat32_t res_2{};
-                svfloat32_t res_3{};
-
-                if(is_log)
-                {
-                    res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
-                    res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
-                    res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
-                    res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
-                }
-                else
-                {
-                    res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
-                    res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
-                    res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
-                    res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
-
-                    if(is_qasymm8_signed)
+                    if (is_log)
                     {
-                        const auto offset_vec = svdup_n_f32(128.f);
-                        res_0                 = svsub_z(pg_0, res_0, offset_vec);
-                        res_1                 = svsub_z(pg_1, res_1, offset_vec);
-                        res_2                 = svsub_z(pg_2, res_2, offset_vec);
-                        res_3                 = svsub_z(pg_3, res_3, offset_vec);
+                        vec_elements_flt_0 = svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec);
+                        vec_elements_flt_1 = svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec);
+                        vec_elements_flt_2 = svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec);
+                        vec_elements_flt_3 = svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec);
+                        vec_sum_0          = svadd_f32_m(pg_0, vec_sum_0, svexp_f32_z(pg_0, vec_elements_flt_0));
+                        vec_sum_1          = svadd_f32_m(pg_1, vec_sum_1, svexp_f32_z(pg_1, vec_elements_flt_1));
+                        vec_sum_2          = svadd_f32_m(pg_2, vec_sum_2, svexp_f32_z(pg_2, vec_elements_flt_2));
+                        vec_sum_3          = svadd_f32_m(pg_3, vec_sum_3, svexp_f32_z(pg_3, vec_elements_flt_3));
                     }
-                }
+                    else
+                    {
+                        vec_elements_flt_0 = svexp_f32_z(pg_0, svmul_f32_z(pg_0, vec_elements_flt_0, scale_beta_vec));
+                        vec_elements_flt_1 = svexp_f32_z(pg_1, svmul_f32_z(pg_1, vec_elements_flt_1, scale_beta_vec));
+                        vec_elements_flt_2 = svexp_f32_z(pg_2, svmul_f32_z(pg_2, vec_elements_flt_2, scale_beta_vec));
+                        vec_elements_flt_3 = svexp_f32_z(pg_3, svmul_f32_z(pg_3, vec_elements_flt_3, scale_beta_vec));
+                        vec_sum_0          = svadd_f32_m(pg_0, vec_sum_0, vec_elements_flt_0);
+                        vec_sum_1          = svadd_f32_m(pg_1, vec_sum_1, vec_elements_flt_1);
+                        vec_sum_2          = svadd_f32_m(pg_2, vec_sum_2, vec_elements_flt_2);
+                        vec_sum_3          = svadd_f32_m(pg_3, vec_sum_3, vec_elements_flt_3);
+                    }
 
-                // Store value
-                const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3);
-                svst1(pg, out_ptr + x, out);
-                x += wrapper::svcnt<ScalarType>();
-                pg   = wrapper::svwhilelt<ScalarType>(x, input_width);
-                pg_0 = svunpklo(svunpklo(pg));
-                pg_1 = svunpkhi(svunpklo(pg));
-                pg_2 = svunpklo(svunpkhi(pg));
-                pg_3 = svunpkhi(svunpkhi(pg));
+                    svst1_f32(pg_0, tmp_ptr + x, vec_elements_flt_0);
+                    svst1_f32(pg_1, tmp_ptr + x + inc_1, vec_elements_flt_1);
+                    svst1_f32(pg_2, tmp_ptr + x + inc_2, vec_elements_flt_2);
+                    svst1_f32(pg_3, tmp_ptr + x + inc_3, vec_elements_flt_3);
+
+                    x += wrapper::svcnt<ScalarType>();
+                    pg   = wrapper::svwhilelt<ScalarType>(x, input_width);
+                    pg_0 = svunpklo(svunpklo(pg));
+                    pg_1 = svunpkhi(svunpklo(pg));
+                    pg_2 = svunpklo(svunpkhi(pg));
+                    pg_3 = svunpkhi(svunpkhi(pg));
+                } while (svptest_any(all_true_pg, pg));
+
+                /* Reduce sum */
+                const auto vec_sum = svadd_f32_z(all_true_pg, svadd_f32_z(all_true_pg, vec_sum_0, vec_sum_1),
+                                                 svadd_f32_z(all_true_pg, vec_sum_2, vec_sum_3));
+                sum                = svaddv_f32(all_true_pg, vec_sum);
+
+                /* Run remaining elements */
+                x = 0;
+                if (is_log)
+                {
+                    sum = std::log(sum);
+                }
+                else
+                {
+                    sum = 256.f / sum;
+                }
             }
-            while(svptest_any(all_true_pg, pg));
-        }
-    },
-    in_it, max_it, out_it);
+
+            /* Normalize exponentials */
+            {
+                constexpr bool is_qasymm8_signed = std::is_same<ScalarType, qasymm8_signed_t>::value;
+                /* Loop over row and compute softmax */
+                int      x    = 0;
+                svbool_t pg   = wrapper::svwhilelt<ScalarType>(x, input_width);
+                svbool_t pg_0 = svunpklo(svunpklo(pg));
+                svbool_t pg_1 = svunpkhi(svunpklo(pg));
+                svbool_t pg_2 = svunpklo(svunpkhi(pg));
+                svbool_t pg_3 = svunpkhi(svunpkhi(pg));
+                do
+                {
+                    auto vec_in_0 = svld1_f32(pg_0, tmp_ptr + x);
+                    auto vec_in_1 = svld1_f32(pg_1, tmp_ptr + x + inc_1);
+                    auto vec_in_2 = svld1_f32(pg_2, tmp_ptr + x + inc_2);
+                    auto vec_in_3 = svld1_f32(pg_3, tmp_ptr + x + inc_3);
+
+                    svfloat32_t res_0{};
+                    svfloat32_t res_1{};
+                    svfloat32_t res_2{};
+                    svfloat32_t res_3{};
+
+                    if (is_log)
+                    {
+                        res_0 = svsub_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
+                        res_1 = svsub_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
+                        res_2 = svsub_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
+                        res_3 = svsub_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
+                    }
+                    else
+                    {
+                        res_0 = svmul_f32_z(pg_0, vec_in_0, svdup_n_f32(sum));
+                        res_1 = svmul_f32_z(pg_1, vec_in_1, svdup_n_f32(sum));
+                        res_2 = svmul_f32_z(pg_2, vec_in_2, svdup_n_f32(sum));
+                        res_3 = svmul_f32_z(pg_3, vec_in_3, svdup_n_f32(sum));
+
+                        if (is_qasymm8_signed)
+                        {
+                            const auto offset_vec = svdup_n_f32(128.f);
+                            res_0                 = svsub_z(pg_0, res_0, offset_vec);
+                            res_1                 = svsub_z(pg_1, res_1, offset_vec);
+                            res_2                 = svsub_z(pg_2, res_2, offset_vec);
+                            res_3                 = svsub_z(pg_3, res_3, offset_vec);
+                        }
+                    }
+
+                    // Store value
+                    const auto out = convert_float_to_int<SVEType>(res_0, res_1, res_2, res_3);
+                    svst1(pg, out_ptr + x, out);
+                    x += wrapper::svcnt<ScalarType>();
+                    pg   = wrapper::svwhilelt<ScalarType>(x, input_width);
+                    pg_0 = svunpklo(svunpklo(pg));
+                    pg_1 = svunpkhi(svunpklo(pg));
+                    pg_2 = svunpklo(svunpkhi(pg));
+                    pg_3 = svunpkhi(svunpkhi(pg));
+                } while (svptest_any(all_true_pg, pg));
+            }
+        },
+        in_it, max_it, out_it);
 }
 
-template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in, const ITensor *max, void *const tmp,
-                                                                 ITensor *out, float beta, bool is_log, const Window &window);
-template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in, const ITensor *max, void *const tmp,
-                                                          ITensor *out, float beta, bool is_log, const Window &window);
+template void sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(const ITensor *in,
+                                                                 const ITensor *max,
+                                                                 void *const    tmp,
+                                                                 ITensor       *out,
+                                                                 float          beta,
+                                                                 bool           is_log,
+                                                                 const Window  &window);
+template void sve2_softmax_logits_1d_quantized<qasymm8_t>(const ITensor *in,
+                                                          const ITensor *max,
+                                                          void *const    tmp,
+                                                          ITensor       *out,
+                                                          float          beta,
+                                                          bool           is_log,
+                                                          const Window  &window);
 } // namespace cpu
 } // namespace arm_compute
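
The kernel above leans on an SVE idiom worth spelling out: one svwhilelt byte predicate is unpacked twice into four word-width predicates, so four f32 vectors track exactly the lanes of one u8 vector and no scalar tail loop is needed. A standalone sketch of that pattern follows; it assumes an SVE-capable toolchain, and scale_u8_to_f32 is an illustrative helper, not a library function.

#if defined(__ARM_FEATURE_SVE)
#include <arm_sve.h>
#include <cstdint>

void scale_u8_to_f32(const uint8_t *in, float *out, int64_t len, float scale)
{
    int64_t  x  = 0;
    svbool_t pg = svwhilelt_b8_s64(x, len);
    do
    {
        // Four quarter-width predicates covering the same lanes as pg
        const svbool_t pg_0 = svunpklo(svunpklo(pg));
        const svbool_t pg_1 = svunpkhi(svunpklo(pg));
        const svbool_t pg_2 = svunpklo(svunpkhi(pg));
        const svbool_t pg_3 = svunpkhi(svunpkhi(pg));

        const int64_t q = static_cast<int64_t>(svcntw()); // f32 lanes per vector

        // Widening byte loads, one quarter of the byte vector each
        const svuint32_t v0 = svld1ub_u32(pg_0, in + x);
        const svuint32_t v1 = svld1ub_u32(pg_1, in + x + q);
        const svuint32_t v2 = svld1ub_u32(pg_2, in + x + 2 * q);
        const svuint32_t v3 = svld1ub_u32(pg_3, in + x + 3 * q);

        svst1_f32(pg_0, out + x, svmul_n_f32_z(pg_0, svcvt_f32_u32_z(pg_0, v0), scale));
        svst1_f32(pg_1, out + x + q, svmul_n_f32_z(pg_1, svcvt_f32_u32_z(pg_1, v1), scale));
        svst1_f32(pg_2, out + x + 2 * q, svmul_n_f32_z(pg_2, svcvt_f32_u32_z(pg_2, v2), scale));
        svst1_f32(pg_3, out + x + 3 * q, svmul_n_f32_z(pg_3, svcvt_f32_u32_z(pg_3, v3), scale));

        x += static_cast<int64_t>(svcntb());
        pg = svwhilelt_b8_s64(x, len);
    } while (svptest_any(svptrue_b8(), pg));
}
#endif
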
diff --git a/src/cpu/kernels/softmax/generic/sve2/impl.h b/src/cpu/kernels/softmax/generic/sve2/impl.h
index abbcc15..33fcc26 100644
--- a/src/cpu/kernels/softmax/generic/sve2/impl.h
+++ b/src/cpu/kernels/softmax/generic/sve2/impl.h
@@ -31,8 +31,13 @@
 namespace cpu
 {
 template <typename ScalarType>
-void sve2_softmax_logits_1d_quantized(const ITensor *in, const ITensor *max, void *const tmp,
-                                      ITensor *out, float beta, bool is_log, const Window &window);
+void sve2_softmax_logits_1d_quantized(const ITensor *in,
+                                      const ITensor *max,
+                                      void *const    tmp,
+                                      ITensor       *out,
+                                      float          beta,
+                                      bool           is_log,
+                                      const Window  &window);
 } // namespace cpu
 } // namespace arm_compute
 #endif /* SRC_CORE_SVE2_KERNELS_SOFTMAX_IMPL_H */
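
This header keeps only the template declaration; the definition lives in impl.cpp together with explicit instantiations for qasymm8_t and qasymm8_signed_t, so callers link against exactly two prebuilt symbols. A minimal sketch of that split, with illustrative file names and a placeholder body:

#include <cstdint>

// kernel.h: declaration only
template <typename T>
void run_kernel(const T *in, T *out, int len);

// kernel.cpp: one definition plus the instantiations that ship
template <typename T>
void run_kernel(const T *in, T *out, int len)
{
    for (int i = 0; i < len; ++i)
    {
        out[i] = in[i]; // placeholder; the real kernel does the math
    }
}

template void run_kernel<uint8_t>(const uint8_t *, uint8_t *, int);
template void run_kernel<int8_t>(const int8_t *, int8_t *, int);
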
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
index 810035e..9562378 100644
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8.cpp
@@ -23,16 +23,22 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/softmax/generic/sve2/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void sve2_qasymm8_softmax(const ITensor *in, const ITensor *max, void *const tmp,
-                          ITensor *out, const float beta, bool is_log, const Window &window)
+void sve2_qasymm8_softmax(const ITensor *in,
+                          const ITensor *max,
+                          void *const    tmp,
+                          ITensor       *out,
+                          const float    beta,
+                          bool           is_log,
+                          const Window  &window)
 {
     return sve2_softmax_logits_1d_quantized<qasymm8_t>(in, max, tmp, out, beta, is_log, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
index 283b55e..c20462f 100644
--- a/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
+++ b/src/cpu/kernels/softmax/generic/sve2/qasymm8_signed.cpp
@@ -23,16 +23,22 @@
  */
 
 #include "arm_compute/core/Helpers.h"
+
 #include "src/cpu/kernels/softmax/generic/sve2/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void sve2_qasymm8_signed_softmax(const ITensor *in, const ITensor *max, void *const tmp,
-                                 ITensor *out, const float beta, bool is_log, const Window &window)
+void sve2_qasymm8_signed_softmax(const ITensor *in,
+                                 const ITensor *max,
+                                 void *const    tmp,
+                                 ITensor       *out,
+                                 const float    beta,
+                                 bool           is_log,
+                                 const Window  &window)
 {
     return sve2_softmax_logits_1d_quantized<qasymm8_signed_t>(in, max, tmp, out, beta, is_log, window);
 }
-}
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/kernels/softmax/list.h b/src/cpu/kernels/softmax/list.h
index ed3515f..627ce0c 100644
--- a/src/cpu/kernels/softmax/list.h
+++ b/src/cpu/kernels/softmax/list.h
@@ -28,9 +28,9 @@
 {
 namespace cpu
 {
-#define DECLARE_SOFTMAX_KERNEL(func_name)                                  \
-    void func_name(const ITensor *in, const ITensor *max, void *const tmp, \
-                   ITensor *out, const float beta, bool is_log, const Window &window)
+#define DECLARE_SOFTMAX_KERNEL(func_name)                                                                  \
+    void func_name(const ITensor *in, const ITensor *max, void *const tmp, ITensor *out, const float beta, \
+                   bool is_log, const Window &window)
 
 DECLARE_SOFTMAX_KERNEL(neon_fp32_softmax);
 DECLARE_SOFTMAX_KERNEL(neon_fp16_softmax);
@@ -43,8 +43,7 @@
 
 #undef DECLARE_SOFTMAX_KERNEL
 
-#define DECLARE_LOGITS_KERNEL(func_name) \
-    void func_name(const ITensor *in, ITensor *out, const Window &window)
+#define DECLARE_LOGITS_KERNEL(func_name) void func_name(const ITensor *in, ITensor *out, const Window &window)
 
 DECLARE_LOGITS_KERNEL(neon_fp32_logits);
 DECLARE_LOGITS_KERNEL(neon_fp16_logits);
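
The DECLARE_*_KERNEL macros stamp out an identical prototype for every backend entry point and are then undefined, which keeps the signatures from drifting apart. A short sketch of the same device, with hypothetical names:

#include <cstddef>

#define DECLARE_SCALE_KERNEL(func_name) \
    void func_name(const float *src, float *dst, size_t len, float beta)

DECLARE_SCALE_KERNEL(neon_fp32_scale); // one prototype per backend
DECLARE_SCALE_KERNEL(sve_fp32_scale);

#undef DECLARE_SCALE_KERNEL
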
diff --git a/src/cpu/kernels/sub/neon/list.h b/src/cpu/kernels/sub/neon/list.h
index f7e1a04..9f6c922 100644
--- a/src/cpu/kernels/sub/neon/list.h
+++ b/src/cpu/kernels/sub/neon/list.h
@@ -26,14 +26,16 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
+
 #include "src/core/NEON/wrapper/wrapper.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-#define DECLARE_SUB_KERNEL(func_name) \
-    void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+#define DECLARE_SUB_KERNEL(func_name)                                                                   \
+    void func_name(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, \
+                   const Window &window)
 
 DECLARE_SUB_KERNEL(sub_qasymm8_neon_fixedpoint);
 DECLARE_SUB_KERNEL(sub_qasymm8_signed_neon_fixedpoint);
@@ -44,7 +46,8 @@
 #undef DECLARE_SUB_KERNEL
 
 template <typename T>
-void sub_same_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_same_neon(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     /** SIMD vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
@@ -68,7 +71,7 @@
     Iterator input2(src1, window.broadcast_if_dimension_le_one(src1->info()->tensor_shape()));
     Iterator output(dst, window);
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool     is_broadcast_input_2 = input2_win.x().step() == 0;
         Window         broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -84,41 +87,44 @@
         Iterator output(dst, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<T *>(output.ptr());
-
-            const T    broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
-            const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
-
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
-                auto       res             = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v) : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
-                if(is_broadcast_input_2)
-                {
-                    res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
-                }
-                wrapper::vstore(output_ptr + x, res);
-            }
+                const auto non_broadcast_input_ptr = reinterpret_cast<const T *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<T *>(output.ptr());
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
-                auto       res             = is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
-                if(is_broadcast_input_2)
+                const T    broadcast_value     = *reinterpret_cast<const T *>(broadcast_input.ptr());
+                const auto broadcast_value_vec = wrapper::vdup_n(broadcast_value, ExactTagType{});
+
+                // Compute S elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    res = static_cast<T>(-1) * res;
+                    const auto non_broadcast_v = wrapper::vloadq(non_broadcast_input_ptr + x);
+                    auto       res             = is_sat ? wrapper::vqsub(broadcast_value_vec, non_broadcast_v)
+                                                        : wrapper::vsub(broadcast_value_vec, non_broadcast_v);
+                    if (is_broadcast_input_2)
+                    {
+                        res = wrapper::vmul(res, wrapper::vdup_n(static_cast<T>(-1), ExactTagType{}));
+                    }
+                    wrapper::vstore(output_ptr + x, res);
                 }
 
-                *(output_ptr + x) = res;
-            }
-        },
-        broadcast_input, non_broadcast_input, output);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const auto non_broadcast_v = *(non_broadcast_input_ptr + x);
+                    auto       res =
+                        is_sat ? wrapper::sub_sat(broadcast_value, non_broadcast_v) : broadcast_value - non_broadcast_v;
+                    if (is_broadcast_input_2)
+                    {
+                        res = static_cast<T>(-1) * res;
+                    }
+
+                    *(output_ptr + x) = res;
+                }
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -131,31 +137,32 @@
         Iterator output(dst, win);
 
         execute_window_loop(
-            win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<T *>(output.ptr());
-
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+            win,
+            [&](const Coordinates &)
             {
-                const auto val1 = wrapper::vloadq(input1_ptr + x);
-                const auto val2 = wrapper::vloadq(input2_ptr + x);
-                const auto res  = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
-                wrapper::vstore(output_ptr + x, res);
-            }
+                const auto input1_ptr = reinterpret_cast<const T *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const T *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<T *>(output.ptr());
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const auto val1   = *(input1_ptr + x);
-                const auto val2   = *(input2_ptr + x);
-                *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
-            }
-        },
-        input1, input2, output);
+                // Compute S elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
+                {
+                    const auto val1 = wrapper::vloadq(input1_ptr + x);
+                    const auto val2 = wrapper::vloadq(input2_ptr + x);
+                    const auto res  = is_sat ? wrapper::vqsub(val1, val2) : wrapper::vsub(val1, val2);
+                    wrapper::vstore(output_ptr + x, res);
+                }
+
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const auto val1   = *(input1_ptr + x);
+                    const auto val2   = *(input2_ptr + x);
+                    *(output_ptr + x) = is_sat ? wrapper::sub_sat(val1, val2) : val1 - val2;
+                }
+            },
+            input1, input2, output);
     }
 }
 } // namespace cpu
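
sub_same_neon above follows the standard NEON loop shape: a full-vector main loop stepping window_step_x elements, then a scalar pass over the leftover lanes. Stripped of iterators and broadcasting, the shape is (a sketch, with sub_f32 as an illustrative name):

#if defined(__ARM_NEON)
#include <arm_neon.h>

void sub_f32(const float *a, const float *b, float *dst, int len)
{
    const int step = 4; // float32x4_t holds 4 lanes
    int       x    = 0;
    // Compute 4 elements per iteration
    for (; x <= len - step; x += step)
    {
        vst1q_f32(dst + x, vsubq_f32(vld1q_f32(a + x), vld1q_f32(b + x)));
    }
    // Compute left-over elements
    for (; x < len; ++x)
    {
        dst[x] = a[x] - b[x];
    }
}
#endif
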
diff --git a/src/cpu/kernels/sub/neon/qasymm8.cpp b/src/cpu/kernels/sub/neon/qasymm8.cpp
index ea6e582..b750afc 100644
--- a/src/cpu/kernels/sub/neon/qasymm8.cpp
+++ b/src/cpu/kernels/sub/neon/qasymm8.cpp
@@ -23,21 +23,24 @@
  */
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/cpu/kernels/add/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void sub_qasymm8_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qasymm8_neon_fixedpoint(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     add_sub_q8_neon_fixedpoint<uint8_t>(src0, src1, dst, policy, window, false /*is_addition*/);
 }
 
-void sub_qasymm8_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qasymm8_neon(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     add_sub_qasymm8_neon(src0, src1, dst, policy, window, false /*is_addition*/);
 }
 
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp
index a86c7f2..fb0bb62 100644
--- a/src/cpu/kernels/sub/neon/qasymm8_signed.cpp
+++ b/src/cpu/kernels/sub/neon/qasymm8_signed.cpp
@@ -24,21 +24,24 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/cpu/kernels/add/generic/neon/impl.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void sub_qasymm8_signed_neon_fixedpoint(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qasymm8_signed_neon_fixedpoint(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     add_sub_q8_neon_fixedpoint<int8_t>(src0, src1, dst, policy, window, false /*is_addition*/);
 }
 
-void sub_qasymm8_signed_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qasymm8_signed_neon(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     add_sub_qasymm8_signed_neon(src0, src1, dst, policy, window, false /*is_addition*/);
 }
 
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/kernels/sub/neon/qsymm16.cpp b/src/cpu/kernels/sub/neon/qsymm16.cpp
index 4dfdc0e..23e4b03 100644
--- a/src/cpu/kernels/sub/neon/qsymm16.cpp
+++ b/src/cpu/kernels/sub/neon/qsymm16.cpp
@@ -25,14 +25,16 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/Traits.h"
-#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void sub_qsymm16_neon(const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
+void sub_qsymm16_neon(
+    const ITensor *src0, const ITensor *src1, ITensor *dst, const ConvertPolicy &policy, const Window &window)
 {
     ARM_COMPUTE_UNUSED(policy);
 
@@ -57,7 +59,7 @@
     const float32x4_t vscale2    = vdupq_n_f32(iq2_info.scale);
     const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
 
-    if(is_broadcast_across_x)
+    if (is_broadcast_across_x)
     {
         const bool                    is_broadcast_input_2 = input2_win.x().step() == 0;
         Window                        broadcast_win        = is_broadcast_input_2 ? input2_win : input1_win;
@@ -65,7 +67,7 @@
         const ITensor                *broadcast_tensor     = is_broadcast_input_2 ? src1 : src0;
         const ITensor                *non_broadcast_tensor = !is_broadcast_input_2 ? src1 : src0;
         const UniformQuantizationInfo broadcast_qinfo      = broadcast_tensor->info()->quantization_info().uniform();
-        const UniformQuantizationInfo non_broadcast_qinfo  = non_broadcast_tensor->info()->quantization_info().uniform();
+        const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
 
         // Clear X Dimension on execution window as we handle manually
         non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -74,61 +76,62 @@
         Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win);
         Iterator output(dst, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
-            const auto output_ptr              = reinterpret_cast<int16_t *>(output.ptr());
-
-            const int16_t   broadcast_value     = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
-            const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
-
-            const float32x4x2_t bf =
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                {
-                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
-                    vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
-                }
-            };
-            const float bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
+                const auto non_broadcast_input_ptr = reinterpret_cast<const int16_t *>(non_broadcast_input.ptr());
+                const auto output_ptr              = reinterpret_cast<int16_t *>(output.ptr());
 
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
-            {
-                const int16x8_t     a = vld1q_s16(non_broadcast_input_ptr + x);
-                const float32x4x2_t af =
+                const int16_t   broadcast_value     = *reinterpret_cast<const int16_t *>(broadcast_input.ptr());
+                const int16x8_t broadcast_value_vec = vdupq_n_s16(broadcast_value);
+
+                const float32x4x2_t bf  = {{
+                     vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(broadcast_value_vec))), vscale2),
+                     vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(broadcast_value_vec))), vscale2),
+                }};
+                const float         bfs = static_cast<int32_t>(broadcast_value) * broadcast_qinfo.scale;
+
+                // Compute S elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
+                    const int16x8_t     a  = vld1q_s16(non_broadcast_input_ptr + x);
+                    const float32x4x2_t af = {{
                         vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
                         vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
-                    }
-                };
+                    }};
 
-                const int32x4x4_t rf =
-                {
-                    {
+                    const int32x4x4_t rf = {{
 #ifdef __aarch64__
-                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
-                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0])
+                                                                      : vsubq_f32(af.val[0], bf.val[0]),
+                                                 invvscaleo)),
+                        vcvtnq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1])
+                                                                      : vsubq_f32(af.val[1], bf.val[1]),
+                                                 invvscaleo)),
 #else  //__aarch64__
-                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
-                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0])
+                                                                     : vsubq_f32(af.val[0], bf.val[0]),
+                                                invvscaleo)),
+                        vcvtq_s32_f32(vmulq_f32(is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1])
+                                                                     : vsubq_f32(af.val[1], bf.val[1]),
+                                                invvscaleo)),
 #endif //__aarch64__
-                    }
-                };
+                    }};
 
-                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
-                vst1q_s16(output_ptr + x, pa);
-            }
+                    const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
+                    vst1q_s16(output_ptr + x, pa);
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const float afs   = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
-                *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info);
-            }
-        },
-        broadcast_input, non_broadcast_input, output);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x)) * non_broadcast_qinfo.scale;
+                    *(output_ptr + x) = quantize_qsymm16(is_broadcast_input_2 ? (bfs - afs) : (afs - bfs), oq_info);
+                }
+            },
+            broadcast_input, non_broadcast_input, output);
     }
     else
     {
@@ -140,38 +143,32 @@
         Iterator input2(src1, input2_win);
         Iterator output(dst, win);
 
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
-            const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
-            const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
-
-            // Compute S elements per iteration
-            int x = window_start_x;
-            for(; x <= (window_end_x - window_step_x); x += window_step_x)
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
             {
-                const int16x8_t a = vld1q_s16(input1_ptr + x);
-                const int16x8_t b = vld1q_s16(input2_ptr + x);
+                const auto input1_ptr = reinterpret_cast<const int16_t *>(input1.ptr());
+                const auto input2_ptr = reinterpret_cast<const int16_t *>(input2.ptr());
+                const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
 
-                const float32x4x2_t af =
+                // Compute S elements per iteration
+                int x = window_start_x;
+                for (; x <= (window_end_x - window_step_x); x += window_step_x)
                 {
-                    {
+                    const int16x8_t a = vld1q_s16(input1_ptr + x);
+                    const int16x8_t b = vld1q_s16(input2_ptr + x);
+
+                    const float32x4x2_t af = {{
                         vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(a))), vscale1),
                         vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(a))), vscale1),
-                    }
-                };
+                    }};
 
-                const float32x4x2_t bf =
-                {
-                    {
+                    const float32x4x2_t bf = {{
                         vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_low_s16(b))), vscale2),
                         vmulq_f32(vcvtq_f32_s32(vmovl_s16(vget_high_s16(b))), vscale2),
-                    }
-                };
+                    }};
 
-                const int32x4x2_t rf =
-                {
-                    {
+                    const int32x4x2_t rf = {{
 #ifdef __aarch64__
                         vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                         vcvtnq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
@@ -179,23 +176,22 @@
                         vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
                         vcvtq_s32_f32(vmulq_f32(vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
 #endif //__aarch64__
-                    }
-                };
+                    }};
 
-                const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
-                vst1q_s16(output_ptr + x, pa);
-            }
+                    const int16x8_t pa = vcombine_s16(vqmovn_s32(rf.val[0]), vqmovn_s32(rf.val[1]));
+                    vst1q_s16(output_ptr + x, pa);
+                }
 
-            // Compute left-over elements
-            for(; x < window_end_x; ++x)
-            {
-                const float afs   = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
-                const float bfs   = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
-                *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info());
-            }
-        },
-        input1, input2, output);
+                // Compute left-over elements
+                for (; x < window_end_x; ++x)
+                {
+                    const float afs   = static_cast<int32_t>((*(input1_ptr + x))) * iq1_info.scale;
+                    const float bfs   = static_cast<int32_t>((*(input2_ptr + x))) * iq2_info.scale;
+                    *(output_ptr + x) = quantize_qsymm16((afs - bfs), dst->info()->quantization_info());
+                }
+            },
+            input1, input2, output);
     }
 }
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
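
The QSYMM16 path works in dequantized space: each input is scaled to float, the subtraction happens there, and the result is requantized with the output scale. A scalar sketch of that round trip (an illustration, not the library's quantize_qsymm16):

#include <algorithm>
#include <cmath>
#include <cstdint>

int16_t sub_qsymm16_scalar(int16_t a, float scale_a, int16_t b, float scale_b, float scale_out)
{
    const float diff = a * scale_a - b * scale_b;     // dequantize and subtract
    const long  q    = std::lround(diff / scale_out); // requantize
    return static_cast<int16_t>(std::min(std::max(q, -32768L), 32767L)); // saturate
}
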
diff --git a/src/cpu/operators/CpuActivation.cpp b/src/cpu/operators/CpuActivation.cpp
index 197e985..44d70cf 100644
--- a/src/cpu/operators/CpuActivation.cpp
+++ b/src/cpu/operators/CpuActivation.cpp
@@ -24,6 +24,7 @@
 #include "src/cpu/operators/CpuActivation.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/IOperator.h"
 #include "src/common/utils/LegacySupport.h"
 #include "src/common/utils/Log.h"
@@ -42,7 +43,8 @@
     _kernel = std::move(k);
 }
 
-Status CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
+Status
+CpuActivation::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &activation_info)
 {
     return kernels::CpuActivationKernel::validate(input, output, activation_info);
 }
@@ -54,13 +56,17 @@
     NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
 }
 
-std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate)
+std::tuple<IOperator *, StatusCode> CpuContext::create_activation(const AclTensorDescriptor     &src,
+                                                                  const AclTensorDescriptor     &dst,
+                                                                  const AclActivationDescriptor &act,
+                                                                  bool                           is_validate)
 {
     TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
     TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
     auto       info     = detail::convert_to_activation_info(act);
 
-    if(is_validate && !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
+    if (is_validate &&
+        !bool(CpuActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
     {
         return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
     }
@@ -69,7 +75,7 @@
     act_op->configure(&src_info, &dst_info, info);
 
     auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
-    if(op == nullptr)
+    if (op == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
         return std::make_tuple(nullptr, StatusCode::OutOfMemory);
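
create_activation follows a validate-first factory shape: reject unsupported configurations before allocating anything, and surface allocation failure through the status half of the returned tuple. A stand-alone sketch with stand-in types (Op and StatusCode here are not the library's):

#include <new>
#include <tuple>

enum class StatusCode { Success, UnsupportedConfig, OutOfMemory };
struct Op { };

std::tuple<Op *, StatusCode> create_op(bool config_is_valid)
{
    if (!config_is_valid)
    {
        return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); // validate first
    }
    auto op = new (std::nothrow) Op(); // allocate only after validation passes
    if (op == nullptr)
    {
        return std::make_tuple(nullptr, StatusCode::OutOfMemory);
    }
    return std::make_tuple(op, StatusCode::Success);
}
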
diff --git a/src/cpu/operators/CpuActivation.h b/src/cpu/operators/CpuActivation.h
index e21fc7d..ec442f9 100644
--- a/src/cpu/operators/CpuActivation.h
+++ b/src/cpu/operators/CpuActivation.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_ACTIVATION_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/cpu/ICpuOperator.h"
 
 namespace arm_compute
diff --git a/src/cpu/operators/CpuAdd.cpp b/src/cpu/operators/CpuAdd.cpp
index 41def8e..53cd7fa 100644
--- a/src/cpu/operators/CpuAdd.cpp
+++ b/src/cpu/operators/CpuAdd.cpp
@@ -23,17 +23,20 @@
  */
 #include "src/cpu/operators/CpuAdd.h"
 
-#include "src/cpu/kernels/CpuAddKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "src/common/utils/Log.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuAddKernel.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void CpuAdd::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuAdd::configure(const ITensorInfo         *src0,
+                       const ITensorInfo         *src1,
+                       ITensorInfo               *dst,
+                       ConvertPolicy              policy,
+                       const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy, act_info);
@@ -42,7 +45,11 @@
     _kernel = std::move(k);
 }
 
-Status CpuAdd::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuAdd::validate(const ITensorInfo         *src0,
+                        const ITensorInfo         *src1,
+                        const ITensorInfo         *dst,
+                        ConvertPolicy              policy,
+                        const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return kernels::CpuAddKernel::validate(src0, src1, dst, policy);
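
CpuAdd is a thin operator: configure() just builds the kernel it owns, and validate() forwards to the kernel's static check without creating any state. The skeleton, with stand-in types for ITensorInfo and the kernel:

#include <memory>
#include <utility>

struct TensorDesc { int dims{0}; };
struct Status { bool ok{true}; };

struct AddKernel
{
    static Status validate(const TensorDesc *src, const TensorDesc *dst)
    {
        return Status{src != nullptr && dst != nullptr};
    }
    void configure(const TensorDesc *, TensorDesc *) {}
};

class AddOperator
{
public:
    void configure(const TensorDesc *src, TensorDesc *dst)
    {
        auto k = std::make_unique<AddKernel>();
        k->configure(src, dst);
        _kernel = std::move(k);
    }
    // Stateless: delegates straight to the kernel's static check
    static Status validate(const TensorDesc *src, const TensorDesc *dst)
    {
        return AddKernel::validate(src, dst);
    }

private:
    std::unique_ptr<AddKernel> _kernel{};
};
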
diff --git a/src/cpu/operators/CpuAdd.h b/src/cpu/operators/CpuAdd.h
index db05c10..5f60102 100644
--- a/src/cpu/operators/CpuAdd.h
+++ b/src/cpu/operators/CpuAdd.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_ADD_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/cpu/ICpuOperator.h"
 
 namespace arm_compute
@@ -55,14 +56,22 @@
      * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      *
      */
-    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const ITensorInfo         *src0,
+                   const ITensorInfo         *src1,
+                   ITensorInfo               *dst,
+                   ConvertPolicy              policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref CpuAdd::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src0,
+                           const ITensorInfo         *src1,
+                           const ITensorInfo         *dst,
+                           ConvertPolicy              policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
diff --git a/src/cpu/operators/CpuAddMulAdd.cpp b/src/cpu/operators/CpuAddMulAdd.cpp
index 590ee48..2f19f2f 100644
--- a/src/cpu/operators/CpuAddMulAdd.cpp
+++ b/src/cpu/operators/CpuAddMulAdd.cpp
@@ -21,39 +21,49 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include "src/cpu/operators/CpuAddMulAdd.h"
+
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/kernels/CpuAddMulAddKernel.h"
-#include "src/cpu/operators/CpuAddMulAdd.h"
 #include "src/cpu/utils/CpuAuxTensorHandler.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void CpuAddMulAdd::configure(const ITensorInfo *input1, const ITensorInfo *input2,
-                             const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
-                             ITensorInfo *add_output, ITensorInfo *final_output,
-                             ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuAddMulAdd::configure(const ITensorInfo         *input1,
+                             const ITensorInfo         *input2,
+                             const ITensorInfo         *bn_mul,
+                             const ITensorInfo         *bn_add,
+                             ITensorInfo               *add_output,
+                             ITensorInfo               *final_output,
+                             ConvertPolicy              policy,
+                             const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
 
     auto k = std::make_unique<kernels::CpuAddMulAddKernel>();
 
     const DataType data_type = input1->data_type();
-    if(is_data_type_quantized(data_type))
+    if (is_data_type_quantized(data_type))
     {
         _dequantize_bn_mul.configure(bn_mul, &_dequantized_bn_mul);
         _dequantize_bn_add.configure(bn_add, &_dequantized_bn_add);
 
-        k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy, act_info);
+        k->configure(input1, input2, &_dequantized_bn_mul, &_dequantized_bn_add, add_output, final_output, policy,
+                     act_info);
 
         // Save auxiliary memory requirements after configuration
-        _aux_mem[DequantizedBnMul] = experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary, _dequantized_bn_mul.total_size());
-        _aux_mem[DequantizedBnAdd] = experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary, _dequantized_bn_add.total_size());
+        _aux_mem[DequantizedBnMul] =
+            experimental::MemoryInfo(offset_int_vec(DequantizedBnMul), experimental::MemoryLifetime::Temporary,
+                                     _dequantized_bn_mul.total_size());
+        _aux_mem[DequantizedBnAdd] =
+            experimental::MemoryInfo(offset_int_vec(DequantizedBnAdd), experimental::MemoryLifetime::Temporary,
+                                     _dequantized_bn_add.total_size());
     }
     else
     {
@@ -63,13 +73,17 @@
     _kernel = std::move(k);
 }
 
-Status CpuAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2,
-                              const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
-                              const ITensorInfo *add_output, const ITensorInfo *final_output,
-                              ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuAddMulAdd::validate(const ITensorInfo         *input1,
+                              const ITensorInfo         *input2,
+                              const ITensorInfo         *bn_mul,
+                              const ITensorInfo         *bn_add,
+                              const ITensorInfo         *add_output,
+                              const ITensorInfo         *final_output,
+                              ConvertPolicy              policy,
+                              const ActivationLayerInfo &act_info)
 {
     const DataType data_type = input1->data_type();
-    if(is_data_type_quantized(data_type))
+    if (is_data_type_quantized(data_type))
     {
         TensorInfo dequantized_bn_mul = bn_mul->clone()->set_data_type(DataType::F32);
         TensorInfo dequantized_bn_add = bn_add->clone()->set_data_type(DataType::F32);
@@ -77,11 +91,13 @@
         ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_mul, &dequantized_bn_mul));
         ARM_COMPUTE_RETURN_ON_ERROR(CpuDequantize::validate(bn_add, &dequantized_bn_add));
 
-        return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add, add_output, final_output, policy, act_info);
+        return kernels::CpuAddMulAddKernel::validate(input1, input2, &dequantized_bn_mul, &dequantized_bn_add,
+                                                     add_output, final_output, policy, act_info);
     }
     else
     {
-        return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
+        return kernels::CpuAddMulAddKernel::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy,
+                                                     act_info);
     }
 }
 
@@ -89,37 +105,32 @@
 {
     const DataType data_type = tensors.get_const_tensor(TensorType::ACL_SRC_0)->info()->data_type();
 
-    if(is_data_type_quantized(data_type))
+    if (is_data_type_quantized(data_type))
     {
         const ITensor *bn_mul = tensors.get_const_tensor(TensorType::ACL_SRC_2);
         const ITensor *bn_add = tensors.get_const_tensor(TensorType::ACL_SRC_3);
 
-        CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors, true);
-        CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors, true);
+        CpuAuxTensorHandler dequantized_bn_mul_handler(offset_int_vec(DequantizedBnMul), _dequantized_bn_mul, tensors,
+                                                       true);
+        CpuAuxTensorHandler dequantized_bn_add_handler(offset_int_vec(DequantizedBnAdd), _dequantized_bn_add, tensors,
+                                                       true);
 
-        ITensorPack dequantize_mul_pack =
-        {
-            { TensorType::ACL_SRC_0, bn_mul },
-            { TensorType::ACL_DST_0, dequantized_bn_mul_handler.get() }
-        };
+        ITensorPack dequantize_mul_pack = {{TensorType::ACL_SRC_0, bn_mul},
+                                           {TensorType::ACL_DST_0, dequantized_bn_mul_handler.get()}};
 
-        ITensorPack dequantize_add_pack =
-        {
-            { TensorType::ACL_SRC_0, bn_add },
-            { TensorType::ACL_DST_0, dequantized_bn_add_handler.get() }
-        };
+        ITensorPack dequantize_add_pack = {{TensorType::ACL_SRC_0, bn_add},
+                                           {TensorType::ACL_DST_0, dequantized_bn_add_handler.get()}};
 
         _dequantize_bn_mul.run(dequantize_mul_pack);
         _dequantize_bn_add.run(dequantize_add_pack);
 
-        ITensorPack add_mul_add_pack =
-        {
-            { TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0) },
-            { TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1) },
-            { TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get() },
-            { TensorType::ACL_SRC_3, dequantized_bn_add_handler.get() },
-            { TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0) },
-            { TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1) },
+        ITensorPack add_mul_add_pack = {
+            {TensorType::ACL_SRC_0, tensors.get_const_tensor(TensorType::ACL_SRC_0)},
+            {TensorType::ACL_SRC_1, tensors.get_const_tensor(TensorType::ACL_SRC_1)},
+            {TensorType::ACL_SRC_2, dequantized_bn_mul_handler.get()},
+            {TensorType::ACL_SRC_3, dequantized_bn_add_handler.get()},
+            {TensorType::ACL_DST_0, tensors.get_tensor(TensorType::ACL_DST_0)},
+            {TensorType::ACL_DST_1, tensors.get_tensor(TensorType::ACL_DST_1)},
         };
 
         NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), add_mul_add_pack);
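
The packs built above are the run-time calling convention: instead of fixed parameters, run() receives a small slot-to-tensor map assembled with brace-init pairs. A reduced sketch of the idiom (Tensor, the slot ids, and run here are stand-ins for the ITensorPack machinery):

#include <unordered_map>

struct Tensor { float *data; };

enum TensorSlot { SRC_0, SRC_1, DST_0 };

using TensorPack = std::unordered_map<int, Tensor *>;

void run(const TensorPack &pack)
{
    Tensor *src0 = pack.at(SRC_0);
    Tensor *src1 = pack.at(SRC_1);
    Tensor *dst  = pack.at(DST_0);
    dst->data[0] = src0->data[0] + src1->data[0]; // stand-in for scheduling the kernel
}

// Usage mirrors the packs above:
//   TensorPack pack = {{SRC_0, &a}, {SRC_1, &b}, {DST_0, &out}};
//   run(pack);
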
diff --git a/src/cpu/operators/CpuAddMulAdd.h b/src/cpu/operators/CpuAddMulAdd.h
index cf1ece6..47db75c 100644
--- a/src/cpu/operators/CpuAddMulAdd.h
+++ b/src/cpu/operators/CpuAddMulAdd.h
@@ -42,20 +42,28 @@
      * Similar to @ref NEAddMulAdd::configure()
      *
      */
-    void configure(const ITensorInfo *input1, const ITensorInfo *input2,
-                   const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
-                   ITensorInfo *add_output, ITensorInfo *final_output,
-                   ConvertPolicy policy, const ActivationLayerInfo &act_info);
+    void configure(const ITensorInfo         *input1,
+                   const ITensorInfo         *input2,
+                   const ITensorInfo         *bn_mul,
+                   const ITensorInfo         *bn_add,
+                   ITensorInfo               *add_output,
+                   ITensorInfo               *final_output,
+                   ConvertPolicy              policy,
+                   const ActivationLayerInfo &act_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref CpuAddMulAdd::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input1, const ITensorInfo *input2,
-                           const ITensorInfo *bn_mul, const ITensorInfo *bn_add,
-                           const ITensorInfo *add_output, const ITensorInfo *final_output,
-                           ConvertPolicy policy, const ActivationLayerInfo &act_info);
+    static Status validate(const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *bn_mul,
+                           const ITensorInfo         *bn_add,
+                           const ITensorInfo         *add_output,
+                           const ITensorInfo         *final_output,
+                           ConvertPolicy              policy,
+                           const ActivationLayerInfo &act_info);
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
@@ -77,7 +85,7 @@
     TensorInfo _dequantized_bn_mul{};
     TensorInfo _dequantized_bn_add{};
 
-    experimental::MemoryRequirements _aux_mem{ Count };
+    experimental::MemoryRequirements _aux_mem{Count};
 };
 } // namespace cpu
 } // namespace arm_compute
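
The _aux_mem{Count} member above is the usual workspace bookkeeping: an enum names every auxiliary buffer, Count sizes the table, and configure() records how many bytes each slot needs so callers can provision them before run(). A sketch with a stand-in MemoryInfo:

#include <cstddef>
#include <vector>

struct MemoryInfo
{
    int         slot{-1};
    std::size_t size{0};
};

class OperatorWithWorkspace
{
    enum AuxTensorIdx
    {
        DequantizedMul = 0,
        DequantizedAdd,
        Count // keeps the table sized to the slots above
    };

public:
    void configure(std::size_t mul_bytes, std::size_t add_bytes)
    {
        _aux_mem[DequantizedMul] = MemoryInfo{DequantizedMul, mul_bytes};
        _aux_mem[DequantizedAdd] = MemoryInfo{DequantizedAdd, add_bytes};
    }
    const std::vector<MemoryInfo> &workspace() const { return _aux_mem; }

private:
    std::vector<MemoryInfo> _aux_mem{Count}; // one entry per enum slot
};
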
diff --git a/src/cpu/operators/CpuCast.cpp b/src/cpu/operators/CpuCast.cpp
index 1cfd8c1..55b9204 100644
--- a/src/cpu/operators/CpuCast.cpp
+++ b/src/cpu/operators/CpuCast.cpp
@@ -23,9 +23,8 @@
  */
 #include "src/cpu/operators/CpuCast.h"
 
-#include "src/cpu/kernels/CpuCastKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuCastKernel.h"
 
 namespace arm_compute
 {
diff --git a/src/cpu/operators/CpuConcatenate.cpp b/src/cpu/operators/CpuConcatenate.cpp
index 4021fd8..5f517a8 100644
--- a/src/cpu/operators/CpuConcatenate.cpp
+++ b/src/cpu/operators/CpuConcatenate.cpp
@@ -23,21 +23,20 @@
  */
 #include "src/cpu/operators/CpuConcatenate.h"
 
-#include "src/cpu/kernels/CpuConcatenateBatchKernel.h"
-#include "src/cpu/kernels/CpuConcatenateDepthKernel.h"
-#include "src/cpu/kernels/CpuConcatenateHeightKernel.h"
-#include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/Validate.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
+#include "src/cpu/kernels/CpuConcatenateBatchKernel.h"
+#include "src/cpu/kernels/CpuConcatenateDepthKernel.h"
+#include "src/cpu/kernels/CpuConcatenateHeightKernel.h"
+#include "src/cpu/kernels/CpuConcatenateWidthKernel.h"
 
 namespace arm_compute
 {
@@ -59,9 +58,9 @@
 
     unsigned int offset = 0;
 
-    for(unsigned int i = 0; i < _num_srcs; ++i)
+    for (unsigned int i = 0; i < _num_srcs; ++i)
     {
-        switch(axis)
+        switch (axis)
         {
             case Window::DimX:
             {
@@ -98,16 +97,17 @@
     }
 }
 
-Status CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
+Status
+CpuConcatenate::validate(const std::vector<const ITensorInfo *> &srcs_vector, const ITensorInfo *dst, size_t axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
     ARM_COMPUTE_RETURN_ERROR_ON(srcs_vector.size() < 2);
 
     unsigned int offset = 0;
-    for(const auto &src : srcs_vector)
+    for (const auto &src : srcs_vector)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
-        switch(axis)
+        switch (axis)
         {
             case Window::DimX:
             {
@@ -135,7 +135,7 @@
         offset += src->dimension(axis);
     }
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(srcs_vector, axis);
         ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
@@ -146,18 +146,18 @@
 
 void CpuConcatenate::run(ITensorPack &tensors)
 {
-    if(tensors.empty())
+    if (tensors.empty())
     {
         ARM_COMPUTE_ERROR("No inputs provided");
     }
 
-    if(static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs))
+    if (static_cast<int>(tensors.size() - 1) != static_cast<int>(_num_srcs))
     {
         ARM_COMPUTE_ERROR("Configured with different number of inputs");
     }
 
     int i = 0;
-    for(auto &k : _concat_kernels)
+    for (auto &k : _concat_kernels)
     {
         ITensorPack pack;
         pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
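
CpuConcatenate's configure() and validate() both walk the sources with a running offset: each input is placed at the current offset along the concatenation axis and the offset advances by that input's extent. The same bookkeeping in one dimension, with memcpy standing in for the per-axis kernels (dst is assumed pre-sized to the sum of the sources):

#include <cstddef>
#include <cstring>
#include <vector>

void concatenate_1d(const std::vector<std::vector<float>> &srcs, std::vector<float> &dst)
{
    std::size_t offset = 0;
    for (const auto &src : srcs)
    {
        std::memcpy(dst.data() + offset, src.data(), src.size() * sizeof(float));
        offset += src.size(); // next source starts where this one ends
    }
}
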
diff --git a/src/cpu/operators/CpuConcatenate.h b/src/cpu/operators/CpuConcatenate.h
index eb11926..c36977c 100644
--- a/src/cpu/operators/CpuConcatenate.h
+++ b/src/cpu/operators/CpuConcatenate.h
@@ -68,8 +68,8 @@
 
 private:
     std::vector<std::unique_ptr<ICPPKernel>> _concat_kernels{};
-    unsigned int                             _num_srcs{ 0 };
-    unsigned int                             _axis{ 0 };
+    unsigned int                             _num_srcs{0};
+    unsigned int                             _axis{0};
 };
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuConv2d.cpp b/src/cpu/operators/CpuConv2d.cpp
index 16ac16b..1931173 100644
--- a/src/cpu/operators/CpuConv2d.cpp
+++ b/src/cpu/operators/CpuConv2d.cpp
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "src/cpu/operators/CpuConv2d.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/operators/CpuDirectConv2d.h"
 #include "src/cpu/operators/CpuGemm.h"
@@ -35,26 +37,35 @@
 {
 namespace cpu
 {
-CpuConv2d::CpuConv2d()
-    : _function()
+CpuConv2d::CpuConv2d() : _function()
 {
 }
 
 CpuConv2d::~CpuConv2d() = default;
 
-void CpuConv2d::configure(ITensorInfo *input, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                          const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CpuConv2d::configure(ITensorInfo               *input,
+                          ITensorInfo               *weights,
+                          const ITensorInfo         *biases,
+                          ITensorInfo               *output,
+                          const PadStrideInfo       &conv_info,
+                          const WeightsInfo         &weights_info,
+                          const Size2D              &dilation,
+                          const ActivationLayerInfo &act_info,
+                          bool                       enable_fast_math,
+                          unsigned int               num_groups)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_UNUSED(num_groups);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
-                                                   enable_fast_math, num_groups));
+    ARM_COMPUTE_ERROR_THROW_ON(CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation,
+                                                   act_info, enable_fast_math, num_groups));
 
-    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+                           enable_fast_math, num_groups);
 
     const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
-    switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+    switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+                                              enable_fast_math))
     {
         case ConvolutionMethod::WINOGRAD:
         {
@@ -92,19 +103,30 @@
     _aux_mem = _function->workspace();
 }
 
-Status CpuConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CpuConv2d::validate(const ITensorInfo         *input,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *output,
+                           const PadStrideInfo       &conv_info,
+                           const WeightsInfo         &weights_info,
+                           const Size2D              &dilation,
+                           const ActivationLayerInfo &act_info,
+                           bool                       enable_fast_math,
+                           unsigned int               num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1), "Grouping (num_groups != 1) is not supported on Neon");
 
     const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
-    switch(CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+    switch (CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+                                              enable_fast_math))
     {
         case ConvolutionMethod::WINOGRAD:
-            ARM_COMPUTE_RETURN_ON_ERROR(CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math));
             break;
         case ConvolutionMethod::GEMM:
-            ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math));
+            ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info,
+                                                                dilation, act_info, enable_fast_math));
             break;
         case ConvolutionMethod::GEMM_CONV2D:
             ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmDirectConv2d::validate(input, weights, biases, output, info));
@@ -120,9 +142,14 @@
     return Status{};
 }
 
-ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
-                                                    const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
+ConvolutionMethod CpuConv2d::get_convolution_method(const ITensorInfo         *input,
+                                                    const ITensorInfo         *weights,
+                                                    const ITensorInfo         *output,
+                                                    const PadStrideInfo       &conv_info,
+                                                    const WeightsInfo         &weights_info,
+                                                    const Size2D              &dilation,
+                                                    const ActivationLayerInfo &act_info,
+                                                    bool                       enable_fast_math)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
     ARM_COMPUTE_UNUSED(weights_info);
@@ -137,35 +164,46 @@
     using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo>;
     using ConfigurationMethod      = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
 
-    const std::vector<ConfigurationMethod> known_configs =
-    {
+    const std::vector<ConfigurationMethod> known_configs = {
         // Alexnet
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U)), ConvolutionMethod::GEMM),
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U),
+                                                     PadStrideInfo(1U, 1U, 2U, 2U)),
+                            ConvolutionMethod::GEMM),
         // VGG16 / VGG19
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)), ConvolutionMethod::GEMM),
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U),
+                                                     PadStrideInfo(1U, 1U, 1U, 1U)),
+                            ConvolutionMethod::GEMM),
         // Mobilenet 224
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM),
+        ConfigurationMethod(
+            ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U),
+                                     PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)),
+            ConvolutionMethod::GEMM),
         // Mobilenet 160
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)), ConvolutionMethod::GEMM)
-    };
+        ConfigurationMethod(
+            ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U),
+                                     PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR)),
+            ConvolutionMethod::GEMM)};
 
     const auto find_config = [&](ConfigurationMethod c)
     {
         const ConvolutionConfiguration config = c.first;
         const PadStrideInfo            info   = std::get<3>(config);
 
-        return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
-               && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
-               && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+        return std::get<0>(config) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) &&
+               std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+               std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+               info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+               info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+               info.stride() == conv_info.stride();
     };
 
     std::vector<ConfigurationMethod>::const_iterator found;
-    if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+    if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
     {
         return (*found).second;
     }
 
-    if(dilation != Size2D(1U, 1U))
+    if (dilation != Size2D(1U, 1U))
     {
         return ConvolutionMethod::GEMM;
     }
@@ -173,43 +211,49 @@
     {
         // SRGAN
         // Output might not be initialized when it is an internal tensor of the layer using the convolution
-        if(input->total_size() > 1e7 && (weights->dimension(idx_h) > 7)
-           && (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
+        if (input->total_size() > 1e7 && (weights->dimension(idx_h) > 7) &&
+            (CpuDirectConv2d::validate(input, weights, nullptr, output, conv_info, act_info)))
         {
             return ConvolutionMethod::DIRECT;
         }
-        if(input->dimension(idx_c) < 16)
+        if (input->dimension(idx_c) < 16)
         {
             return ConvolutionMethod::GEMM;
         }
 
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         // This heuristic applies only to the F16 data type on A55r1
-        if(NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math && input->data_type() == DataType::F16)
+        if (NEScheduler::get().cpu_info().get_cpu_model() == CPUModel::A55r1 && enable_fast_math &&
+            input->data_type() == DataType::F16)
         {
             // Exclude known bad Winograd configs (and default to GEMM)
-            const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs =
-            {
+            const std::vector<ConvolutionConfiguration> known_bad_winograd_f16_with_fastmath_configs = {
                 // Squeezenet_V1_1 fire2 and fire3
-                ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U), PadStrideInfo(1U, 1U, 1U, 1U)),
+                ConvolutionConfiguration(Size2D(56U, 56U), Size2D(3U, 3U), Size2D(16U, 64U),
+                                         PadStrideInfo(1U, 1U, 1U, 1U)),
                 // Squeezenet_V1_1 fire6 and fire7
-                ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U), PadStrideInfo(1U, 1U, 1U, 1U)),
+                ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(48U, 192U),
+                                         PadStrideInfo(1U, 1U, 1U, 1U)),
                 // Squeezenet_V1_1 fire8 and fire9
-                ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U), PadStrideInfo(1U, 1U, 1U, 1U)),
+                ConvolutionConfiguration(Size2D(14U, 14U), Size2D(3U, 3U), Size2D(64U, 256U),
+                                         PadStrideInfo(1U, 1U, 1U, 1U)),
             };
             const auto find_conv_config = [&](ConvolutionConfiguration c)
             {
                 const PadStrideInfo info = std::get<3>(c);
 
-                return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) && std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
-                       && std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
-                       && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride();
+                return std::get<0>(c) == Size2D(input->dimension(idx_w), input->dimension(idx_h)) &&
+                       std::get<1>(c) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+                       std::get<2>(c) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+                       info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+                       info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+                       info.stride() == conv_info.stride();
             };
 
-            bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(), known_bad_winograd_f16_with_fastmath_configs.end(),
-                                          find_conv_config)
-                             != known_bad_winograd_f16_with_fastmath_configs.end();
-            if(found_bad)
+            bool found_bad = std::find_if(known_bad_winograd_f16_with_fastmath_configs.begin(),
+                                          known_bad_winograd_f16_with_fastmath_configs.end(),
+                                          find_conv_config) != known_bad_winograd_f16_with_fastmath_configs.end();
+            if (found_bad)
             {
                 return ConvolutionMethod::GEMM;
             }
@@ -217,16 +261,16 @@
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
         // For 1x1 convolutions run the default GEMM
-        if(weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
+        if (weights->dimension(idx_w) == 1 && weights->dimension(idx_h) == 1)
         {
             return ConvolutionMethod::GEMM;
         }
 
-        if(bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
+        if (bool(CpuWinogradConv2d::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)))
         {
             return ConvolutionMethod::WINOGRAD;
         }
-        if(bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
+        if (bool(CpuGemmDirectConv2d::validate(input, weights, nullptr, output, info)))
         {
             return ConvolutionMethod::GEMM_CONV2D;
         }
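
Reviewer note (not part of the patch): the reflowed get_convolution_method above is now readable as a plain cascade. The standalone sketch below restates the ordering of its checks with simplified stand-in types; the known-config lookup table and the A55r1/F16 fast-math exclusion are omitted, and each *_ok flag is a hypothetical stand-in for the corresponding static validate() call succeeding. Illustrative only, not library code.

    #include <cstddef>

    enum class Method { GEMM, GEMM_CONV2D, DIRECT, WINOGRAD };

    struct Shape { std::size_t w; std::size_t h; std::size_t c; std::size_t total; };

    // Restates the ordering of the checks in get_convolution_method.
    Method pick_method(const Shape &src, const Shape &weights, bool dilated,
                       bool direct_ok, bool winograd_ok, bool gemm_conv2d_ok)
    {
        if (dilated)
            return Method::GEMM; // dilation is only handled by the GEMM path
        if (src.total > 1e7 && weights.h > 7 && direct_ok)
            return Method::DIRECT; // very large inputs with tall kernels (the SRGAN case)
        if (src.c < 16)
            return Method::GEMM; // few input channels: im2col GEMM wins
        if (weights.w == 1 && weights.h == 1)
            return Method::GEMM; // 1x1 convolution is a plain matrix multiply
        if (winograd_ok)
            return Method::WINOGRAD; // first choice whenever it validates
        if (gemm_conv2d_ok)
            return Method::GEMM_CONV2D; // fused assembly GEMM as second choice
        return Method::GEMM; // safe fallback
    }
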
diff --git a/src/cpu/operators/CpuConv2d.h b/src/cpu/operators/CpuConv2d.h
index 0908ac0..71b9e15 100644
--- a/src/cpu/operators/CpuConv2d.h
+++ b/src/cpu/operators/CpuConv2d.h
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuOperator.h"
 
@@ -102,17 +103,32 @@
      *                              available which may introduce a drop in accuracy as well. Default is false
      * @param[in]  num_groups       (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
      */
-    void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
-                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+    void configure(ITensorInfo               *src,
+                   ITensorInfo               *weights,
+                   const ITensorInfo         *biases,
+                   ITensorInfo               *dst,
+                   const PadStrideInfo       &conv_info,
+                   const WeightsInfo         &weights_info     = WeightsInfo(),
+                   const Size2D              &dilation         = Size2D(1U, 1U),
+                   const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                   bool                       enable_fast_math = false,
+                   unsigned int               num_groups       = 1);
     /** Static function to check if given info will lead to a valid configuration of @ref CpuConv2d
      *
      * Similar to CpuConv2d::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false,
-                           unsigned int num_groups = 1);
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *output,
+                           const PadStrideInfo       &conv_info,
+                           const WeightsInfo         &weights_info     = WeightsInfo(),
+                           const Size2D              &dilation         = Size2D(1U, 1U),
+                           const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                           bool                       enable_fast_math = false,
+                           unsigned int               num_groups       = 1);
     /** Static function to check if given info will return the convolution called by @ref CpuConv2d
      *
      * @param[in] src              Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
@@ -132,11 +148,17 @@
      *
      * @return the Convolution Method Hint
      */
-    static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info,
-                                                    const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+    static ConvolutionMethod get_convolution_method(const ITensorInfo         *src,
+                                                    const ITensorInfo         *weights,
+                                                    const ITensorInfo         *dst,
+                                                    const PadStrideInfo       &conv_info,
+                                                    const WeightsInfo         &weights_info     = WeightsInfo(),
+                                                    const Size2D              &dilation         = Size2D(1U, 1U),
+                                                    const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                                                    bool                       enable_fast_math = false);
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &constants) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &constants) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
index 810ffb1..49e3192 100644
--- a/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
+++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.cpp
@@ -24,6 +24,7 @@
 #include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/kernels/CpuConvertFullyConnectedWeightsKernel.h"
 
@@ -31,7 +32,10 @@
 {
 namespace cpu
 {
-void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+void CpuConvertFullyConnectedWeights::configure(const ITensorInfo *src,
+                                                ITensorInfo       *dst,
+                                                const TensorShape &original_src_shape,
+                                                DataLayout         data_layout)
 {
     ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout);
     auto k = std::make_unique<kernels::CpuConvertFullyConnectedWeightsKernel>();
@@ -39,7 +43,10 @@
     _kernel = std::move(k);
 }
 
-Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+Status CpuConvertFullyConnectedWeights::validate(const ITensorInfo *src,
+                                                 const ITensorInfo *dst,
+                                                 const TensorShape &original_src_shape,
+                                                 DataLayout         data_layout)
 {
     return kernels::CpuConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
 }
@@ -48,5 +55,5 @@
 {
     NEScheduler::get().schedule_op(_kernel.get(), Window::DimZ, _kernel->window(), tensors);
 }
-} // namesapce cpu
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuConvertFullyConnectedWeights.h b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
index ea70eee..e208cca 100644
--- a/src/cpu/operators/CpuConvertFullyConnectedWeights.h
+++ b/src/cpu/operators/CpuConvertFullyConnectedWeights.h
@@ -41,14 +41,18 @@
      * @param[in]  original_src_shape Shape of the original src tensor (the one entering the fully connected layer).
      * @param[in]  data_layout        The data layout the weights have been trained in.
      */
-    void configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+    void
+    configure(const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref CpuConvertFullyConnectedWeights::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *dst,
+                           const TensorShape &original_src_shape,
+                           DataLayout         data_layout);
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
 };
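
Reviewer note (not part of the patch): for readers unfamiliar with why fully connected weights need converting at all, the sketch below shows the underlying index arithmetic. Flattening a [W,H,C] feature map yields a different element order under NCHW and NHWC, so the wrapped kernel has to re-index the weight matrix rows whenever the training and inference layouts differ. The helper name and layout conventions below are a hypothetical illustration, not the kernel's implementation.

    #include <cstddef>

    // Maps the flat position of an element in an NHWC-flattened [W,H,C]
    // feature map to its flat position under NCHW flattening. Each row of
    // the fully connected weight matrix is re-indexed with a mapping of
    // this kind when the source layout changes.
    std::size_t nhwc_to_nchw_flat(std::size_t flat, std::size_t W, std::size_t H, std::size_t C)
    {
        const std::size_t c = flat % C;       // NHWC stores channels fastest
        const std::size_t w = (flat / C) % W;
        const std::size_t h = flat / (C * W);
        return c * (H * W) + h * W + w;       // NCHW stores width fastest
    }
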
diff --git a/src/cpu/operators/CpuCopy.cpp b/src/cpu/operators/CpuCopy.cpp
index 7420ff6..92c19d4 100644
--- a/src/cpu/operators/CpuCopy.cpp
+++ b/src/cpu/operators/CpuCopy.cpp
@@ -23,9 +23,8 @@
  */
 #include "src/cpu/operators/CpuCopy.h"
 
-#include "src/cpu/kernels/CpuCopyKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuCopyKernel.h"
 
 namespace arm_compute
 {
diff --git a/src/cpu/operators/CpuDepthwiseConv2d.cpp b/src/cpu/operators/CpuDepthwiseConv2d.cpp
index 884fe5c..54075f2 100644
--- a/src/cpu/operators/CpuDepthwiseConv2d.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2d.cpp
@@ -24,10 +24,11 @@
 #include "src/cpu/operators/CpuDepthwiseConv2d.h"
 
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
 
@@ -37,11 +38,16 @@
 {
 namespace
 {
-Status validate_arguments_optimized(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status validate_arguments_optimized(const ITensorInfo     *src,
+                                    const ITensorInfo     *weights,
+                                    const ITensorInfo     *biases,
+                                    const ITensorInfo     *dst,
+                                    const ConvolutionInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    if(!is_data_type_quantized_per_channel(weights->data_type()))
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    if (!is_data_type_quantized_per_channel(weights->data_type()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
     }
@@ -49,14 +55,17 @@
     ARM_COMPUTE_RETURN_ERROR_ON(info.dilation.x() < 1 || info.dilation.y() < 1);
     const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) > src->dimension(idx_w) + info.pad_stride_info.pad_left() +
-                                info.pad_stride_info.pad_right());
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) > src->dimension(idx_h) + info.pad_stride_info.pad_top() +
-                                info.pad_stride_info.pad_bottom());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (info.dilation.x() - 1) >
+                                src->dimension(idx_w) + info.pad_stride_info.pad_left() +
+                                    info.pad_stride_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (info.dilation.y() - 1) >
+                                src->dimension(idx_h) + info.pad_stride_info.pad_top() +
+                                    info.pad_stride_info.pad_bottom());
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
-        const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
+        const unsigned int channel_idx =
+            get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(channel_idx));
     }
@@ -64,7 +73,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, biases, dst, info));
 
     // Validate Activation Layer
-    if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
+    if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
     }
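
Reviewer note (not part of the patch): the two wrapped ARM_COMPUTE_RETURN_ERROR_ON checks above are easier to read once the left-hand side is recognised as the effective extent of a dilated kernel. A one-line restatement:

    #include <cstddef>

    // A kernel with k taps and dilation d has d - 1 implicit zeros between
    // taps, so it spans k + (k - 1) * (d - 1) input elements. The checks
    // above reject kernels whose effective extent exceeds the padded input
    // size in each spatial dimension.
    std::size_t dilated_extent(std::size_t k, std::size_t d)
    {
        return k + (k - 1) * (d - 1);
    }
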
@@ -80,8 +89,8 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases,
-                                                                             dst, info));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, (biases == nullptr) ? nullptr : biases, dst, info));
 
     _is_quantized      = is_data_type_quantized_asymmetric(src->data_type());
     _has_bias          = biases != nullptr;
@@ -91,10 +100,11 @@
     _are_weights_const = weights->are_values_constant();
 
     // Configure pipeline
-    _is_activationlayer_enabled = info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
+    _is_activationlayer_enabled =
+        info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info);
 
     _dwc_optimized_func = std::make_unique<CpuDepthwiseConv2dAssemblyDispatch>();
-    if(_is_nchw)
+    if (_is_nchw)
     {
         _permute_input   = std::make_unique<cpu::CpuPermute>();
         _permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -128,7 +138,7 @@
     }
 
     // Configure activation
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activationlayer_function = std::make_unique<cpu::CpuActivation>();
         _activationlayer_function->configure(dst, nullptr, info.act_info);
@@ -155,7 +165,7 @@
     auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
 
     // Permute input
-    if(_permute)
+    if (_permute)
     {
         ITensorPack pack;
         auto        src      = tensors.get_const_tensor(TensorType::ACL_SRC_0);
@@ -166,7 +176,7 @@
     }
 
     // Run assembly function
-    if(_is_nchw)
+    if (_is_nchw)
     {
         auto src_perm     = tensors.get_tensor(TensorType::ACL_INT_0);
         auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
@@ -198,7 +208,7 @@
     }
 
     // Permute output
-    if(_is_nchw)
+    if (_is_nchw)
     {
         ITensorPack pack;
         auto        dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
@@ -208,7 +218,7 @@
     }
 
     // Run activation
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         ITensorPack pack;
         pack.add_tensor(TensorType::ACL_SRC, dst);
@@ -221,7 +231,7 @@
 {
     // If the weights are not constant, we need to repack so that they
     // can be updated in place
-    if(!_are_weights_const)
+    if (!_are_weights_const)
     {
         auto weights        = tensors.get_const_tensor(TensorType::ACL_SRC_1);
         auto bias           = tensors.get_const_tensor(TensorType::ACL_SRC_2);
@@ -238,14 +248,14 @@
         return;
     }
 
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         auto weights        = tensors.get_const_tensor(TensorType::ACL_SRC_1);
         auto bias           = tensors.get_const_tensor(TensorType::ACL_SRC_2);
         auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_4);
 
         // Permute weights
-        if(_permute)
+        if (_permute)
         {
             auto permuted_weights = tensors.get_tensor(TensorType::ACL_INT_1);
 
@@ -279,11 +289,15 @@
     }
 }
 
-void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::configure(ITensorInfo           *src,
+                                                              const ITensorInfo     *weights,
+                                                              const ITensorInfo     *biases,
+                                                              ITensorInfo           *dst,
+                                                              const ConvolutionInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases,
-                                                            dst, info));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        CpuDepthwiseConv2d::validate(src, weights, (biases == nullptr) ? nullptr : biases, dst, info));
 
     _is_nchw     = src->data_layout() == DataLayout::NCHW;
     _is_prepared = !_is_nchw;
@@ -294,9 +308,10 @@
 
     auto input_perm   = std::make_unique<TensorInfo>();
     auto weights_perm = std::make_unique<TensorInfo>();
-    auto output_perm  = std::make_unique<TensorInfo>(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+    auto output_perm  = std::make_unique<TensorInfo>(
+        dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
 
-    if(_is_nchw)
+    if (_is_nchw)
     {
         _permute_input   = std::make_unique<cpu::CpuPermute>();
         _permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -315,7 +330,7 @@
     _depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
     _depthwise_conv_kernel->configure(input_to_use, weights_to_use, biases, output_to_use, info);
 
-    if(_is_nchw)
+    if (_is_nchw)
     {
         _permute_output = std::make_unique<cpu::CpuPermute>();
         _permute_output->configure(output_perm.get(), dst, PermutationVector(1U, 2U, 0U));
@@ -324,43 +339,61 @@
 
     // Configure Activation Layer
     _is_activationlayer_enabled = info.act_info.enabled();
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activationlayer_function = std::make_unique<cpu::CpuActivation>();
         _activationlayer_function->configure(dst, nullptr, info.act_info);
     }
 }
 
-Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+Status CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::validate(const ITensorInfo     *src,
+                                                               const ITensorInfo     *weights,
+                                                               const ITensorInfo     *biases,
+                                                               const ITensorInfo     *dst,
                                                                const ConvolutionInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
-    if(src->data_layout() == DataLayout::NCHW)
+    if (src->data_layout() == DataLayout::NCHW)
     {
         TensorShape permuted_input_shape   = src->tensor_shape();
         TensorShape permuted_weights_shape = weights->tensor_shape();
-        TensorShape permuted_output_shape  = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
+        TensorShape permuted_output_shape =
+            misc::shape_calculator::compute_depthwise_convolution_shape(*src, *weights, info);
         permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
         permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
         permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
 
-        const TensorInfo permuted_input   = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC));
-        const TensorInfo permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC));
-        const TensorInfo permuted_output  = TensorInfo(dst->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NCHW));
+        const TensorInfo permuted_input   = TensorInfo(src->clone()
+                                                           ->set_is_resizable(true)
+                                                           .reset_padding()
+                                                           .set_tensor_shape(permuted_input_shape)
+                                                           .set_data_layout(DataLayout::NHWC));
+        const TensorInfo permuted_weights = TensorInfo(weights->clone()
+                                                           ->set_is_resizable(true)
+                                                           .reset_padding()
+                                                           .set_tensor_shape(permuted_weights_shape)
+                                                           .set_data_layout(DataLayout::NHWC));
+        const TensorInfo permuted_output  = TensorInfo(dst->clone()
+                                                           ->set_is_resizable(true)
+                                                           .reset_padding()
+                                                           .set_tensor_shape(permuted_output_shape)
+                                                           .set_data_layout(DataLayout::NCHW));
 
         ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &permuted_input, PermutationVector(2U, 0U, 1U)));
         ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
         ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&permuted_output, dst, PermutationVector(1U, 2U, 0U)));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output, info));
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(
+            &permuted_input, &permuted_weights, biases, &permuted_output, info));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            cpu::kernels::CpuDepthwiseConv2dNativeKernel::validate(src, weights, biases, dst, info));
     }
 
     // Validate Activation Layer
-    if(info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
+    if (info.act_info.enabled() && !CpuDepthwiseConv2dAssemblyDispatch::is_activation_supported(info.act_info))
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, info.act_info));
     }
@@ -375,7 +408,7 @@
     auto biases  = tensors.get_const_tensor(TensorType::ACL_SRC_2);
     auto dst     = tensors.get_tensor(TensorType::ACL_DST_0);
 
-    if(_is_nchw)
+    if (_is_nchw)
     {
         prepare(tensors);
         auto src_perm     = tensors.get_tensor(TensorType::ACL_INT_0);
@@ -392,7 +425,8 @@
         pack_depth.add_const_tensor(TensorType::ACL_SRC_1, weights_perm);
         pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
         pack_depth.add_tensor(TensorType::ACL_DST, dst_perm);
-        NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+        NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(),
+                                       pack_depth);
     }
     else
     {
@@ -401,10 +435,11 @@
         pack_depth.add_tensor(TensorType::ACL_SRC_1, weights);
         pack_depth.add_tensor(TensorType::ACL_SRC_2, biases);
         pack_depth.add_tensor(TensorType::ACL_DST, dst);
-        NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(), pack_depth);
+        NEScheduler::get().schedule_op(_depthwise_conv_kernel.get(), Window::DimY, _depthwise_conv_kernel->window(),
+                                       pack_depth);
     }
 
-    if(_is_nchw)
+    if (_is_nchw)
     {
         ITensorPack pack;
         auto        dst_perm = tensors.get_tensor(TensorType::ACL_INT_2);
@@ -413,7 +448,7 @@
         _permute_output->run(pack);
     }
 
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         ITensorPack pack;
         pack.add_tensor(TensorType::ACL_SRC, dst);
@@ -424,7 +459,7 @@
 
 void CpuDepthwiseConv2d::CpuDepthwiseConv2dGeneric::prepare(ITensorPack &tensors)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         auto weights      = tensors.get_const_tensor(TensorType::ACL_SRC_1);
         auto weights_perm = tensors.get_tensor(TensorType::ACL_INT_1);
@@ -441,12 +476,17 @@
     }
 }
 
-void CpuDepthwiseConv2d::configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
+void CpuDepthwiseConv2d::configure(ITensorInfo           *src,
+                                   const ITensorInfo     *weights,
+                                   const ITensorInfo     *biases,
+                                   ITensorInfo           *dst,
+                                   const ConvolutionInfo &info)
 {
     ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info);
 
-    _depth_conv_func = get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info);
-    switch(_depth_conv_func)
+    _depth_conv_func =
+        get_depthwiseconvolution_function(src, weights, (biases != nullptr) ? biases : nullptr, dst, info);
+    switch (_depth_conv_func)
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
             _func_optimized.configure(src, weights, biases, dst, info);
@@ -459,10 +499,14 @@
     }
 }
 
-Status CpuDepthwiseConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2d::validate(const ITensorInfo     *src,
+                                    const ITensorInfo     *weights,
+                                    const ITensorInfo     *biases,
+                                    const ITensorInfo     *dst,
+                                    const ConvolutionInfo &info)
 {
     DepthwiseConvolutionFunction depth_conv_func = get_depthwiseconvolution_function(src, weights, biases, dst, info);
-    switch(depth_conv_func)
+    switch (depth_conv_func)
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
             return CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info);
@@ -475,10 +519,13 @@
     }
 }
 
-DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+DepthwiseConvolutionFunction CpuDepthwiseConv2d::get_depthwiseconvolution_function(const ITensorInfo     *src,
+                                                                                   const ITensorInfo     *weights,
+                                                                                   const ITensorInfo     *biases,
+                                                                                   const ITensorInfo     *dst,
                                                                                    const ConvolutionInfo &info)
 {
-    if(bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
+    if (bool(CpuDepthwiseConv2dOptimizedInternal::validate(src, weights, biases, dst, info)))
     {
         return DepthwiseConvolutionFunction::OPTIMIZED;
     }
@@ -490,7 +537,7 @@
 
 void CpuDepthwiseConv2d::run(ITensorPack &tensors)
 {
-    switch(_depth_conv_func)
+    switch (_depth_conv_func)
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
             _func_optimized.run(tensors);
@@ -505,7 +552,7 @@
 
 void CpuDepthwiseConv2d::prepare(ITensorPack &tensors)
 {
-    switch(_depth_conv_func)
+    switch (_depth_conv_func)
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
             _func_optimized.prepare(tensors);
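
Reviewer note (not part of the patch): the generic path above handles NCHW by permuting to NHWC around the native kernel, in with PermutationVector(2U, 0U, 1U) and back out with PermutationVector(1U, 2U, 0U). A minimal sketch of that round trip, assuming the convention (consistent with the shapes in validate() above) that output dimension i takes input dimension perm[i]:

    #include <array>
    #include <cstddef>

    using Shape3 = std::array<std::size_t, 3>;

    // Assumed convention: output dimension i takes input dimension perm[i].
    Shape3 permute(const Shape3 &in, const Shape3 &perm)
    {
        Shape3 out{};
        for (std::size_t i = 0; i < 3; ++i)
            out[i] = in[perm[i]];
        return out;
    }

    int main()
    {
        const Shape3 nchw = {224, 224, 3};            // ACL dim order for NCHW: [W, H, C]
        const Shape3 nhwc = permute(nchw, {2, 0, 1}); // -> [C, W, H], i.e. NHWC dim order
        const Shape3 back = permute(nhwc, {1, 2, 0}); // inverse permutation restores [W, H, C]
        return back == nchw ? 0 : 1;                  // exits 0: the round trip is lossless
    }
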
diff --git a/src/cpu/operators/CpuDepthwiseConv2d.h b/src/cpu/operators/CpuDepthwiseConv2d.h
index 3d8719e..7eaa0df 100644
--- a/src/cpu/operators/CpuDepthwiseConv2d.h
+++ b/src/cpu/operators/CpuDepthwiseConv2d.h
@@ -24,8 +24,9 @@
 #ifndef ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
 #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_H
 
-#include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/ITensorInfo.h"
+
 #include "src/cpu/ICpuKernel.h"
 #include "src/cpu/ICpuOperator.h"
 #include "src/cpu/kernels/CpuDepthwiseConv2dNativeKernel.h"
@@ -56,14 +57,22 @@
      *                         Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
      * @param[in]      info    Depthwise convolution meta-data.
      */
-    void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+    void configure(ITensorInfo           *src,
+                   const ITensorInfo     *weights,
+                   const ITensorInfo     *biases,
+                   ITensorInfo           *dst,
+                   const ConvolutionInfo &info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuDepthwiseConv2d::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+    static Status validate(const ITensorInfo     *src,
+                           const ITensorInfo     *weights,
+                           const ITensorInfo     *biases,
+                           const ITensorInfo     *dst,
+                           const ConvolutionInfo &info);
     /** Static function to choose the best depthwise convolution function for @ref CpuDepthwiseConv2d
      *
      * @param[in] src     Source tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32
@@ -76,7 +85,10 @@
      *
      * @return a Depthwise Convolution Function
      */
-    static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+    static DepthwiseConvolutionFunction get_depthwiseconvolution_function(const ITensorInfo     *src,
+                                                                          const ITensorInfo     *weights,
+                                                                          const ITensorInfo     *biases,
+                                                                          const ITensorInfo     *dst,
                                                                           const ConvolutionInfo &info);
 
     // Inherited methods overridden:
@@ -118,32 +130,40 @@
          * @param[out]     dst     Destination tensor info. Data type supported: same as @p src.
          * @param[in]      info    Depthwise convolution meta-data.
          */
-        void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+        void configure(ITensorInfo           *src,
+                       const ITensorInfo     *weights,
+                       const ITensorInfo     *biases,
+                       ITensorInfo           *dst,
+                       const ConvolutionInfo &info);
         /** Static function to check if given info will lead to a valid configuration
          *
          * Similar to CpuDepthwiseConv2dOptimizedInternal::configure()
          *
          * @return a status
          */
-        static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+        static Status validate(const ITensorInfo     *src,
+                               const ITensorInfo     *weights,
+                               const ITensorInfo     *biases,
+                               const ITensorInfo     *dst,
+                               const ConvolutionInfo &info);
 
         // Inherited methods overridden:
         void run(ITensorPack &tensors) override;
         void prepare(ITensorPack &tensors) override;
 
     private:
-        std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{ nullptr };
-        std::unique_ptr<CpuPermute>                         _permute_input{ nullptr };
-        std::unique_ptr<CpuPermute>                         _permute_weights{ nullptr };
-        std::unique_ptr<CpuPermute>                         _permute_output{ nullptr };
-        std::unique_ptr<CpuActivation>                      _activationlayer_function{ nullptr };
-        bool                                                _has_bias{ false };
-        bool                                                _is_quantized{ false };
-        bool                                                _is_nchw{ true };
-        bool                                                _permute{ false };
-        bool                                                _is_activationlayer_enabled{ false };
-        bool                                                _is_prepared{ false };
-        bool                                                _are_weights_const{ true };
+        std::unique_ptr<CpuDepthwiseConv2dAssemblyDispatch> _dwc_optimized_func{nullptr};
+        std::unique_ptr<CpuPermute>                         _permute_input{nullptr};
+        std::unique_ptr<CpuPermute>                         _permute_weights{nullptr};
+        std::unique_ptr<CpuPermute>                         _permute_output{nullptr};
+        std::unique_ptr<CpuActivation>                      _activationlayer_function{nullptr};
+        bool                                                _has_bias{false};
+        bool                                                _is_quantized{false};
+        bool                                                _is_nchw{true};
+        bool                                                _permute{false};
+        bool                                                _is_activationlayer_enabled{false};
+        bool                                                _is_prepared{false};
+        bool                                                _are_weights_const{true};
     };
 
     /** Basic function to execute a generic depthwise convolution. This function calls the following kernel:
@@ -176,7 +196,11 @@
          *                         Data type supported: Same as @p src, S32 when src is QASYMM8/QASYMM8_SIGNED.
          * @param[in]      info    Depthwise convolution meta-data.
          */
-        void configure(ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info);
+        void configure(ITensorInfo           *src,
+                       const ITensorInfo     *weights,
+                       const ITensorInfo     *biases,
+                       ITensorInfo           *dst,
+                       const ConvolutionInfo &info);
 
         /** Static function to check if given info will lead to a valid configuration
          *
@@ -184,24 +208,28 @@
          *
          * @return a status
          */
-        static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info);
+        static Status validate(const ITensorInfo     *src,
+                               const ITensorInfo     *weights,
+                               const ITensorInfo     *biases,
+                               const ITensorInfo     *dst,
+                               const ConvolutionInfo &info);
 
         // Inherited methods overridden:
         void run(ITensorPack &tensors) override;
         void prepare(ITensorPack &tensors) override;
 
     private:
-        std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{ nullptr };
-        std::unique_ptr<CpuPermute>                              _permute_input{ nullptr };
-        std::unique_ptr<CpuPermute>                              _permute_weights{ nullptr };
-        std::unique_ptr<CpuPermute>                              _permute_output{ nullptr };
-        std::unique_ptr<CpuActivation>                           _activationlayer_function{ nullptr };
-        bool                                                     _is_nchw{ true };
-        bool                                                     _is_prepared{ false };
-        bool                                                     _is_activationlayer_enabled{ false };
+        std::unique_ptr<kernels::CpuDepthwiseConv2dNativeKernel> _depthwise_conv_kernel{nullptr};
+        std::unique_ptr<CpuPermute>                              _permute_input{nullptr};
+        std::unique_ptr<CpuPermute>                              _permute_weights{nullptr};
+        std::unique_ptr<CpuPermute>                              _permute_output{nullptr};
+        std::unique_ptr<CpuActivation>                           _activationlayer_function{nullptr};
+        bool                                                     _is_nchw{true};
+        bool                                                     _is_prepared{false};
+        bool                                                     _is_activationlayer_enabled{false};
     };
 
-    DepthwiseConvolutionFunction        _depth_conv_func{ DepthwiseConvolutionFunction::GENERIC };
+    DepthwiseConvolutionFunction        _depth_conv_func{DepthwiseConvolutionFunction::GENERIC};
     CpuDepthwiseConv2dOptimizedInternal _func_optimized{};
     CpuDepthwiseConv2dGeneric           _func_generic{};
 };
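
Reviewer note (not part of the patch): both inner classes above are stateless with respect to tensors; run() and prepare() receive everything through an ITensorPack keyed by slots such as ACL_SRC_0..2, ACL_INT_0..4 and ACL_DST. A toy model of that convention, with made-up names:

    #include <unordered_map>

    enum Slot { SRC_0, SRC_1, SRC_2, INT_0, INT_1, DST }; // stand-ins for TensorType::ACL_*

    struct Tensor; // opaque here; only pointers are stored

    // Minimal stand-in for ITensorPack: callers bind tensors to well-known
    // slots, and operators look them up by slot instead of holding state.
    class Pack
    {
    public:
        void    add_tensor(Slot s, Tensor *t) { _slots[s] = t; }
        Tensor *get_tensor(Slot s) const
        {
            const auto it = _slots.find(s);
            return it == _slots.end() ? nullptr : it->second;
        }

    private:
        std::unordered_map<int, Tensor *> _slots;
    };
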
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
index d078155..8d3741d 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.cpp
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -38,15 +39,14 @@
 {
 struct CpuDepthwiseConv2dAssemblyDispatch::LocalImpl
 {
-    std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{ nullptr };
-    bool                                                              is_prepared{ false };
-    bool                                                              are_weights_const{ true };
+    std::unique_ptr<kernels::CpuDepthwiseConv2dAssemblyWrapperKernel> asm_kernel{nullptr};
+    bool                                                              is_prepared{false};
+    bool                                                              are_weights_const{true};
     experimental::MemoryRequirements                                  mem_req{};
 };
 
 #ifndef DOXYGEN_SKIP_THIS
-CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch()
-    : _pImpl(std::make_unique<LocalImpl>())
+CpuDepthwiseConv2dAssemblyDispatch::CpuDepthwiseConv2dAssemblyDispatch() : _pImpl(std::make_unique<LocalImpl>())
 {
 }
 #endif /* DOXYGEN_SKIP_THIS */
@@ -66,7 +66,7 @@
     _pImpl->are_weights_const      = weights->are_values_constant();
 
     // If we don't support the combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
-    if(!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
+    if (!CpuDepthwiseConv2dAssemblyDispatch::validate(src, weights, bias, dst, info))
     {
         return;
     }
@@ -77,12 +77,16 @@
 
     // Compute memory requirements for assembly kernels
     constexpr size_t alignment = 4096;
-    _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment });
-    _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment });
+    _pImpl->mem_req.push_back({TensorType::ACL_INT_0, dwc_wrapper->get_working_size(num_threads), alignment});
+    _pImpl->mem_req.push_back({TensorType::ACL_INT_1, dwc_wrapper->get_storage_size(), alignment});
     _pImpl->asm_kernel = std::move(dwc_wrapper);
 }
 
-Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info)
+Status CpuDepthwiseConv2dAssemblyDispatch::validate(const ITensorInfo     *src,
+                                                    const ITensorInfo     *weights,
+                                                    const ITensorInfo     *bias,
+                                                    const ITensorInfo     *dst,
+                                                    const ConvolutionInfo &info)
 {
     return kernels::CpuDepthwiseConv2dAssemblyWrapperKernel::validate(src, weights, bias, dst, info);
 }
@@ -111,7 +115,7 @@
 {
     const ITensor *weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
 
-    if((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared)
+    if ((!_pImpl->are_weights_const && weights != nullptr) || !_pImpl->is_prepared)
     {
         // Pack weights and bias
         const ITensor *bias    = tensors.get_const_tensor(TensorType::ACL_SRC_2);
@@ -125,11 +129,12 @@
         const auto weights_padding = weights->info()->padding();
 
         const size_t ld_weights_col = weights_shape[0] + weights_padding.left + weights_padding.right;
-        const size_t ld_weights_row = ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom);
+        const size_t ld_weights_row =
+            ld_weights_col * (weights_shape[1] + weights_padding.top + weights_padding.bottom);
         _pImpl->asm_kernel->pack_parameters(parameters_ptr, bias_ptr, weights_ptr, ld_weights_col, ld_weights_row);
 
         weights->mark_as_unused();
-        if(bias != nullptr)
+        if (bias != nullptr)
         {
             bias->mark_as_unused();
         }
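
Reviewer note (not part of the patch): two details in this file are worth spelling out. The workspace buffers are registered with a 4096-byte (page) alignment for the assembly kernels, and the weight-packing strides must include any allocation padding around the weight tensor. The stride arithmetic from prepare(), restated with hypothetical names:

    #include <cstddef>

    struct Padding { std::size_t left, right, top, bottom; };

    // Leading dimensions handed to pack_parameters(): distances are in
    // elements and include the tensor's allocation padding.
    std::size_t ld_col(std::size_t weights_w, const Padding &p)
    {
        return weights_w + p.left + p.right; // step from one row to the next
    }

    std::size_t ld_row(std::size_t weights_w, std::size_t weights_h, const Padding &p)
    {
        return ld_col(weights_w, p) * (weights_h + p.top + p.bottom); // step between 2D planes
    }
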
diff --git a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
index f222ab9..f181662 100644
--- a/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
+++ b/src/cpu/operators/CpuDepthwiseConv2dAssemblyDispatch.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_DEPTHWISE_CONV2D_ASSEMBLY_DISPATCH_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuOperator.h"
 
@@ -53,14 +54,22 @@
      * @param[out] dst     Destination tensor info. Data type supported: same as @p src.
      * @param[in]  info    Depthwise convolution meta-data.
      */
-    void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const ConvolutionInfo &info);
+    void configure(const ITensorInfo     *src,
+                   const ITensorInfo     *weights,
+                   const ITensorInfo     *bias,
+                   ITensorInfo           *dst,
+                   const ConvolutionInfo &info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuDepthwiseConv2dAssemblyDispatch::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const ConvolutionInfo &info);
+    static Status validate(const ITensorInfo     *src,
+                           const ITensorInfo     *weights,
+                           const ITensorInfo     *bias,
+                           const ITensorInfo     *dst,
+                           const ConvolutionInfo &info);
     /** Checks if activation is supported by the assembly kernels
      *
      * @param[in] activation Activation to check
@@ -70,8 +79,8 @@
     static bool is_activation_supported(const ActivationLayerInfo &activation);
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
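
Reviewer note (not part of the patch): workspace() lets the operator report its scratch needs up front instead of allocating internally. Judging from the aggregate initializers in the .cpp above, each entry carries a slot, a size and an alignment; a sketch of that shape with assumed field names:

    #include <cstddef>
    #include <vector>

    // Assumed shape of one workspace entry, mirroring the
    // {TensorType::ACL_INT_n, size, alignment} initializers above.
    struct MemoryInfo
    {
        int         slot;      // which ACL_INT_* slot the scratch buffer binds to
        std::size_t size;      // bytes required
        std::size_t alignment; // e.g. 4096 for the assembly kernels
    };

    using MemoryRequirements = std::vector<MemoryInfo>;
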
diff --git a/src/cpu/operators/CpuDequantize.cpp b/src/cpu/operators/CpuDequantize.cpp
index 12dc136..c05a23f 100644
--- a/src/cpu/operators/CpuDequantize.cpp
+++ b/src/cpu/operators/CpuDequantize.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/kernels/CpuDequantizeKernel.h"
 
diff --git a/src/cpu/operators/CpuDirectConv2d.cpp b/src/cpu/operators/CpuDirectConv2d.cpp
index 9cdbdb6..135a3bb 100644
--- a/src/cpu/operators/CpuDirectConv2d.cpp
+++ b/src/cpu/operators/CpuDirectConv2d.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 
 namespace arm_compute
@@ -36,12 +37,25 @@
 CpuDirectConv2d::~CpuDirectConv2d() = default;
 
 CpuDirectConv2d::CpuDirectConv2d(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _output_stage_kernel(), _conv_kernel(), _input_border_handler(), _activationlayer_function(), _accumulator(), _has_bias(false),
-      _is_activationlayer_enabled(false), _dim_split(Window::DimZ), _is_padding_required()
+    : _memory_group(std::move(memory_manager)),
+      _output_stage_kernel(),
+      _conv_kernel(),
+      _input_border_handler(),
+      _activationlayer_function(),
+      _accumulator(),
+      _has_bias(false),
+      _is_activationlayer_enabled(false),
+      _dim_split(Window::DimZ),
+      _is_padding_required()
 {
 }
 
-void CpuDirectConv2d::configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CpuDirectConv2d::configure(ITensorInfo               *src,
+                                ITensorInfo               *weights,
+                                const ITensorInfo         *bias,
+                                ITensorInfo               *dst,
+                                const PadStrideInfo       &conv_info,
+                                const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
     ARM_COMPUTE_LOG_PARAMS(src, weights, bias, dst, conv_info, act_info);
@@ -51,7 +65,7 @@
     _input_border_handler = std::make_unique<NEFillBorderKernel>();
 
     // Free accumulator
-    if(_accumulator.buffer() != nullptr)
+    if (_accumulator.buffer() != nullptr)
     {
         _accumulator.allocator()->free();
     }
@@ -62,28 +76,33 @@
     _has_bias = (bias != nullptr);
 
     _conv_kernel->configure(src, weights, dst, conv_info);
-    if(_has_bias)
+    if (_has_bias)
     {
         _output_stage_kernel->configure(dst, bias);
     }
     _is_padding_required = !_conv_kernel->border_size().empty();
 
-    if(_is_padding_required)
+    if (_is_padding_required)
     {
         // Add zero padding XY
-        _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT, PixelValue(static_cast<float>(0.f)));
+        _input_border_handler->configure(src, _conv_kernel->border_size(), BorderMode::CONSTANT,
+                                         PixelValue(static_cast<float>(0.f)));
     }
 
     // Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activationlayer_function = std::make_unique<CpuActivation>();
         _activationlayer_function->configure(dst, dst, act_info);
     }
 }
 
-Status CpuDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+Status CpuDirectConv2d::validate(const ITensorInfo         *src,
+                                 const ITensorInfo         *weights,
+                                 const ITensorInfo         *bias,
+                                 const ITensorInfo         *dst,
+                                 const PadStrideInfo       &conv_info,
                                  const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
@@ -95,7 +114,7 @@
     // Validate Convolution kernel
     ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dKernel::validate(src, weights, &accumulator, conv_info));
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, bias);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != weights->dimension(3),
@@ -106,7 +125,7 @@
     // Validate bias kernel
     ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv2dOutputStageKernel::validate(&accumulator, bias, dst));
 
-    if(act_info.enabled())
+    if (act_info.enabled())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, act_info));
     }
@@ -122,14 +141,15 @@
     auto bias = tensors.get_const_tensor(TensorType::ACL_SRC_2);
     auto dst  = tensors.get_tensor(TensorType::ACL_DST);
 
-    if(_is_padding_required)
+    if (_is_padding_required)
     {
         ITensorPack pack;
         pack.add_tensor(TensorType::ACL_SRC_DST, src);
-        NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(), pack);
+        NEScheduler::get().schedule_op(_input_border_handler.get(), Window::DimZ, _input_border_handler->window(),
+                                       pack);
     }
     NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
-    if(_has_bias)
+    if (_has_bias)
     {
         ITensorPack pack;
         pack.add_tensor(TensorType::ACL_SRC_0, dst);
@@ -138,7 +158,7 @@
         NEScheduler::get().schedule_op(_output_stage_kernel.get(), Window::DimY, _output_stage_kernel->window(), pack);
     }
 
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         ITensorPack pack;
         pack.add_tensor(TensorType::ACL_SRC, dst);
diff --git a/src/cpu/operators/CpuDirectConv2d.h b/src/cpu/operators/CpuDirectConv2d.h
index fa8d61e..73c85f2 100644
--- a/src/cpu/operators/CpuDirectConv2d.h
+++ b/src/cpu/operators/CpuDirectConv2d.h
@@ -24,13 +24,14 @@
 #ifndef ARM_COMPUTE_CPU_DIRECTCONV2D_H
 #define ARM_COMPUTE_CPU_DIRECTCONV2D_H
 
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "src/cpu/ICpuKernel.h"
 #include "src/cpu/ICpuOperator.h"
@@ -75,14 +76,23 @@
      * @param[in]      conv_info Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in]      act_info  (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(ITensorInfo *src, ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *dst, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(ITensorInfo               *src,
+                   ITensorInfo               *weights,
+                   const ITensorInfo         *bias,
+                   ITensorInfo               *dst,
+                   const PadStrideInfo       &conv_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuDirectConv2d::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *bias,
+                           const ITensorInfo         *dst,
+                           const PadStrideInfo       &conv_info,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
@@ -95,10 +105,10 @@
     std::unique_ptr<NEFillBorderKernel>                        _input_border_handler;
     std::unique_ptr<CpuActivation>                             _activationlayer_function;
     Tensor                                                     _accumulator;
-    bool                                                       _has_bias{ false };
-    bool                                                       _is_activationlayer_enabled{ false };
-    unsigned int                                               _dim_split{ 0 };
-    bool                                                       _is_padding_required{ false };
+    bool                                                       _has_bias{false};
+    bool                                                       _is_activationlayer_enabled{false};
+    unsigned int                                               _dim_split{0};
+    bool                                                       _is_padding_required{false};
 };
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuDirectConv3d.cpp b/src/cpu/operators/CpuDirectConv3d.cpp
index aa74e42..626f1c6 100644
--- a/src/cpu/operators/CpuDirectConv3d.cpp
+++ b/src/cpu/operators/CpuDirectConv3d.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 
 namespace arm_compute
@@ -36,11 +37,17 @@
 CpuDirectConv3d::~CpuDirectConv3d() = default;
 
 CpuDirectConv3d::CpuDirectConv3d(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _conv_kernel(), _activationlayer_function(), _accumulator(), _is_activationlayer_enabled(false), _dim_split(Window::DimZ)
+    : _memory_group(std::move(memory_manager)),
+      _conv_kernel(),
+      _activationlayer_function(),
+      _accumulator(),
+      _is_activationlayer_enabled(false),
+      _dim_split(Window::DimZ)
 {
 }
 
-void CpuDirectConv3d::configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info)
+void CpuDirectConv3d::configure(
+    ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info)
 {
     ARM_COMPUTE_LOG_PARAMS(src0, src1, src2, dst, conv_info);
     ARM_COMPUTE_ERROR_ON(src0->data_layout() != DataLayout::NDHWC);
@@ -48,7 +55,7 @@
     _conv_kernel = std::make_unique<kernels::CpuDirectConv3dKernel>();
 
     // Free accumulator
-    if(_accumulator.buffer() != nullptr)
+    if (_accumulator.buffer() != nullptr)
     {
         _accumulator.allocator()->free();
     }
@@ -59,21 +66,25 @@
 
     // Configure Activation Layer
     _is_activationlayer_enabled = conv_info.act_info.enabled();
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activationlayer_function = std::make_unique<CpuActivation>();
         _activationlayer_function->configure(dst, dst, conv_info.act_info);
     }
 }
 
-Status CpuDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info)
+Status CpuDirectConv3d::validate(const ITensorInfo *src0,
+                                 const ITensorInfo *src1,
+                                 const ITensorInfo *src2,
+                                 const ITensorInfo *dst,
+                                 const Conv3dInfo   conv_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
 
     // Validate Convolution kernel
     ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuDirectConv3dKernel::validate(src0, src1, src2, dst, conv_info));
 
-    if(conv_info.act_info.enabled())
+    if (conv_info.act_info.enabled())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(dst, nullptr, conv_info.act_info));
     }
@@ -89,7 +100,7 @@
 
     NEScheduler::get().schedule_op(_conv_kernel.get(), _dim_split, _conv_kernel->window(), tensors);
 
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         ITensorPack pack;
         pack.add_tensor(TensorType::ACL_SRC, dst);
@@ -98,4 +109,4 @@
     }
 }
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuDirectConv3d.h b/src/cpu/operators/CpuDirectConv3d.h
index cde01f0..3ad1e09 100644
--- a/src/cpu/operators/CpuDirectConv3d.h
+++ b/src/cpu/operators/CpuDirectConv3d.h
@@ -24,14 +24,15 @@
 #ifndef ARM_COMPUTE_CPU_DIRECTCONV3D_H
 #define ARM_COMPUTE_CPU_DIRECTCONV3D_H
 
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
 #include "arm_compute/runtime/IMemoryManager.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "src/cpu/ICpuKernel.h"
 #include "src/cpu/ICpuOperator.h"
@@ -76,14 +77,19 @@
      *                           The 1st dimension must be equal to the 1st dimension of the @p kernels tensor.
      * @param[in]      conv_info Contains padding, stride, activation information.
      */
-    void configure(ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info);
+    void configure(
+        ITensorInfo *src0, ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo conv_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuDirectConv3d::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo conv_info);
+    static Status validate(const ITensorInfo *src0,
+                           const ITensorInfo *src1,
+                           const ITensorInfo *src2,
+                           const ITensorInfo *dst,
+                           const Conv3dInfo   conv_info);
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
@@ -93,8 +99,8 @@
     std::unique_ptr<kernels::CpuDirectConv3dKernel> _conv_kernel;
     std::unique_ptr<CpuActivation>                  _activationlayer_function;
     Tensor                                          _accumulator;
-    bool                                            _is_activationlayer_enabled{ false };
-    unsigned int                                    _dim_split{ 0 };
+    bool                                            _is_activationlayer_enabled{false};
+    unsigned int                                    _dim_split{0};
 };
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuElementwise.cpp b/src/cpu/operators/CpuElementwise.cpp
index b88ae3e..c2ae877 100644
--- a/src/cpu/operators/CpuElementwise.cpp
+++ b/src/cpu/operators/CpuElementwise.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "src/cpu/operators/CpuElementwise.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/CpuElementwiseKernel.h"
@@ -33,7 +34,7 @@
 void CpuElementwiseBase::run(ITensorPack &tensors)
 {
     // If the kernel has been configured, use the window from the kernel.
-    if(_kernel->is_window_configured())
+    if (_kernel->is_window_configured())
     {
         ICpuOperator::run(tensors);
         return;
@@ -101,12 +102,16 @@
 }
 
 template <ComparisonOperation COP>
-Status CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
+Status
+CpuElementwiseComparisonStatic<COP>::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst)
 {
     return kernels::CpuComparisonKernel::validate(COP, src0, src1, dst);
 }
 
-void CpuElementwiseComparison::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ComparisonOperation op)
+void CpuElementwiseComparison::configure(const ITensorInfo  *src0,
+                                         const ITensorInfo  *src1,
+                                         ITensorInfo        *dst,
+                                         ComparisonOperation op)
 {
     ARM_COMPUTE_LOG_PARAMS(src0, src1, dst);
     auto k = std::make_unique<kernels::CpuComparisonKernel>();
@@ -114,7 +119,10 @@
     _kernel = std::move(k);
 }
 
-Status CpuElementwiseComparison::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op)
+Status CpuElementwiseComparison::validate(const ITensorInfo  *src0,
+                                          const ITensorInfo  *src1,
+                                          const ITensorInfo  *dst,
+                                          ComparisonOperation op)
 {
     return kernels::CpuComparisonKernel::validate(op, src0, src1, dst);
 }
@@ -127,4 +135,4 @@
 template class CpuElementwiseComparisonStatic<ComparisonOperation::Less>;
 template class CpuElementwiseComparisonStatic<ComparisonOperation::LessEqual>;
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuElementwise.h b/src/cpu/operators/CpuElementwise.h
index b6c61cf..5db53c8 100644
--- a/src/cpu/operators/CpuElementwise.h
+++ b/src/cpu/operators/CpuElementwise.h
@@ -139,7 +139,8 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
+    static Status
+    validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ComparisonOperation op);
 };
 
 /** Basic function to run @ref cpu::kernels::CpuComparisonKernel
@@ -182,4 +183,4 @@
 } // namespace cpu
 } // namespace arm_compute
 
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_H */
diff --git a/src/cpu/operators/CpuElementwiseUnary.cpp b/src/cpu/operators/CpuElementwiseUnary.cpp
index 7fd14db..04ab7bf 100644
--- a/src/cpu/operators/CpuElementwiseUnary.cpp
+++ b/src/cpu/operators/CpuElementwiseUnary.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "src/cpu/operators/CpuElementwiseUnary.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/cpu/kernels/CpuElementwiseUnaryKernel.h"
@@ -47,7 +48,7 @@
 
 void CpuElementwiseUnary::run(ITensorPack &tensors)
 {
-    if(_kernel->is_window_configured())
+    if (_kernel->is_window_configured())
     {
         ICpuOperator::run(tensors);
         return;
@@ -57,4 +58,4 @@
     ICpuOperator::run(tensors, compute_output_shape_and_window(src_info->tensor_shape()).second);
 }
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuElementwiseUnary.h b/src/cpu/operators/CpuElementwiseUnary.h
index 5e8e98d..1e51bfa 100644
--- a/src/cpu/operators/CpuElementwiseUnary.h
+++ b/src/cpu/operators/CpuElementwiseUnary.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/cpu/ICpuOperator.h"
 
 namespace arm_compute
@@ -56,4 +57,4 @@
 } // namespace cpu
 } // namespace arm_compute
 
-#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_ELEMENTWISE_UNARY_H */
diff --git a/src/cpu/operators/CpuFill.cpp b/src/cpu/operators/CpuFill.cpp
index 3d8f62f..1890d0b 100644
--- a/src/cpu/operators/CpuFill.cpp
+++ b/src/cpu/operators/CpuFill.cpp
@@ -23,9 +23,8 @@
  */
 #include "src/cpu/operators/CpuFill.h"
 
-#include "src/cpu/kernels/CpuFillKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuFillKernel.h"
 
 namespace arm_compute
 {
diff --git a/src/cpu/operators/CpuFill.h b/src/cpu/operators/CpuFill.h
index 41d9a9f..cb83745 100644
--- a/src/cpu/operators/CpuFill.h
+++ b/src/cpu/operators/CpuFill.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_FILL_H
 
 #include "arm_compute/core/PixelValue.h"
+
 #include "src/cpu/ICpuOperator.h"
 
 namespace arm_compute
diff --git a/src/cpu/operators/CpuFlatten.cpp b/src/cpu/operators/CpuFlatten.cpp
index 7bab9e4..2609d44 100644
--- a/src/cpu/operators/CpuFlatten.cpp
+++ b/src/cpu/operators/CpuFlatten.cpp
@@ -23,16 +23,14 @@
  */
 #include "src/cpu/operators/CpuFlatten.h"
 
-#include "src/cpu/operators/CpuReshape.h"
-
 #include "src/common/utils/Log.h"
+#include "src/cpu/operators/CpuReshape.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-CpuFlatten::CpuFlatten()
-    : _reshape(nullptr)
+CpuFlatten::CpuFlatten() : _reshape(nullptr)
 {
 }
 
diff --git a/src/cpu/operators/CpuFloor.cpp b/src/cpu/operators/CpuFloor.cpp
index 868add7..a107393 100644
--- a/src/cpu/operators/CpuFloor.cpp
+++ b/src/cpu/operators/CpuFloor.cpp
@@ -23,9 +23,8 @@
  */
 #include "src/cpu/operators/CpuFloor.h"
 
-#include "src/cpu/kernels/CpuFloorKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuFloorKernel.h"
 
 namespace arm_compute
 {
diff --git a/src/cpu/operators/CpuFullyConnected.cpp b/src/cpu/operators/CpuFullyConnected.cpp
index 395d8d2..85a0b03 100644
--- a/src/cpu/operators/CpuFullyConnected.cpp
+++ b/src/cpu/operators/CpuFullyConnected.cpp
@@ -25,10 +25,11 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensorPack.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/MemoryHelpers.h"
@@ -49,8 +50,11 @@
 
 namespace
 {
-Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
-                                      GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+Status get_gemmlowp_output_stage_info(const ITensorInfo         *src,
+                                      const ITensorInfo         *weights,
+                                      const ITensorInfo         *dst,
+                                      const ActivationLayerInfo &act,
+                                      GEMMLowpOutputStageInfo   &gemmlowp_output_stage_info)
 {
     const auto                    data_type = src->data_type();
     const QuantizationInfo        oq_info   = dst->quantization_info();
@@ -62,10 +66,11 @@
     int32_t output_multiplier;
     int32_t output_shift;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
 
-    int32_t type_min = 0;
-    int32_t type_max = 0;
+    int32_t type_min             = 0;
+    int32_t type_max             = 0;
     std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
 
     gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
@@ -78,14 +83,22 @@
     return Status{};
 }
 
-Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act, bool enable_fast_math, WeightFormat weight_format)
+Status validate_mm(const ITensorInfo         *src,
+                   const ITensorInfo         *weights,
+                   const ITensorInfo         *biases,
+                   const ITensorInfo         *dst,
+                   const ActivationLayerInfo &act,
+                   bool                       enable_fast_math,
+                   WeightFormat               weight_format)
 {
-    if(is_data_type_quantized_asymmetric(src->data_type()))
+    if (is_data_type_quantized_asymmetric(src->data_type()))
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
         // Extract and negate src and weights offset
-        const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
-        const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
+        const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale,
+                                                     -src->quantization_info().uniform().offset);
+        const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale,
+                                                         -weights->quantization_info().uniform().offset);
 
         GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
         ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(src, weights, dst, act, gemmlowp_output_stage_info));
@@ -97,11 +110,8 @@
         // Validate gemmlowp function
         TensorInfo src_info     = src->clone()->set_quantization_info(src_quantization_info);
         TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
-        ARM_COMPUTE_RETURN_ON_ERROR(CpuGemmLowpMatrixMultiplyCore::validate(&src_info,
-                                                                            &weights_info,
-                                                                            biases,
-                                                                            dst,
-                                                                            gemm_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CpuGemmLowpMatrixMultiplyCore::validate(&src_info, &weights_info, biases, dst, gemm_info));
     }
     else
     {
@@ -142,21 +152,28 @@
 
 CpuFullyConnected::~CpuFullyConnected() = default;
 
-void CpuFullyConnected::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+void CpuFullyConnected::configure_mm(const ITensorInfo         *src,
+                                     const ITensorInfo         *weights,
+                                     const ITensorInfo         *biases,
+                                     ITensorInfo               *dst,
+                                     const ActivationLayerInfo &act)
 {
-    if(_is_quantized_asymmetric)
+    if (_is_quantized_asymmetric)
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
         // Extract and negate src and weights offset
-        const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale, -src->quantization_info().uniform().offset);
-        const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale, -weights->quantization_info().uniform().offset);
+        const QuantizationInfo src_quantization_info(src->quantization_info().uniform().scale,
+                                                     -src->quantization_info().uniform().offset);
+        const QuantizationInfo weights_quantization_info(weights->quantization_info().uniform().scale,
+                                                         -weights->quantization_info().uniform().offset);
 
         TensorInfo src_info     = src->clone()->set_quantization_info(src_quantization_info);
         TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
 
         // Configure gemmlowp function and output stage for asymmetric quantized types
         GEMMLowpOutputStageInfo gemmlowp_output_stage_info;
-        const Status            status = get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info);
+        const Status            status =
+            get_gemmlowp_output_stage_info(&src_info, &weights_info, dst, act, gemmlowp_output_stage_info);
         ARM_COMPUTE_ERROR_ON(status.error_code() != ErrorCode::OK);
 
         GEMMInfo gemm_info;
@@ -179,7 +196,11 @@
     }
 }
 
-void CpuFullyConnected::configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+void CpuFullyConnected::configure_conv_fc(const ITensorInfo         *src,
+                                          const ITensorInfo         *weights,
+                                          const ITensorInfo         *biases,
+                                          ITensorInfo               *dst,
+                                          const ActivationLayerInfo &act)
 {
     ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
 
@@ -195,7 +216,11 @@
     configure_mm(&_flattened_src, weights, biases, dst, act);
 }
 
-void CpuFullyConnected::configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act)
+void CpuFullyConnected::configure_fc_fc(const ITensorInfo         *src,
+                                        const ITensorInfo         *weights,
+                                        const ITensorInfo         *biases,
+                                        ITensorInfo               *dst,
+                                        const ActivationLayerInfo &act)
 {
     ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1));
 
@@ -203,17 +228,17 @@
     configure_mm(src, weights, biases, dst, act);
 }
 
-void CpuFullyConnected::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
-                                  FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info)
+void CpuFullyConnected::configure(const ITensorInfo      *src,
+                                  const ITensorInfo      *weights,
+                                  const ITensorInfo      *biases,
+                                  ITensorInfo            *dst,
+                                  FullyConnectedLayerInfo fc_info,
+                                  const WeightsInfo      &weights_info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuFullyConnected::validate(src,
-                                                           weights,
-                                                           biases != nullptr ? biases : nullptr,
-                                                           dst,
-                                                           fc_info,
-                                                           weights_info));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        CpuFullyConnected::validate(src, weights, biases != nullptr ? biases : nullptr, dst, fc_info, weights_info));
     ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, fc_info);
 
     _needs_weights_conversion = false;
@@ -238,9 +263,11 @@
 
     // Check if we have a fully connected layer with batches
     const bool is_batched_fc_layer = dst->dimension(1) > 1;
-    if(is_batched_fc_layer)
+    if (is_batched_fc_layer)
     {
-        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1));
+        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                            (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+                                        dst->tensor_shape().cbegin() + 1));
     }
     else
     {
@@ -248,7 +275,7 @@
     }
 
     // Reshape weights if needed
-    if(_needs_weights_reshape)
+    if (_needs_weights_reshape)
     {
         // Reshape the weights
         _transpose_weights = std::make_unique<kernels::CpuTransposeKernel>();
@@ -260,13 +287,11 @@
     }
 
     // Convert weights if needed
-    if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+    if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
     {
         // Convert weights
         _convert_weights = std::make_unique<CpuConvertFullyConnectedWeights>();
-        _convert_weights->configure(weights_to_use,
-                                    &_converted_weights,
-                                    src->tensor_shape(),
+        _convert_weights->configure(weights_to_use, &_converted_weights, src->tensor_shape(),
                                     fc_info.weights_trained_layout);
         _converted_weights.set_are_values_constant(weights_to_use->are_values_constant());
 
@@ -275,7 +300,7 @@
         _trans_weights_idx        = AuxTensorIdx::ConvertedWeights;
     }
 
-    if(_is_fc_after_conv)
+    if (_is_fc_after_conv)
     {
         // Fully Connected layer after a Convolution Layer without batches
         configure_conv_fc(src, weights_to_use, biases, dst, fc_info.activation_info);
@@ -287,54 +312,57 @@
     }
 
     // Retain the tensorinfo with the weights to use
-    if(_needs_weights_reshape || _needs_weights_conversion)
+    if (_needs_weights_reshape || _needs_weights_conversion)
     {
         _trans_weights = *weights_to_use;
     }
 
     // Set auxiliary memory requirements
     auto gemm_mem_req = (_is_quantized_asymmetric) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
-    for(unsigned int i = 0; i < gemm_mem_req.size(); ++i)
+    for (unsigned int i = 0; i < gemm_mem_req.size(); ++i)
     {
         _aux_mem[i] = gemm_mem_req[i];
     }
 
-    if(_aux_mem[Pretranspose].size > 0)
+    if (_aux_mem[Pretranspose].size > 0)
     {
         // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
         // Do not release them if biases are dynamic and data type is quantized, since the weights tensor will be used for biases offset calculation
         // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time.
         _aux_mem[TransposedWeights] = MemoryInfo(
             offset_int_vec(TransposedWeights),
-            _dynamic_weights                                                         ? MemoryLifetime::Temporary  :
-            (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent :
-                                                                                       MemoryLifetime::Prepare,
+            _dynamic_weights                                                           ? MemoryLifetime::Temporary
+            : (_is_quantized_asymmetric && biases && !(biases->are_values_constant())) ? MemoryLifetime::Persistent
+                                                                                       : MemoryLifetime::Prepare,
             _reshaped_weights.total_size());
 
-        _aux_mem[ConvertedWeights] = MemoryInfo(
-            offset_int_vec(ConvertedWeights),
-            _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
-            _converted_weights.total_size());
+        _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights),
+                                                _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+                                                _converted_weights.total_size());
     }
     else
     {
-        _aux_mem[TransposedWeights] = MemoryInfo(
-            offset_int_vec(TransposedWeights),
-            _dynamic_weights          ? MemoryLifetime::Temporary :
-            _needs_weights_conversion ? MemoryLifetime::Prepare   :
-                                        MemoryLifetime::Persistent,
-            _reshaped_weights.total_size());
+        _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights),
+                                                 _dynamic_weights            ? MemoryLifetime::Temporary
+                                                 : _needs_weights_conversion ? MemoryLifetime::Prepare
+                                                                             : MemoryLifetime::Persistent,
+                                                 _reshaped_weights.total_size());
 
         _aux_mem[ConvertedWeights] = MemoryInfo(
-            offset_int_vec(ConvertedWeights),
-            _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent,
+            offset_int_vec(ConvertedWeights), _dynamic_weights ? MemoryLifetime::Temporary : MemoryLifetime::Persistent,
             _converted_weights.total_size());
     }
-    _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
+    _aux_mem[FlattenedSrc] =
+        MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
 }
 
-Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights,
-                                       const ITensorInfo *biases, const ITensorInfo *dst, FullyConnectedLayerInfo fc_info, WeightsInfo weights_info)
+Status CpuFullyConnected::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+                                       const ITensorInfo         *src,
+                                       const ITensorInfo         *weights,
+                                       const ITensorInfo         *biases,
+                                       const ITensorInfo         *dst,
+                                       FullyConnectedLayerInfo    fc_info,
+                                       WeightsInfo                weights_info)
 {
     GEMMInfo gemm_info;
     gemm_info.set_activation_info(fc_info.activation_info);
@@ -345,12 +373,17 @@
     return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
 }
 
-Status CpuFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                   FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info)
+Status CpuFullyConnected::validate(const ITensorInfo      *src,
+                                   const ITensorInfo      *weights,
+                                   const ITensorInfo      *biases,
+                                   const ITensorInfo      *dst,
+                                   FullyConnectedLayerInfo fc_info,
+                                   const WeightsInfo      &weights_info)
 {
     ARM_COMPUTE_UNUSED(fc_info.retain_internal_weights);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
 
     if (is_fixed_format_fast_math(weights_info.weight_format()))
     {
@@ -364,15 +397,22 @@
     }
 
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
-                                && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) &&
+        fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU &&
+        fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+        fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
 
     bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true;
     bool is_fc_after_conv = true;
 
-    const ITensorInfo &flatten_src       = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)));
-    const ITensorInfo &reshaped_weights  = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
-    const ITensorInfo &converted_weights = weights_reshaped ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone());
+    const ITensorInfo &flatten_src =
+        TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)));
+    const ITensorInfo &reshaped_weights = TensorInfo(
+        weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+    const ITensorInfo &converted_weights = weights_reshaped
+                                               ? TensorInfo(weights->clone()->set_is_resizable(true).reset_padding())
+                                               : TensorInfo(*reshaped_weights.clone());
 
     // With the Fully Connected layer we can have 4 different cases:
     //  1) Convolution layer -> Fully Connected layer without batches
@@ -386,10 +426,10 @@
     // Check if we have a fully connected layer with batches
     const bool is_batched_fc_layer = dst->dimension(1) > 1;
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-        if(is_data_type_quantized(src->data_type()))
+        if (is_data_type_quantized(src->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
@@ -399,36 +439,37 @@
         }
     }
 
-    if(is_batched_fc_layer)
+    if (is_batched_fc_layer)
     {
-        is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), dst->tensor_shape().cbegin() + 1));
+        is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                           (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+                                       dst->tensor_shape().cbegin() + 1));
     }
     else
     {
         is_fc_after_conv = src->num_dimensions() > 1;
     }
 
-    if(!weights_reshaped)
+    if (!weights_reshaped)
     {
         // Validate reshape weights kernel
         ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuTransposeKernel::validate(weights, &reshaped_weights));
         weights_to_use = &reshaped_weights;
     }
 
-    if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+    if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
     {
         // Validate convert weights kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(weights_to_use,
-                                                                              &converted_weights,
-                                                                              src->tensor_shape(),
-                                                                              fc_info.weights_trained_layout));
+        ARM_COMPUTE_RETURN_ON_ERROR(CpuConvertFullyConnectedWeights::validate(
+            weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout));
         weights_to_use = &converted_weights;
     }
 
-    if(is_fc_after_conv)
+    if (is_fc_after_conv)
     {
         // Fully Connected layer after a Convolution Layer without batches
-        ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+        ARM_COMPUTE_RETURN_ERROR_ON(
+            (weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
 
         // Validate flatten kernel
         ARM_COMPUTE_RETURN_ON_ERROR(CpuFlatten::validate(src, &flatten_src));
@@ -440,7 +481,8 @@
         ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1));
     }
     // Validate matrix multiply kernel
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info, fc_info.enable_fast_math, weights_info.weight_format()));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(src_to_use, weights_to_use, biases, dst, fc_info.activation_info,
+                                            fc_info.enable_fast_math, weights_info.weight_format()));
 
     return Status{};
 }
@@ -460,21 +502,21 @@
     CpuAuxTensorHandler transformed_wei(offset_int_vec(_trans_weights_idx), _trans_weights, tensors, false);
 
     // Linearize src if it comes from a convolutional layer
-    if(_is_fc_after_conv)
+    if (_is_fc_after_conv)
     {
-        ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } };
+        ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}};
         _flatten->run(flatten_pack);
     }
 
     ITensorPack gemm_pack = tensors;
     gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
-    if(_needs_weights_reshape || _needs_weights_conversion)
+    if (_needs_weights_reshape || _needs_weights_conversion)
     {
         gemm_pack.add_const_tensor(ACL_SRC_1, transformed_wei.get());
     }
 
     // Run matrix multiply
-    if(_is_quantized_asymmetric)
+    if (_is_quantized_asymmetric)
     {
         _mm_gemmlowp->run(gemm_pack);
     }
@@ -486,7 +528,7 @@
 
 void CpuFullyConnected::prepare(ITensorPack &tensors)
 {
-    if(!_is_prepared || _dynamic_weights)
+    if (!_is_prepared || _dynamic_weights)
     {
 #ifdef ARM_COMPUTE_ASSERTS_ENABLED
         ++_asrt_prepare_count;
@@ -502,20 +544,21 @@
         const ITensor *cur_weights = weights;
 
         // Reshape of the weights (happens only once)
-        if(_needs_weights_reshape)
+        if (_needs_weights_reshape)
         {
             // Run reshape weights kernel and mark weights as unused
-            ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
-            NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(), transpose_pack);
+            ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}};
+            NEScheduler::get().schedule_op(_transpose_weights.get(), Window::DimY, _transpose_weights->window(),
+                                           transpose_pack);
 
             cur_weights->mark_as_unused();
             cur_weights = reshaped_weights.get();
         }
 
         // Convert weights if needed (happens only once)
-        if(_needs_weights_conversion)
+        if (_needs_weights_conversion)
         {
-            ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
+            ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}};
             _convert_weights->run(convert_pack);
 
             cur_weights->mark_as_unused();
@@ -526,7 +569,7 @@
         gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
 
         // Prepare GEMM prepare and release unused weights
-        if(!_is_quantized_asymmetric)
+        if (!_is_quantized_asymmetric)
         {
             _mm_gemm->prepare(gemm_pack);
         }
diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h
index 1e8c647..7073fb9 100644
--- a/src/cpu/operators/CpuFullyConnected.h
+++ b/src/cpu/operators/CpuFullyConnected.h
@@ -24,11 +24,11 @@
 #ifndef ARM_COMPUTE_CPU_FULLY_CONNECTED_H
 #define ARM_COMPUTE_CPU_FULLY_CONNECTED_H
 
-#include "src/cpu/ICpuOperator.h"
-
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/function_info/FullyConnectedLayerInfo.h"
 
+#include "src/cpu/ICpuOperator.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -86,16 +86,24 @@
      * @param[in]  fc_info      (Optional) Fully connected layer additional info
      * @param[in]  weights_info (Optional) Stores necessary compute information when weights are already reshaped
      */
-    void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
-                   FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo());
+    void configure(const ITensorInfo      *src,
+                   const ITensorInfo      *weights,
+                   const ITensorInfo      *biases,
+                   ITensorInfo            *dst,
+                   FullyConnectedLayerInfo fc_info      = FullyConnectedLayerInfo(),
+                   const WeightsInfo      &weights_info = WeightsInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CpuFullyConnected
      *
      * Similar to @ref CpuFullyConnected::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                           FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo(), const WeightsInfo &weights_info = WeightsInfo());
+    static Status validate(const ITensorInfo      *src,
+                           const ITensorInfo      *weights,
+                           const ITensorInfo      *biases,
+                           const ITensorInfo      *dst,
+                           FullyConnectedLayerInfo fc_info      = FullyConnectedLayerInfo(),
+                           const WeightsInfo      &weights_info = WeightsInfo());
 
     /** Static function that queries whether a fixed-format kernel exists and, if it does, returns in the first argument the format in which
      * weights are expected to be reshaped, as defined by the WeightFormat class. Apart from the first argument, the rest of the arguments are the same
@@ -103,19 +111,35 @@
      *
      * @return a status
      */
-    static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights,
-                               const ITensorInfo *biases, const ITensorInfo *dst,
-                               FullyConnectedLayerInfo fc_info, WeightsInfo weights_info);
+    static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+                               const ITensorInfo         *src,
+                               const ITensorInfo         *weights,
+                               const ITensorInfo         *biases,
+                               const ITensorInfo         *dst,
+                               FullyConnectedLayerInfo    fc_info,
+                               WeightsInfo                weights_info);
 
     // Inherited methods override
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
-    void configure_fc_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
-    void configure_conv_fc(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
-    void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act);
+    void configure_fc_fc(const ITensorInfo         *src,
+                         const ITensorInfo         *weights,
+                         const ITensorInfo         *biases,
+                         ITensorInfo               *dst,
+                         const ActivationLayerInfo &act);
+    void configure_conv_fc(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           ITensorInfo               *dst,
+                           const ActivationLayerInfo &act);
+    void configure_mm(const ITensorInfo         *src,
+                      const ITensorInfo         *weights,
+                      const ITensorInfo         *biases,
+                      ITensorInfo               *dst,
+                      const ActivationLayerInfo &act);
 
     enum AuxTensorIdx
     {
diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp
index 34b8459..8da166d 100644
--- a/src/cpu/operators/CpuGemm.cpp
+++ b/src/cpu/operators/CpuGemm.cpp
@@ -24,9 +24,10 @@
 #include "src/cpu/operators/CpuGemm.h"
 
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -57,17 +58,25 @@
 }
 } // namespace
 
-void CpuGemm::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
+void CpuGemm::configure(const ITensorInfo *a,
+                        const ITensorInfo *b,
+                        const ITensorInfo *c,
+                        ITensorInfo       *d,
+                        float              alpha,
+                        float              beta,
+                        const GEMMInfo    &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
     ARM_COMPUTE_ERROR_THROW_ON(CpuGemm::validate(a, b, c, d, alpha, beta, gemm_info));
     ARM_COMPUTE_LOG_PARAMS(a, b, c, d, alpha, beta, gemm_info);
 
-    const cpu::AsmGemmInfo asm_info      = init_assembly_metadata(gemm_info);
-    const bool             is_c_bias     = beta == 1 && c != nullptr;
-    bool                   run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) &&
-                                           (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
-                                           !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
+    const cpu::AsmGemmInfo asm_info  = init_assembly_metadata(gemm_info);
+    const bool             is_c_bias = beta == 1 && c != nullptr;
+    bool                   run_optimised =
+        bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) &&
+        (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
+        !(!b->are_values_constant() &&
+          b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
 
     // Check if we need to reshape the matrix B only on the first run
     _is_prepared                      = false;
@@ -76,9 +85,12 @@
     _run_alpha_scale                  = alpha != 1.f;
     _run_bias_addition                = is_c_bias;
     _run_addition                     = beta != 0 && beta != 1 && c != nullptr;
-    _run_activation                   = gemm_info.activation_info().enabled() && (!run_optimised || (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
+    _run_activation =
+        gemm_info.activation_info().enabled() &&
+        (!run_optimised ||
+         (run_optimised && !cpu::CpuGemmAssemblyDispatch::is_activation_supported(gemm_info.activation_info())));
 
-    if(run_optimised)
+    if (run_optimised)
     {
         const ITensorInfo *c_to_use = is_c_bias ? c : nullptr;
         _asm_glue                   = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
@@ -90,10 +102,11 @@
         _aux_mem[Pretraspose]      = asm_mem_req[Pretraspose];
 
         // Scale product by alpha
-        if(_run_alpha_scale)
+        if (_run_alpha_scale)
         {
             _alpha_scale_func = std::make_unique<cpu::CpuActivation>();
-            _alpha_scale_func->configure(d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
+            _alpha_scale_func->configure(
+                d, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LINEAR, alpha, 0.f));
         }
     }
     else
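
Note: the alpha scale above relies on ActivationFunction::LINEAR, which applies
f(x) = a * x + b; configured with a = alpha and b = 0 it rescales the assembly
output in place. A one-line sketch:

    // f(x) = a * x + b; with a = alpha and b = 0, every element of D becomes alpha * D.
    float linear_activation(float x, float a, float b) { return a * x + b; }
    // e.g. alpha = 0.5f: linear_activation(2.0f, 0.5f, 0.0f) == 1.0f
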
@@ -104,7 +117,7 @@
         _mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>();
 
         // Select between GEMV and GEMM
-        if(_run_vector_matrix_multiplication)
+        if (_run_vector_matrix_multiplication)
         {
             // Configure the matrix multiply kernel
             _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
@@ -118,41 +131,50 @@
             // Configure interleave kernel
             _interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
             _interleave_kernel->configure(a, &_tmp_a);
-            _aux_mem[InterleavedLHS] = MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
+            _aux_mem[InterleavedLHS] =
+                MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
 
             // Configure transpose kernel
             _transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
             _transpose_kernel->configure(b, &_tmp_b);
-            _aux_mem[TransposedRHS] = MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
+            _aux_mem[TransposedRHS] =
+                MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
 
             // Configure matrix multiplication kernel
             _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
         }
 
-        if(_run_bias_addition)
+        if (_run_bias_addition)
         {
             _add_bias = std::make_unique<cpu::CpuAdd>();
             _add_bias->configure(gemm_output_to_use, c, d, ConvertPolicy::SATURATE);
-            _aux_mem[TempResult] = MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size());
+            _aux_mem[TempResult] =
+                MemoryInfo(offset_int_vec(TempResult), MemoryLifetime::Temporary, _tmp_d.total_size());
         }
     }
 
     // Configure matrix addition kernel
-    if(_run_addition)
+    if (_run_addition)
     {
         _ma_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixAdditionKernel>();
         _ma_kernel->configure(c, d, beta);
     }
 
     // Configure activation
-    if(_run_activation)
+    if (_run_activation)
     {
         _activation_func = std::make_unique<cpu::CpuActivation>();
         _activation_func->configure(d, nullptr, gemm_info.activation_info());
     }
 }
 
-Status CpuGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, float alpha, float beta, const GEMMInfo &gemm_info)
+Status CpuGemm::validate(const ITensorInfo *a,
+                         const ITensorInfo *b,
+                         const ITensorInfo *c,
+                         const ITensorInfo *d,
+                         float              alpha,
+                         float              beta,
+                         const GEMMInfo    &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
     const bool is_c_bias    = beta == 1 && c != nullptr;
@@ -162,7 +184,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::BFLOAT16, DataType::F16, DataType::F32);
 
-    if(is_fixed_format_fast_math(gemm_info.weight_format()))
+    if (is_fixed_format_fast_math(gemm_info.weight_format()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
@@ -174,46 +196,54 @@
 
     const int block_by = arm_compute::block_by(gemm_info.weight_format());
     // Test whether im2col has changed the dimensions that are needed for padding
-    if(a->dimension(0) != b->dimension(1) && block_by > 1)
+    if (a->dimension(0) != b->dimension(1) && block_by > 1)
     {
         // have to verify bias
         const size_t dim0_sz = a->dimension(0);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz % block_by) != 0, ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str());
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            (dim0_sz % block_by) != 0,
+            ("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str());
         // a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right
         // b->dimension(1) = kernel_area * input_channel
         // a->dimension(0) = b->dimension(1) + kernel_area * input_pad_right
         const size_t input_pad_right = (dim0_sz - b->dimension(1)) % block_by;
         const size_t kernel_area     = (dim0_sz - b->dimension(1)) / input_pad_right;
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((dim0_sz - kernel_area * input_pad_right) != b->dimension(1), "The product AB is defined only if A number of columns and B number of rows are related");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            (dim0_sz - kernel_area * input_pad_right) != b->dimension(1),
+            "The product AB is defined only if A number of columns and B number of rows are related");
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(0) != b->dimension(1), "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            a->dimension(0) != b->dimension(1),
+            "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
     }
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
-    if(a->data_type() != DataType::BFLOAT16)
+    if (a->data_type() != DataType::BFLOAT16)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, d);
     }
 
-    if(run_addition)
+    if (run_addition)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
         ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0), "The C matrix must have the same number of columns as the matrix B");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1),
+                                        "The C matrix must have the same number of rows as the matrix A");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0),
+                                        "The C matrix must have the same number of columns as the matrix B");
     }
 
-    if(d->total_size() != 0)
+    if (d->total_size() != 0)
     {
         // For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more.
         ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b->dimension(0) != d->dimension(0));
-        if(gemm_info.depth_output_gemm3d() != 0)
+        if (gemm_info.depth_output_gemm3d() != 0)
         {
-            if(gemm_info.reinterpret_input_as_3d())
+            if (gemm_info.reinterpret_input_as_3d())
             {
                 ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
                 ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != d->dimension(2));
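
Note: worked numbers for the block_by padding check above (hypothetical sizes, not
taken from the library):

    // 3x3 kernel (kernel_area = 9), 3 input channels, block_by = 4:
    //   b->dimension(1) = 9 * 3       = 27
    //   a->dimension(0) = 9 * (3 + 1) = 36   (im2col padded channels 3 -> 4)
    //   dim0_sz - b->dimension(1)     = 9
    //   input_pad_right = 9 % 4       = 1
    //   kernel_area     = 9 / 1       = 9
    //   36 - 9 * 1 == 27, so the product AB is well defined
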
@@ -230,15 +260,19 @@
     }
 
     // Check if we need to run the optimized assembly kernel
-    cpu::AsmGemmInfo asm_info      = init_assembly_metadata(gemm_info);
-    const bool       run_optimised = bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) &&
-                                     (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
-                                     !(!b->are_values_constant() && b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
+    cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
+    const bool       run_optimised =
+        bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) &&
+        (c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support arbitrary beta coefficients.
+        !(!b->are_values_constant() &&
+          b->tensor_shape().z() > 1); // Disable batch matmul as optimized GeMM handles batching differently.
 
-    if(!run_optimised)
+    if (!run_optimised)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "CpuGemm cannot reinterpret the input tensor as 3D");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "CpuGemm cannot reinterpret the output tensor as 3D");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(),
+                                        "CpuGemm cannot reinterpret the input tensor as 3D");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0,
+                                        "CpuGemm cannot reinterpret the output tensor as 3D");
 
         // Check if the first input tensor is a vector.
         const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
@@ -254,7 +288,8 @@
         int       mult_transpose1xW_width   = 1;
         int       mult_interleave4x4_height = 1;
 
-        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
+        const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(
+            m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
 
         const ITensorInfo *matrix_a_info = a;
         const ITensorInfo *matrix_b_info = b;
@@ -263,39 +298,44 @@
         TensorInfo tmp_b_info{};
         TensorInfo tmp_output_info = *d->clone();
 
-        if(run_interleave_transpose)
+        if (run_interleave_transpose)
         {
             matrix_a_info = &tmp_a_info;
             matrix_b_info = &tmp_b_info;
 
             // Validate interleave kernel
-            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(*a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
+            auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_interleaved_shape(
+                                               *a, mult_interleave4x4_height, gemm_info.reinterpret_input_as_3d())));
             ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info));
 
             // Validate transpose kernel
-            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(*b, mult_transpose1xW_width)));
+            auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(
+                                               *b, mult_transpose1xW_width)));
             ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
         }
 
         // Validate matrix multiply
-        auto_init_if_empty(tmp_output_info, matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
-        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
+        auto_init_if_empty(tmp_output_info,
+                           matrix_a_info->clone()->set_tensor_shape(compute_mm_shape(
+                               *matrix_a_info, *matrix_b_info, run_interleave_transpose, reshape_info)));
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixMultiplyKernel::validate(
+            matrix_a_info, matrix_b_info, &tmp_output_info, alpha, run_interleave_transpose, reshape_info));
 
-        if(is_c_bias)
+        if (is_c_bias)
         {
             ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuAdd::validate(&tmp_output_info, c, d, ConvertPolicy::SATURATE));
         }
     }
 
     // Validate matrix addition kernel
-    if(run_addition)
+    if (run_addition)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmMatrixAdditionKernel::validate(c, d, beta));
     }
 
     // Validate activation
     const ActivationLayerInfo &activation = gemm_info.activation_info();
-    if(activation.enabled())
+    if (activation.enabled())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuActivation::validate(d, nullptr, activation));
     }
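
Note: the interleave/transpose pair validated above reshapes both operands for
blocked multiplication. Roughly, and only as an illustration of the idea rather
than the exact kernel layout:

    // Interleave 4x4 on the LHS: four rows are packed into one so the kernel
    // reads 4-element column groups sequentially.
    //   a00 a01 a02 a03
    //   a10 a11 a12 a13   -->   a00 a10 a20 a30 | a01 a11 a21 a31 | ...
    //   a20 a21 a22 a23
    //   a30 a31 a32 a33
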
@@ -312,15 +352,15 @@
     auto c = tensors.get_const_tensor(ACL_SRC_2);
     auto d = tensors.get_tensor(ACL_DST);
 
-    if(_asm_glue && _asm_glue->is_configured())
+    if (_asm_glue && _asm_glue->is_configured())
     {
         // Pass c to asm dispatch only if it's the bias tensor
         ITensorPack asm_pack = tensors;
         asm_pack.add_const_tensor(ACL_SRC_2, _run_bias_addition ? c : nullptr);
         _asm_glue->run(asm_pack);
-        if(_run_alpha_scale)
+        if (_run_alpha_scale)
         {
-            ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
+            ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}};
             _alpha_scale_func->run(pack);
         }
     }
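
Note: ITensorPack maps role identifiers (ACL_SRC_0, ACL_DST, ...) to tensors, so
kernels receive operands by role rather than by position. A minimal stand-in
illustrating the pattern (Role, Pack and run_kernel are hypothetical, not library
API):

    #include <map>

    struct Tensor; // opaque stand-in
    enum Role { SRC_0, SRC_1, DST };
    using Pack = std::map<Role, Tensor *>;

    void run_kernel(const Pack &pack) { (void)pack; } // would look operands up by role

    void example(Tensor *a, Tensor *b, Tensor *d)
    {
        Pack mm_pack{{SRC_0, a}, {SRC_1, b}, {DST, d}}; // same shape as the packs built in run()
        run_kernel(mm_pack);
    }
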
@@ -330,18 +370,20 @@
         CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true);
         CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true);
 
-        ITensorPack mm_pack{ { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_DST, (_run_bias_addition) ? temp_d.get() : d } };
-        if(!_run_vector_matrix_multiplication)
+        ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}};
+        if (!_run_vector_matrix_multiplication)
         {
             // Run interleave kernel
-            ITensorPack interleave_pack{ { ACL_SRC, a }, { ACL_DST, interleaved_a.get() } };
-            NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(), interleave_pack);
+            ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}};
+            NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(),
+                                           interleave_pack);
 
-            if(!_reshape_b_only_on_first_run)
+            if (!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                ITensorPack transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
-                NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
+                ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
+                NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
+                                               transpose_pack);
             }
 
             // Use reshaped matrices
@@ -349,48 +391,52 @@
             mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get());
         }
 
-        NEScheduler::get().schedule_op(_mm_kernel.get(), _run_vector_matrix_multiplication ? Window::DimX : Window::DimY, _mm_kernel->window(), mm_pack);
+        NEScheduler::get().schedule_op(_mm_kernel.get(),
+                                       _run_vector_matrix_multiplication ? Window::DimX : Window::DimY,
+                                       _mm_kernel->window(), mm_pack);
 
         // Run bias addition kernel
-        if(_run_bias_addition)
+        if (_run_bias_addition)
         {
-            ITensorPack pack{ { ACL_SRC_0, temp_d.get() }, { ACL_SRC_1, c }, { ACL_DST, d } };
+            ITensorPack pack{{ACL_SRC_0, temp_d.get()}, {ACL_SRC_1, c}, {ACL_DST, d}};
             _add_bias->run(pack);
         }
     }
 
     // Run matrix addition kernel
-    if(_run_addition)
+    if (_run_addition)
     {
-        ITensorPack c_add_pack{ { ACL_SRC, c }, { ACL_DST, d } };
+        ITensorPack c_add_pack{{ACL_SRC, c}, {ACL_DST, d}};
         NEScheduler::get().schedule_op(_ma_kernel.get(), Window::DimY, _ma_kernel->window(), c_add_pack);
     }
 
     // Run activation function
-    if(_run_activation)
+    if (_run_activation)
     {
-        ITensorPack pack{ { ACL_SRC, d }, { ACL_DST, d } };
+        ITensorPack pack{{ACL_SRC, d}, {ACL_DST, d}};
         _activation_func->run(pack);
     }
 }
 
 void CpuGemm::prepare(ITensorPack &tensors)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
-        if(_asm_glue && _asm_glue->is_configured())
+        if (_asm_glue && _asm_glue->is_configured())
         {
             _asm_glue->prepare(tensors);
         }
-        else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
+        else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
         {
-            const ITensor *b     = tensors.get_const_tensor(ACL_SRC_1);
-            ITensor       *b_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
+            const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
+            ITensor       *b_aux =
+                utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
             ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux);
 
             CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux);
-            ITensorPack         transpose_pack{ { ACL_SRC, b }, { ACL_DST, transposed_b.get() } };
-            NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(), transpose_pack);
+            ITensorPack         transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
+            NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
+                                           transpose_pack);
         }
         _is_prepared = true;
     }
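
Note: prepare() above follows the usual run-once pattern: the constant RHS is
transposed into its auxiliary buffer on the first call and skipped afterwards,
guarded by _is_prepared. A reduced sketch:

    bool is_prepared = false; // stands in for _is_prepared

    void prepare_once()
    {
        if (!is_prepared)
        {
            // One-time work: reshape/transpose the constant RHS into its aux buffer.
            is_prepared = true;
        }
    }
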
@@ -401,8 +447,12 @@
     return _aux_mem;
 }
 
-Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
-                             const GEMMInfo &gemm_info)
+Status CpuGemm::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+                             const ITensorInfo         *a,
+                             const ITensorInfo         *b,
+                             const ITensorInfo         *c,
+                             const ITensorInfo         *d,
+                             const GEMMInfo            &gemm_info)
 {
     const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
 
diff --git a/src/cpu/operators/CpuGemm.h b/src/cpu/operators/CpuGemm.h
index 9b08e5d..6b30d13 100644
--- a/src/cpu/operators/CpuGemm.h
+++ b/src/cpu/operators/CpuGemm.h
@@ -24,12 +24,12 @@
 #ifndef ARM_COMPUTE_CPU_GEMM_H
 #define ARM_COMPUTE_CPU_GEMM_H
 
-#include "src/cpu/ICpuOperator.h"
-
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/function_info/GEMMInfo.h"
+
+#include "src/cpu/ICpuOperator.h"
 #include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
 #include "src/cpu/kernels/CpuGemmMatrixAdditionKernel.h"
 #include "src/cpu/kernels/CpuGemmMatrixMultiplyKernel.h"
@@ -93,16 +93,26 @@
      * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
      *                       if the reshape of matrix B should happen only for the first run
      */
-    void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
-                   float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+    void configure(const ITensorInfo *a,
+                   const ITensorInfo *b,
+                   const ITensorInfo *c,
+                   ITensorInfo       *d,
+                   float              alpha,
+                   float              beta,
+                   const GEMMInfo    &gemm_info = GEMMInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref CpuGemm.
      *
      * Similar to @ref CpuGemm::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
-                           float alpha, float beta, const GEMMInfo &gemm_info = GEMMInfo());
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *d,
+                           float              alpha,
+                           float              beta,
+                           const GEMMInfo    &gemm_info = GEMMInfo());
 
     /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
      *
@@ -111,12 +121,16 @@
      * the value of arm_compute::WeightFormat needs to be passed via the
      * parameter gemm_info.
      */
-    static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
-                               const GEMMInfo &gemm_info = GEMMInfo());
+    static Status has_opt_impl(arm_compute::WeightFormat &weight_format,
+                               const ITensorInfo         *a,
+                               const ITensorInfo         *b,
+                               const ITensorInfo         *c,
+                               const ITensorInfo         *d,
+                               const GEMMInfo            &gemm_info = GEMMInfo());
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &constants) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &constants) override;
     experimental::MemoryRequirements workspace() const override;
 
     /** Indicates if the convolution executes in variable weights mode.
@@ -138,28 +152,28 @@
         Count
     };
 
-    std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel>  _interleave_kernel{ nullptr };
-    std::unique_ptr<kernels::CpuGemmTranspose1xWKernel>   _transpose_kernel{ nullptr };
-    std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{ nullptr };
-    std::unique_ptr<CpuGemmAssemblyDispatch>              _asm_glue{ nullptr };
-    std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{ nullptr };
-    std::unique_ptr<CpuActivation>                        _alpha_scale_func{ nullptr };
-    std::unique_ptr<CpuAdd>                               _add_bias{ nullptr };
-    std::unique_ptr<CpuActivation>                        _activation_func{ nullptr };
+    std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel>  _interleave_kernel{nullptr};
+    std::unique_ptr<kernels::CpuGemmTranspose1xWKernel>   _transpose_kernel{nullptr};
+    std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{nullptr};
+    std::unique_ptr<CpuGemmAssemblyDispatch>              _asm_glue{nullptr};
+    std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{nullptr};
+    std::unique_ptr<CpuActivation>                        _alpha_scale_func{nullptr};
+    std::unique_ptr<CpuAdd>                               _add_bias{nullptr};
+    std::unique_ptr<CpuActivation>                        _activation_func{nullptr};
 
     TensorInfo _tmp_a{};
     TensorInfo _tmp_b{};
     TensorInfo _tmp_d{};
 
-    bool _run_vector_matrix_multiplication{ false };
-    bool _run_alpha_scale{ false };
-    bool _run_addition{ false };
-    bool _run_bias_addition{ false };
-    bool _run_activation{ false };
-    bool _reshape_b_only_on_first_run{ false };
-    bool _is_prepared{ false };
+    bool _run_vector_matrix_multiplication{false};
+    bool _run_alpha_scale{false};
+    bool _run_addition{false};
+    bool _run_bias_addition{false};
+    bool _run_activation{false};
+    bool _reshape_b_only_on_first_run{false};
+    bool _is_prepared{false};
 
-    experimental::MemoryRequirements _aux_mem{ Count };
+    experimental::MemoryRequirements _aux_mem{Count};
 };
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp
index 39b410d..7c59d88 100644
--- a/src/cpu/operators/CpuGemmConv2d.cpp
+++ b/src/cpu/operators/CpuGemmConv2d.cpp
@@ -26,9 +26,9 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "src/common/utils/Log.h"
@@ -52,8 +52,11 @@
 {
 namespace cpu
 {
-CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info,
-                                                        const Size2D &dilation, const ActivationLayerInfo &act_info)
+CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo         *src,
+                                                        const ITensorInfo         *weights,
+                                                        const PadStrideInfo       &conv_info,
+                                                        const Size2D              &dilation,
+                                                        const ActivationLayerInfo &act_info)
 {
     const DataLayout   data_layout   = src->data_layout();
     const int          idx_width     = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -62,63 +65,86 @@
     const unsigned int kernel_height = weights->dimension(idx_height);
     unsigned int       conv_w        = 0;
     unsigned int       conv_h        = 0;
-    std::tie(conv_w, conv_h)         = scaled_dimensions(src->dimension(idx_width),
-                                                         src->dimension(idx_height),
-                                                         kernel_width,
-                                                         kernel_height,
-                                                         conv_info,
-                                                         dilation);
-    const bool skip_im2col           = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+                                                 kernel_height, conv_info, dilation);
+    const bool skip_im2col   = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+                              conv_info.stride().first == 1 && conv_info.stride().second == 1);
 
-    if(skip_im2col)
+    if (skip_im2col)
     {
-        const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true))));
-        if(skip_col2im)
+        const bool skip_col2im =
+            (data_layout == DataLayout::NHWC &&
+             (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ true))));
+        if (skip_col2im)
         {
-            return { true, true };
+            return {true, true};
         }
     }
     else
     {
-        const bool skip_col2im = (data_layout == DataLayout::NHWC && (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false))));
-        if(skip_col2im)
+        const bool skip_col2im =
+            (data_layout == DataLayout::NHWC &&
+             (bool(CpuGemmConv2d::validate_gemm3d(src, weights, act_info, conv_h, /*skip_im2col*/ false))));
+        if (skip_col2im)
         {
-            return { false, true };
+            return {false, true};
         }
     }
 
     // Default case when we cannot reinterpret the input and output as 3D.
-    return { false, false };
+    return {false, false};
 }
 
 CpuGemmConv2d::CpuGemmConv2d()
-    : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(),
-      _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
+    : _weights_reshape_kernel(nullptr),
+      _im2col_kernel(),
+      _mm_gemm(),
+      _mm_gemmlowp(),
+      _col2im_kernel(),
+      _reshape(),
+      _im2col_output(),
+      _weights_reshaped(),
+      _gemm_output(),
+      _gemm_output_3d(),
+      _data_layout(DataLayout::NCHW),
+      _skip_im2col(false),
+      _skip_col2im(false),
+      _is_quantized(false),
+      _is_prepared(false),
+      _aux_mem(AuxTensorIdx::Count)
 {
 }
 CpuGemmConv2d::~CpuGemmConv2d() = default;
 
-void CpuGemmConv2d::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info,
-                                 bool enable_fast_math, int gemm_3d_depth, bool fixed_format, arm_compute::WeightFormat weight_format)
+void CpuGemmConv2d::configure_mm(const ITensorInfo         *src,
+                                 const ITensorInfo         *weights,
+                                 const ITensorInfo         *biases,
+                                 ITensorInfo               *dst,
+                                 const ActivationLayerInfo &act_info,
+                                 bool                       enable_fast_math,
+                                 int                        gemm_3d_depth,
+                                 bool                       fixed_format,
+                                 arm_compute::WeightFormat  weight_format)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth, _skip_im2col, fixed_format, weight_format));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth,
+                                           _skip_im2col, fixed_format, weight_format));
 
     // Create GEMMInfo structure
-    const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
-                                         gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
-                                         false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
+    const GEMMInfo &gemm_info =
+        GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+                 _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
+                 false, enable_fast_math, false, act_info, fixed_format, weight_format);
 
     // Supported activations in GEMM
-    const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
-                                                                               ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                               ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                                                             };
+    const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+        ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+        ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
 
-    if(_is_quantized)
+    if (_is_quantized)
     {
-        TensorInfo tmp_src{ *src };
-        TensorInfo tmp_weights{ *weights };
+        TensorInfo tmp_src{*src};
+        TensorInfo tmp_weights{*weights};
         // Since computing convolution requires negative offsets, we change the QuantizationInfo()
         // Extract and negate the input and weights offsets
         const QuantizationInfo        iqinfo    = src->quantization_info();
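
Note: the skip_im_col_info() decision earlier in this file boils down to this: a
1x1 kernel at unit stride in NHWC already presents the input in GEMM layout, so
im2col is unnecessary; col2im can additionally be skipped when the GEMM can emit
3D output directly. A sketch of the first half (helper name is hypothetical):

    // im2col is redundant for pointwise, stride-1 convolutions in NHWC.
    bool can_skip_im2col(bool is_nhwc, unsigned kw, unsigned kh, unsigned sx, unsigned sy)
    {
        return is_nhwc && kw == 1 && kh == 1 && sx == 1 && sy == 1;
    }
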
@@ -129,7 +155,7 @@
         const DataType                data_type = src->data_type();
 
         tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
-        if(!is_data_type_quantized_per_channel(tmp_weights.data_type()))
+        if (!is_data_type_quantized_per_channel(tmp_weights.data_type()))
         {
             const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
             tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
@@ -142,7 +168,7 @@
         int32_t min_activation       = type_min.get<int32_t>();
         int32_t max_activation       = type_max.get<int32_t>();
 
-        if(supported_acts.count(act_info.activation()) != 0)
+        if (supported_acts.count(act_info.activation()) != 0)
         {
             std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
         }
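
Note: worked numbers for the quantized clamp above, assuming the usual asymmetric
quantization rule q = round(x / scale) + offset (values hypothetical):

    // Output: QASYMM8 with scale = 0.1 and offset (zero point) = 128, BOUNDED_RELU(6.0):
    //   min_activation = quantize(0.0) = 0  + 128 = 128
    //   max_activation = quantize(6.0) = 60 + 128 = 188
    // For activations outside supported_acts the bounds stay at the full type range (0..255).
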
@@ -156,11 +182,12 @@
         quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
 
         _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
-        _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, enable_fast_math, false, act_info, fixed_format,
-                                                                              weight_format));
+        _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst,
+                                GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false,
+                                         enable_fast_math, false, act_info, fixed_format, weight_format));
 
         auto mm_mem_req = _mm_gemmlowp->workspace();
-        for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+        for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
         {
             _aux_mem[cont] = mm_mem_req[cont];
         }
@@ -171,26 +198,35 @@
         _mm_gemm = std::make_unique<CpuGemm>();
         _mm_gemm->configure(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
         auto mm_mem_req = _mm_gemm->workspace();
-        for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+        for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
         {
             _aux_mem[cont] = mm_mem_req[cont];
         }
     }
 }
 
-Status CpuGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                  const ActivationLayerInfo &act_info, bool enable_fast_math, int gemm_3d_depth, bool skip_im2col, bool fixed_format, arm_compute::WeightFormat weight_format)
+Status CpuGemmConv2d::validate_mm(const ITensorInfo         *src,
+                                  const ITensorInfo         *weights,
+                                  const ITensorInfo         *biases,
+                                  const ITensorInfo         *dst,
+                                  const ActivationLayerInfo &act_info,
+                                  bool                       enable_fast_math,
+                                  int                        gemm_3d_depth,
+                                  bool                       skip_im2col,
+                                  bool                       fixed_format,
+                                  arm_compute::WeightFormat  weight_format)
 {
     const DataType data_type             = src->data_type();
     const bool     is_quantized          = is_data_type_quantized_asymmetric(data_type);
     const bool     is_activation_enabled = act_info.enabled();
 
     // Create GEMMInfo structure
-    const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
-                                        gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
-                                        false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format);
+    const GEMMInfo gemm_info =
+        GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+                 skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
+                 false, enable_fast_math, false, act_info, fixed_format, weight_format);
 
-    if(is_quantized)
+    if (is_quantized)
     {
         // Since computing convolution requires negative offsets, we change the QuantizationInfo()
         // Extract and negate the input and weights offsets
@@ -206,11 +242,10 @@
         int32_t min_activation       = type_min.get<int32_t>();
         int32_t max_activation       = type_max.get<int32_t>();
 
-        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                                                                 };
-        if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+            ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+            ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+        if (is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
         {
             std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
         }
@@ -229,8 +264,9 @@
         input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
         weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
 
-        return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, enable_fast_math,
-                                                                                                               false, act_info));
+        return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst,
+                                                       GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false,
+                                                                output_info, false, enable_fast_math, false, act_info));
     }
     else
     {
@@ -239,36 +275,44 @@
     }
 }
 
-Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+Status CpuGemmConv2d::validate_gemm3d(const ITensorInfo         *input_info,
+                                      const ITensorInfo         *weights_info,
+                                      const ActivationLayerInfo &act_info,
+                                      int                        gemm_3d_depth,
+                                      bool                       skip_im2col)
 {
     const DataType     data_type = input_info->data_type();
     const unsigned int mult_y    = skip_im2col ? 1U : gemm_3d_depth;
     const unsigned int mult_z    = skip_im2col ? gemm_3d_depth : 1U;
 
     // Set dummy tensor shapes for the validation
-    const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
+    const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type,
+                                      input_info->quantization_info());
     const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
-    const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
+    const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type,
+                                       input_info->quantization_info());
 
-    return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false, gemm_3d_depth, skip_im2col);
+    return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, false,
+                       gemm_3d_depth, skip_im2col);
 }
 
-void CpuGemmConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                              const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CpuGemmConv2d::configure(const ITensorInfo         *src,
+                              const ITensorInfo         *weights,
+                              const ITensorInfo         *biases,
+                              ITensorInfo               *dst,
+                              const PadStrideInfo       &conv_info,
+                              const WeightsInfo         &weights_info,
+                              const Size2D              &dilation,
+                              const ActivationLayerInfo &act_info,
+                              bool                       enable_fast_math,
+                              unsigned int               num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
     ARM_COMPUTE_UNUSED(num_groups, weights_info);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src,
-                                                       weights,
-                                                       biases,
-                                                       dst,
-                                                       conv_info,
-                                                       weights_info,
-                                                       dilation,
-                                                       act_info,
-                                                       enable_fast_math,
-                                                       num_groups));
-    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+    ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConv2d::validate(src, weights, biases, dst, conv_info, weights_info, dilation,
+                                                       act_info, enable_fast_math, num_groups));
+    ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math,
+                           num_groups);
 
     const DataType   data_type   = src->data_type();
     const DataLayout data_layout = src->data_layout();
@@ -283,7 +327,8 @@
     _is_prepared  = weights_info.retain_internal_weights();
     _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
     _data_layout  = data_layout;
-    _skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
+    _skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+                    conv_info.stride().first == 1 && conv_info.stride().second == 1);
 
     const ITensorInfo *gemm_input_to_use  = src;
     ITensorInfo       *gemm_output_to_use = dst;
@@ -291,20 +336,17 @@
     // Get convolved dimensions
     unsigned int conv_w      = 0;
     unsigned int conv_h      = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
-                                                 src->dimension(idx_height),
-                                                 kernel_width,
-                                                 kernel_height,
-                                                 conv_info,
-                                                 dilation);
+    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+                                                 kernel_height, conv_info, dilation);
 
     ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
                              "Output shape does not match the expected one");
 
     // Check if GEMM3D is supported
-    const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
-    _skip_im2col                            = skip_info.skip_im2col;
-    _skip_col2im                            = skip_info.skip_col2im;
+    const CpuGemmConv2d::SkipInfo skip_info =
+        CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
+    _skip_im2col = skip_info.skip_im2col;
+    _skip_col2im = skip_info.skip_col2im;
 
     // Get parameters from conv_info
     unsigned int stride_x        = 0;
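
Note: scaled_dimensions() computes the convolved output size from input size,
kernel size, padding, stride and dilation. With floor rounding (one of the modes
the library supports) the arithmetic is:

    //   dilated_k = dilation * (k - 1) + 1
    //   out       = (in + pad_left + pad_right - dilated_k) / stride + 1
    // e.g. in = 224, k = 3, pad = 1 + 1, stride = 2, dilation = 1:
    //   dilated_k = 3, out = (224 + 2 - 3) / 2 + 1 = 112
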
@@ -320,17 +362,19 @@
     _weights_reshaped.set_quantization_info(weights->quantization_info());
 
     // Create tensor to store im2col reshaped inputs
-    if(!_skip_im2col)
+    if (!_skip_im2col)
     {
         const int    block_by        = arm_compute::block_by(weights_info.weight_format());
         unsigned int input_pad_right = 0;
-        if(block_by > 1)
+        if (block_by > 1)
         {
-            input_pad_right = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
+            input_pad_right =
+                (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
         }
         // Configure
         _im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>();
-        _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation, num_groups, input_pad_right);
+        _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation,
+                                  num_groups, input_pad_right);
 
         // Update GEMM input
         gemm_input_to_use = &_im2col_output;
@@ -338,7 +382,7 @@
 
     // Create temporary GEMM output tensor in case we cannot skip col2im
     const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
-    if(!_skip_col2im)
+    if (!_skip_col2im)
     {
         TensorShape shape_gemm;
 
@@ -368,9 +412,10 @@
     // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
     const bool         fixed_format  = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
-    configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math, gemm_3d_depth, fixed_format, weights_info.weight_format());
+    configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math,
+                 gemm_3d_depth, fixed_format, weights_info.weight_format());
 
-    if(!_skip_col2im && _data_layout == DataLayout::NCHW)
+    if (!_skip_col2im && _data_layout == DataLayout::NCHW)
     {
         // Configure col2im
         _col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>();
@@ -390,14 +435,24 @@
     gemm_trans_wei      = _mm_gemmlowp != nullptr ? _aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS
 
     // Check lifetime
-    _aux_mem[Im2ColOutput]    = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
-    _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent, _weights_reshaped.total_size());
-    _aux_mem[GemmOutput]      = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
+    _aux_mem[Im2ColOutput] =
+        MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
+    _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped),
+                                           gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent,
+                                           _weights_reshaped.total_size());
+    _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
 }
 
-Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                   const PadStrideInfo &conv_info,
-                                   const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math)
+Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+                                   const ITensorInfo         *src,
+                                   const ITensorInfo         *weights,
+                                   const ITensorInfo         *biases,
+                                   const ITensorInfo         *dst,
+                                   const PadStrideInfo       &conv_info,
+                                   const WeightsInfo         &weights_info,
+                                   const Size2D              &dilation,
+                                   const ActivationLayerInfo &act_info,
+                                   const bool                 enable_fast_math)
 {
     const DataLayout   data_layout   = src->data_layout();
     const int          idx_width     = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
@@ -406,36 +461,44 @@
     const unsigned int kernel_height = weights->dimension(idx_height);
     unsigned int       conv_w        = 0;
     unsigned int       conv_h        = 0;
-    std::tie(conv_w, conv_h)         = scaled_dimensions(src->dimension(idx_width),
-                                                         src->dimension(idx_height),
-                                                         kernel_width,
-                                                         kernel_height,
-                                                         conv_info,
-                                                         dilation);
+    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+                                                 kernel_height, conv_info, dilation);
 
-    const CpuGemmConv2d::SkipInfo skip_info = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info,
-                                                                              dilation, act_info);
+    const CpuGemmConv2d::SkipInfo skip_info =
+        CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
 
     const bool         skip_im2col   = skip_info.skip_im2col;
     const bool         skip_col2im   = skip_info.skip_col2im;
     const unsigned int gemm_3d_depth = skip_col2im ? conv_h : 0;
     const bool         fixed_format  = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
-    const GEMMInfo     gemm_info     = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
-                                                gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
-                                                false, GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format());
+    const GEMMInfo     gemm_info =
+        GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+                 skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
+                 false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format());
 
     return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
 }
 
-Status CpuGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
-                               const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CpuGemmConv2d::validate(const ITensorInfo         *src,
+                               const ITensorInfo         *weights,
+                               const ITensorInfo         *biases,
+                               const ITensorInfo         *dst,
+                               const PadStrideInfo       &conv_info,
+                               const WeightsInfo         &weights_info,
+                               const Size2D              &dilation,
+                               const ActivationLayerInfo &act_info,
+                               bool                       enable_fast_math,
+                               unsigned int               num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::BFLOAT16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16,
+                                                         DataType::F16, DataType::F32);
 
-    if(!is_fixed_format(weights_info.weight_format()))
+    if (!is_fixed_format(weights_info.weight_format()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
     }
@@ -468,29 +531,25 @@
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
 
-    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
-                                                 src->dimension(idx_height),
-                                                 kernel_width,
-                                                 kernel_height,
-                                                 conv_info,
-                                                 dilation);
+    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+                                                 kernel_height, conv_info, dilation);
 
     // Check if GEMM3D is supported
-    const CpuGemmConv2d::SkipInfo skip_info   = CpuGemmConv2d::skip_im_col_info(src, weights, conv_info,
-                                                                                dilation, act_info);
-    const bool                    skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im;
+    const CpuGemmConv2d::SkipInfo skip_info =
+        CpuGemmConv2d::skip_im_col_info(src, weights, conv_info, dilation, act_info);
+    const bool skip_im2col = skip_info.skip_im2col, skip_col2im = skip_info.skip_col2im;
 
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
 
     // Validate biases
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
-        if(is_quantized)
+        if (is_quantized)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
-        else if(is_bf16)
+        else if (is_bf16)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
         }
@@ -503,20 +562,23 @@
     }
 
     unsigned int mat_weights_cols = weights->dimension(idx_kernels);
-    unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
+    unsigned int mat_weights_rows =
+        weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
 
     weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, weights->data_type());
     weights_reshaped_info.set_quantization_info(weights->quantization_info());
     weights_to_use = &weights_reshaped_info;
 
-    if(!skip_im2col)
+    if (!skip_im2col)
     {
         const int block_by        = arm_compute::block_by(weights_info.weight_format());
         int       input_pad_right = 0;
-        if(block_by > 1)
+        if (block_by > 1)
         {
-            input_pad_right  = (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
-            mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * (weights->dimension(idx_channel) + input_pad_right);
+            input_pad_right =
+                (src->dimension(idx_channel) % block_by) == 0 ? 0 : block_by - (src->dimension(idx_channel) % block_by);
+            mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) *
+                               (weights->dimension(idx_channel) + input_pad_right);
         }
 
         // Create tensor info for im2col reshaped inputs
@@ -528,13 +590,15 @@
 
         im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
         im2col_reshaped_info.set_quantization_info(src->quantization_info());
-        ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation, num_groups, input_pad_right));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height),
+                                               conv_info, append_bias, dilation, num_groups, input_pad_right));
         gemm_input_to_use = &im2col_reshaped_info;
     }
 
     // Create temporary GEMM output tensor in case we cannot skip col2im
     const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
-    if(!skip_col2im)
+    if (!skip_col2im)
     {
         TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
         shape_gemm.set(0, mat_weights_cols);
@@ -549,13 +613,15 @@
     gemm_output_to_use      = &info_gemm;
     const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format,
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info,
+                                            enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format,
                                             weights_info.weight_format()));
 
     // Validate Col2Im/ReshapeLayer
-    if(!skip_col2im && (data_layout == DataLayout::NCHW))
+    if (!skip_col2im && (data_layout == DataLayout::NCHW))
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h)));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h)));
     }
 
     return Status{};
@@ -574,15 +640,11 @@
     CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
 
     bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0);
-    if(!_skip_im2col)
+    if (!_skip_im2col)
     {
         // Run input reshaping
         unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-        ITensorPack  pack  =
-        {
-            { TensorType::ACL_SRC, src },
-            { TensorType::ACL_DST, im2col_output.get() }
-        };
+        ITensorPack  pack  = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}};
         NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack);
         gemm_input_to_use = im2col_output.get();
     }
@@ -595,11 +657,11 @@
     gemm3d.allocator()->import_memory(out_to_use->buffer());
     auto gemm_output_to_use = gemm_output.get();
 
-    if(_skip_im2col)
+    if (_skip_im2col)
     {
         gemm_output_to_use = &gemm3d;
     }
-    if(_skip_col2im && !out_has_padding)
+    if (_skip_col2im && !out_has_padding)
     {
         gemm_output_to_use = dst;
     }
@@ -607,12 +669,12 @@
     // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions
     ITensorPack pack_mm = tensors;
     pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
-    if(!this->isVarWeightsKernel())
+    if (!this->isVarWeightsKernel())
     {
         pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
     }
     pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
-    if(_is_quantized)
+    if (_is_quantized)
     {
         // Run gemmlowp
         _mm_gemmlowp->run(pack_mm);
@@ -624,45 +686,33 @@
     }
 
     // Reshape output matrix
-    if(!_skip_col2im)
+    if (!_skip_col2im)
     {
-        if(_data_layout == DataLayout::NCHW)
+        if (_data_layout == DataLayout::NCHW)
         {
-            ITensorPack pack =
-            {
-                { TensorType::ACL_SRC, gemm_output.get() },
-                { TensorType::ACL_DST, dst }
-            };
+            ITensorPack pack = {{TensorType::ACL_SRC, gemm_output.get()}, {TensorType::ACL_DST, dst}};
             NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack);
         }
         else
         {
-            ITensorPack pack =
-            {
-                { TensorType::ACL_SRC, gemm_output_to_use },
-                { TensorType::ACL_DST, dst }
-            };
+            ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
             _reshape->run(pack);
         }
     }
-    else if(out_has_padding)
+    else if (out_has_padding)
     {
-        ITensorPack pack =
-        {
-            { TensorType::ACL_SRC, gemm_output_to_use },
-            { TensorType::ACL_DST, dst }
-        };
+        ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
         _reshape->run(pack);
     }
 }
 
 void CpuGemmConv2d::prepare(ITensorPack &tensors)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         // Variable weights executions that use fixed-format kernels
         // need no reshaping of the weights.
-        if(this->isVarWeightsKernel())
+        if (this->isVarWeightsKernel())
         {
             _is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors);
             _is_prepared = true;
@@ -672,11 +722,7 @@
         // Run weights reshaping and mark original weights tensor as unused
         CpuAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors);
         auto                weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
-        ITensorPack         pack    =
-        {
-            { TensorType::ACL_SRC, weights },
-            { TensorType::ACL_DST, weights_reshaped.get() }
-        };
+        ITensorPack         pack    = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}};
         NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack);
         weights->mark_as_unused();
        ITensorPack gemm_pack = tensors;
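
Note on the style in the hunks above: for orientation, a minimal sketch of clang-format 14 options that would produce rewrites like these (values inferred from the formatted output; the repository's actual configuration may differ):

    # Inferred approximation, not the delivered configuration file.
    ColumnLimit: 120                           # long calls rewrap near column 120
    BreakBeforeBraces: Allman                  # braces stay on their own line
    SpaceBeforeParens: ControlStatements       # 'if (' / 'switch (' but 'foo()' for calls
    Cpp11BracedListStyle: true                 # '{AclCpu, AclGpuOcl}' without inner padding
    PointerAlignment: Right                    # 'ITensorInfo *src'
    BinPackParameters: false                   # one parameter per line once a declaration overflows
    AlignConsecutiveDeclarations: Consecutive  # parameter names line up in columns
    AlignConsecutiveAssignments: Consecutive   # '=' aligned across adjacent assignments
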
diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h
index 61fe63a..118d366 100644
--- a/src/cpu/operators/CpuGemmConv2d.h
+++ b/src/cpu/operators/CpuGemmConv2d.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/cpu/ICpuOperator.h"
 
 #include <memory>
@@ -106,17 +107,32 @@
      *                              available, which may introduce a drop in accuracy as well. Default is false
      * @param[in]  num_groups       (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
      */
-    void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
-                   const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false, unsigned int num_groups = 1);
+    void configure(const ITensorInfo         *src,
+                   const ITensorInfo         *weights,
+                   const ITensorInfo         *biases,
+                   ITensorInfo               *dst,
+                   const PadStrideInfo       &conv_info,
+                   const WeightsInfo         &weights_info     = WeightsInfo(),
+                   const Size2D              &dilation         = Size2D(1U, 1U),
+                   const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                   bool                       enable_fast_math = false,
+                   unsigned int               num_groups       = 1);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuGemmConv2d::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                           const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(),
-                           bool enable_fast_math = false, unsigned int num_groups = 1);
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *output,
+                           const PadStrideInfo       &conv_info,
+                           const WeightsInfo         &weights_info     = WeightsInfo(),
+                           const Size2D              &dilation         = Size2D(1U, 1U),
+                           const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                           bool                       enable_fast_math = false,
+                           unsigned int               num_groups       = 1);
 
     /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
      *
@@ -124,10 +140,16 @@
      *
      * @return a status.
      */
-    static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                               const PadStrideInfo &conv_info,
-                               const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(),
-                               const bool enable_fast_math = false);
+    static Status has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+                               const ITensorInfo         *src,
+                               const ITensorInfo         *weights,
+                               const ITensorInfo         *biases,
+                               const ITensorInfo         *output,
+                               const PadStrideInfo       &conv_info,
+                               const WeightsInfo         &weights_info     = WeightsInfo(),
+                               const Size2D              &dilation         = Size2D(1U, 1U),
+                               const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                               const bool                 enable_fast_math = false);
 
     // Inherited methods overridden:
     void                             run(ITensorPack &tensors) override;
@@ -150,8 +172,15 @@
      * @param[in]  fixed_format     (Optional) Select GEMM execution with variable weights.
      * @param[in]  weight_format    (Optional) The layout to be used for the weights tensor when running GEMM with variable weights.
      */
-    void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
-                      bool enable_fast_math = false, int gemm_3d_depth = 1, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
+    void configure_mm(const ITensorInfo         *src,
+                      const ITensorInfo         *weights,
+                      const ITensorInfo         *biases,
+                      ITensorInfo               *output,
+                      const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                      bool                       enable_fast_math = false,
+                      int                        gemm_3d_depth    = 1,
+                      bool                       fixed_format     = false,
+                      arm_compute::WeightFormat  weight_format    = arm_compute::WeightFormat::UNSPECIFIED);
     /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines
      *
      * @param[in] src              Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
@@ -170,8 +199,16 @@
      *
      * @return a status
      */
-    static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
-                              bool enable_fast_math = false, int gemm_3d_depth = 1, bool skip_im2col = false, bool fixed_format = false, arm_compute::WeightFormat weight_format = arm_compute::WeightFormat::UNSPECIFIED);
+    static Status validate_mm(const ITensorInfo         *src,
+                              const ITensorInfo         *weights,
+                              const ITensorInfo         *biases,
+                              const ITensorInfo         *dst,
+                              const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                              bool                       enable_fast_math = false,
+                              int                        gemm_3d_depth    = 1,
+                              bool                       skip_im2col      = false,
+                              bool                       fixed_format     = false,
+                              arm_compute::WeightFormat  weight_format    = arm_compute::WeightFormat::UNSPECIFIED);
      /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref CpuGemmLowpMatrixMultiplyCore
      *
      * @param[in] src           Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
@@ -182,7 +219,11 @@
      *
      * @return a status
      */
-    static Status validate_gemm3d(const ITensorInfo *src, const ITensorInfo *weights, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col);
+    static Status validate_gemm3d(const ITensorInfo         *src,
+                                  const ITensorInfo         *weights,
+                                  const ActivationLayerInfo &act_info,
+                                  int                        gemm_3d_depth,
+                                  bool                       skip_im2col);
 
     struct SkipInfo
     {
@@ -200,8 +241,11 @@
      *
      * @return a SkipInfo instance.
      */
-    static SkipInfo skip_im_col_info(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info,
-                                     const Size2D &dilation, const ActivationLayerInfo &act_info);
+    static SkipInfo skip_im_col_info(const ITensorInfo         *src,
+                                     const ITensorInfo         *weights,
+                                     const PadStrideInfo       &conv_info,
+                                     const Size2D              &dilation,
+                                     const ActivationLayerInfo &act_info);
 
     /** Indicates if the convolution executes in variable weights mode.
      *
@@ -236,7 +280,7 @@
     bool _is_quantized;
     bool _is_prepared;
 
-    experimental::MemoryRequirements _aux_mem{ Count };
+    experimental::MemoryRequirements _aux_mem{Count};
 };
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp
index 5ce285c..8fa81b1 100644
--- a/src/cpu/operators/CpuGemmDirectConv2d.cpp
+++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp
@@ -26,10 +26,10 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/utils/CpuAuxTensorHandler.h"
-
 #include "support/Cast.h"
 
 #include <set>
@@ -43,7 +43,10 @@
 
 namespace
 {
-GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act)
+GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo         *src,
+                                                        const ITensorInfo         *weights,
+                                                        const ITensorInfo         *dst,
+                                                        const ActivationLayerInfo &act)
 {
     // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
     // Extract and negate input and weights offset
@@ -53,16 +56,15 @@
     const UniformQuantizationInfo uoqinfo   = oqinfo.uniform();
     const DataType                data_type = src->data_type();
     // Merge activation with output stage
-    const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
-                                                                               ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                               ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                                                             };
-    PixelValue                                              type_min{};
-    PixelValue                                              type_max{};
+    const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+        ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+        ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
+    PixelValue type_min{};
+    PixelValue type_max{};
     std::tie(type_min, type_max) = get_min_max(data_type);
     int32_t min_activation       = type_min.get<int32_t>();
     int32_t max_activation       = type_max.get<int32_t>();
-    if(supported_acts.count(act.activation()) != 0)
+    if (supported_acts.count(act.activation()) != 0)
     {
         std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo);
     }
@@ -107,31 +109,32 @@
 
 CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default;
 
-void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info)
+void CpuGemmDirectConv2d::configure(const ITensorInfo *src,
+                                    const ITensorInfo *weights,
+                                    const ITensorInfo *biases,
+                                    ITensorInfo       *dst,
+                                    const Conv2dInfo  &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src,
-                                                             weights,
-                                                             biases != nullptr ? biases : nullptr,
-                                                             dst,
-                                                             info));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        CpuGemmDirectConv2d::validate(src, weights, biases != nullptr ? biases : nullptr, dst, info));
     ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, info);
 
     _run_activation = info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info);
     _is_prepared    = false;
 
-    _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{ 3, 0, 1, 2 });
+    _weights_permute_func->configure(weights, &_perm_weights, PermutationVector{3, 0, 1, 2});
 
     // Configure assembly dispatch
     cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
-    if(is_data_type_quantized(src->data_type()))
+    if (is_data_type_quantized(src->data_type()))
     {
         asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info);
     }
     _gemm_asm_func->configure(src, &_perm_weights, biases, dst, asm_info);
 
     // Configure activation
-    if(_run_activation)
+    if (_run_activation)
     {
         _activation_func->configure(dst, nullptr, info.act_info);
     }
@@ -141,24 +144,33 @@
     _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
     _aux_mem[Pretranspose]     = asm_mem_req[Pretranspose];
 
-    if(_aux_mem[Pretranspose].size > 0)
+    if (_aux_mem[Pretranspose].size > 0)
     {
         // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
-        _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size());
+        _aux_mem[PermutedWeights] =
+            MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, weights->total_size());
     }
     else
     {
         // We must permute weights if they are WeightFormat::UNSPECIFIED
-        if(info.weights_info.weight_format() == WeightFormat::UNSPECIFIED)
-            _aux_mem[PermutedWeights] = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size());
+        if (info.weights_info.weight_format() == WeightFormat::UNSPECIFIED)
+            _aux_mem[PermutedWeights] =
+                MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Persistent, weights->total_size());
     }
 }
-Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info)
+Status CpuGemmDirectConv2d::validate(const ITensorInfo *src,
+                                     const ITensorInfo *weights,
+                                     const ITensorInfo *biases,
+                                     const ITensorInfo *dst,
+                                     const Conv2dInfo  &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
-    if(!is_fixed_format(info.weights_info.weight_format()))
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::BFLOAT16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16,
+                                                         DataType::F16, DataType::F32);
+    if (!is_fixed_format(info.weights_info.weight_format()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
     }
@@ -171,13 +183,13 @@
     ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
     // Validate biases
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
-        if(is_data_type_quantized_asymmetric(data_type))
+        if (is_data_type_quantized_asymmetric(data_type))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
-        else if(data_type == DataType::BFLOAT16)
+        else if (data_type == DataType::BFLOAT16)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
         }
@@ -198,31 +210,32 @@
     prepare(tensors);
 
     _gemm_asm_func->run(tensors);
-    if(_run_activation)
+    if (_run_activation)
     {
         ITensor    *io = tensors.get_tensor(ACL_DST);
-        ITensorPack pack{ { ACL_SRC, io }, { ACL_DST, io } };
+        ITensorPack pack{{ACL_SRC, io}, {ACL_DST, io}};
         _activation_func->run(pack);
     }
 }
 
 void CpuGemmDirectConv2d::prepare(ITensorPack &tensors)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         // If we are using fixed-format kernel the weights are already reshaped
-        if(_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel())
+        if (_gemm_asm_func && _gemm_asm_func->isVarWeightsKernel())
         {
             _gemm_asm_func->prepare(tensors);
             _is_prepared = true;
             return;
         }
-        const ITensor *weights     = tensors.get_const_tensor(ACL_SRC_1);
-        ITensor       *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
+        const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
+        ITensor       *weights_aux =
+            utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
         ARM_COMPUTE_ERROR_ON_NULLPTR(weights, weights_aux);
 
         CpuAuxTensorHandler permuted_weights(_perm_weights, *weights_aux);
-        ITensorPack         permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } };
+        ITensorPack         permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}};
         _weights_permute_func->run(permute_tensors);
 
         tensors.add_const_tensor(ACL_SRC_1, permuted_weights.get());
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h
index e55a461..1cc3caa 100644
--- a/src/cpu/operators/CpuGemmDirectConv2d.h
+++ b/src/cpu/operators/CpuGemmDirectConv2d.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
 
 #include "arm_compute/core/TensorInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuOperator.h"
 #include "src/cpu/operators/CpuActivation.h"
@@ -69,18 +70,26 @@
      *                    Data types supported: Same as @p input.
      * @param[in] info    Contains padding and stride information described in @ref PadStrideInfo.
      */
-    void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info);
+    void configure(const ITensorInfo *src,
+                   const ITensorInfo *weights,
+                   const ITensorInfo *biases,
+                   ITensorInfo       *dst,
+                   const Conv2dInfo  &info);
     /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d
      *
      * Similar to CpuGemmDirectConv2d::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info);
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *weights,
+                           const ITensorInfo *biases,
+                           const ITensorInfo *dst,
+                           const Conv2dInfo  &info);
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &constants) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &constants) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
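
The first hunk of the next file reorders and regroups #include directives; clang-format's include-sorting options yield behaviour consistent with these hunks. A hedged sketch under the same caveat (inferred values, with hypothetical category regexes):

    # Inferred approximation, not the delivered configuration file.
    SortIncludes: CaseInsensitive   # 'utils/misc/ShapeCalculator.h' sorts before 'Validate.h'
    IncludeBlocks: Regroup          # merge adjacent blocks, then regroup by category
    IncludeCategories:              # hypothetical priorities matching the groups seen below
      - Regex: '^"arm_compute/'
        Priority: 1
      - Regex: '^"src/'
        Priority: 2
      - Regex: '^"support/'
        Priority: 3
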
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
index 8ca128f..2ee879b 100644
--- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -28,14 +28,14 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/TensorAllocator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/MemoryHelpers.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/kernels/CpuConvertQuantizedSignednessKernel.h"
 #include "src/cpu/kernels/CpuGemmInterleave4x4Kernel.h"
 #include "src/cpu/kernels/CpuGemmLowpMatrixMultiplyKernel.h"
@@ -59,12 +59,12 @@
 cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
 {
     cpu::AsmGemmInfo asm_info;
-    asm_info.method                      = cpu::AsmConvMethod::Im2Col;
-    asm_info.reinterpret_input_as_3d     = info.reinterpret_input_as_3d();
-    asm_info.depth_output_gemm3d         = info.depth_output_gemm3d();
-    asm_info.activation_info             = info.activation_info();
-    asm_info.output_stage                = info.gemmlowp_output_stage();
-    asm_info.fast_mode                   = info.fast_math();
+    asm_info.method                  = cpu::AsmConvMethod::Im2Col;
+    asm_info.reinterpret_input_as_3d = info.reinterpret_input_as_3d();
+    asm_info.depth_output_gemm3d     = info.depth_output_gemm3d();
+    asm_info.activation_info         = info.activation_info();
+    asm_info.output_stage            = info.gemmlowp_output_stage();
+    asm_info.fast_mode               = info.fast_math();
 
     return asm_info;
 }
@@ -105,7 +105,8 @@
 }
 CpuGemmLowpMatrixMultiplyCore::~CpuGemmLowpMatrixMultiplyCore() = default;
 
-void CpuGemmLowpMatrixMultiplyCore::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
+void CpuGemmLowpMatrixMultiplyCore::configure(
+    const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, dst);
     ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpMatrixMultiplyCore::validate(a, b, c, dst, gemm_info));
@@ -122,28 +123,31 @@
     _reshape_b_only_on_first_run      = b->are_values_constant();
     _is_prepared                      = false;
     _fused_assembly_path              = false;
-    _flip_signedness                  = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && _reshape_b_only_on_first_run;
-    _gemm_info                        = gemm_info;
+    _flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) &&
+                       _reshape_b_only_on_first_run;
+    _gemm_info = gemm_info;
 
     _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
 
     const ITensorInfo *a_to_use = a;
 
     // Convert to QASYMM8 -> QASYMM8_SIGNED and back
-    if(_flip_signedness)
+    if (_flip_signedness)
     {
         const int32_t                 offset_correction = 128;
         const DataType                dt                = DataType::QASYMM8_SIGNED;
         const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();
 
-        _signed_a                = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+        _signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
+            QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
         _convert_to_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
         _convert_to_signed_asymm->configure(a_to_use, &_signed_a);
         a_to_use  = &_signed_a;
         _a_offset = _signed_a.quantization_info().uniform().offset;
 
         const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
-        _signed_output                       = dst->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+        _signed_output                       = dst->clone()->set_data_type(dt).set_quantization_info(
+                                  QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
 
         // Output stage correction
         GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
@@ -157,7 +161,7 @@
     }
 
     // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
-    if(info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+    if (info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
     {
         _fuse_output_stage = true;
         _mm_result_s32     = TensorInfo(dst->tensor_shape(), 1, DataType::S32);
@@ -166,16 +170,18 @@
     // Initialize assembly kernel meta-data
     const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
 #ifdef __aarch64__
-    if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
+    if (!(!b->are_values_constant() &&
+          b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
     {
-        switch(a->data_type())
+        switch (a->data_type())
         {
             case DataType::QASYMM8:
             case DataType::QASYMM8_SIGNED:
             case DataType::U8:
             case DataType::S8:
             {
-                if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+                if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
+                    info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
                 {
                     auto c_info_to_use = c == nullptr ? nullptr : c;
                     _asm_glue->configure(a_to_use, b, c_info_to_use, dst, asm_info);
@@ -197,13 +203,14 @@
         }
     }
 #endif /* __aarch64__ */
-    if(!(_assembly_path || _run_vector_matrix_multiplication))
+    if (!(_assembly_path || _run_vector_matrix_multiplication))
     {
         matrix_a = &_tmp_a;
         matrix_b = &_tmp_b;
 
         // The interleaved output matrix will have the following shape: [ a_height * 4, ceil(a_width / 4.0f) ]
-        _tmp_a = TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
+        _tmp_a =
+            TensorInfo(compute_interleaved_shape(*a_to_use), 1, a_to_use->data_type(), a_to_use->quantization_info());
         // The transpose1xW output matrix will have the following shape: [ b_height * 16, ceil(b_width / 16.0f) ]
         _tmp_b = TensorInfo(compute_transpose1xW_shape(*b), 1, b->data_type(), b->quantization_info());
 
@@ -216,13 +223,13 @@
         _mtx_b_reshape_kernel->configure(b, &_tmp_b);
     }
 
-    if(!_fused_assembly_path)
+    if (!_fused_assembly_path)
     {
         // Build reduction info
         const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
 
         // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
-        if(_a_offset != 0)
+        if (_a_offset != 0)
         {
             _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
 
@@ -232,7 +239,7 @@
         }
 
         // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
-        if(_b_offset != 0)
+        if (_b_offset != 0)
         {
             _vector_sum_row = TensorInfo(compute_reductionB_shape(*a_to_use), 1, DataType::S32);
 
@@ -241,24 +248,23 @@
             _mtx_a_reduction_kernel->configure(a_to_use, &_vector_sum_row, reduction_info);
         }
 
-        if(_fuse_output_stage)
+        if (_fuse_output_stage)
         {
             // Configure matrix multiply kernel
-            if(!_assembly_path)
+            if (!_assembly_path)
             {
                 _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
                 _mm_kernel->configure(matrix_a, matrix_b, &_mm_result_s32);
             }
 
-            _offset_contribution_output_stage_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
-            _offset_contribution_output_stage_kernel->configure(&_mm_result_s32,
-                                                                _a_offset == 0 ? nullptr : &_vector_sum_col,
-                                                                _b_offset == 0 ? nullptr : &_vector_sum_row, c,
-                                                                _flip_signedness ? &_signed_output : dst,
-                                                                a->dimension(0),
-                                                                _a_offset, _b_offset, info.gemmlowp_output_stage());
+            _offset_contribution_output_stage_kernel =
+                std::make_unique<kernels::CpuGemmLowpOffsetContributionOutputStageKernel>();
+            _offset_contribution_output_stage_kernel->configure(
+                &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                _b_offset == 0 ? nullptr : &_vector_sum_row, c, _flip_signedness ? &_signed_output : dst,
+                a->dimension(0), _a_offset, _b_offset, info.gemmlowp_output_stage());
 
-            if(_flip_signedness)
+            if (_flip_signedness)
             {
                 _convert_from_signed_asymm = std::make_unique<kernels::CpuConvertQuantizedSignednessKernel>();
                 _convert_from_signed_asymm->configure(&_signed_output, dst);
@@ -267,27 +273,29 @@
         else
         {
             // Configure matrix multiply kernel
-            if(!_assembly_path)
+            if (!_assembly_path)
             {
                 _mm_kernel = std::make_unique<kernels::CpuGemmLowpMatrixMultiplyKernel>();
                 _mm_kernel->configure(matrix_a, matrix_b, dst);
             }
             // Configure offset contribution kernel
             _offset_contribution_kernel = std::make_unique<kernels::CpuGemmLowpOffsetContributionKernel>();
-            _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
+            _offset_contribution_kernel->configure(dst, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                                                   _b_offset == 0 ? nullptr : &_vector_sum_row, a_to_use->dimension(0),
                                                    _a_offset, _b_offset);
         }
     }
     // Configure activation
     const ActivationLayerInfo &activation = gemm_info.activation_info();
-    _run_activation                       = activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
-    if(_run_activation)
+    _run_activation =
+        activation.enabled() && (!_assembly_path || !cpu::CpuGemmAssemblyDispatch::is_activation_supported(activation));
+    if (_run_activation)
     {
         _activation_func = std::make_unique<CpuActivation>();
         _activation_func->configure(dst, nullptr, activation);
     }
 
-    if(_assembly_path)
+    if (_assembly_path)
     {
         auto asm_mem_req           = _asm_glue->workspace();
         _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
@@ -295,27 +303,41 @@
     }
 
     // Request memory for LHS and RHS reshape matrix
-    _aux_mem[VectorSumCol] = MemoryInfo(offset_int_vec(VectorSumCol), !_fused_assembly_path && _a_offset != 0
-                                        && _reshape_b_only_on_first_run ?
-                                        MemoryLifetime::Persistent :
-                                        MemoryLifetime::Temporary,
-                                        _vector_sum_col.total_size());
-    _aux_mem[VectorSumRow] = MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
-    _aux_mem[TmpA]         = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
-    _aux_mem[TmpB]         = MemoryInfo(offset_int_vec(TmpB), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
-    _aux_mem[MMResultS32]  = MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
-    _aux_mem[SignedA]      = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
-    _aux_mem[SignedOutput] = MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
+    _aux_mem[VectorSumCol] =
+        MemoryInfo(offset_int_vec(VectorSumCol),
+                   !_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run ? MemoryLifetime::Persistent
+                                                                                           : MemoryLifetime::Temporary,
+                   _vector_sum_col.total_size());
+    _aux_mem[VectorSumRow] =
+        MemoryInfo(offset_int_vec(VectorSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
+    _aux_mem[TmpA] = MemoryInfo(offset_int_vec(TmpA), MemoryLifetime::Temporary, _tmp_a.total_size());
+    _aux_mem[TmpB] = MemoryInfo(offset_int_vec(TmpB),
+                                _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+                                _tmp_b.total_size());
+    _aux_mem[MMResultS32] =
+        MemoryInfo(offset_int_vec(MMResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
+    _aux_mem[SignedA] = MemoryInfo(offset_int_vec(SignedA), MemoryLifetime::Temporary, _signed_a.total_size());
+    _aux_mem[SignedOutput] =
+        MemoryInfo(offset_int_vec(SignedOutput), MemoryLifetime::Temporary, _signed_output.total_size());
 }
 
-Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status CpuGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+                                               const ITensorInfo *b,
+                                               const ITensorInfo *c,
+                                               const ITensorInfo *output,
+                                               const GEMMInfo    &gemm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE, "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((a)->dimension(0) != (b)->dimension(1),
-                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S32, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(c != nullptr &&
+                                        gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::NONE,
+                                    "Bias addition not supported in NEGEMMLowpMatrixMultiplyCore for output S32");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (a)->dimension(0) != (b)->dimension(1),
+        "The product AB is defined only if the number of columns in A is equal to the number of rows in B");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
 
@@ -333,28 +355,32 @@
     int32_t b_offset = b->quantization_info().uniform().offset;
 
     bool fuse_output_stage = info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE;
-    if(fuse_output_stage)
+    if (fuse_output_stage)
     {
-        auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
+        auto_init_if_empty(mm_result_s32_info,
+                           a->clone()->set_tensor_shape(output->tensor_shape()).set_data_type(DataType::S32));
     }
 
     // Convert QASYMM8->QASYMM8_SIGNED
     TensorInfo signed_a{};
     TensorInfo signed_output{};
-    bool       flip_signedness = is_data_type_quantized_per_channel(b->data_type()) && (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
-    if(flip_signedness)
+    bool       flip_signedness = is_data_type_quantized_per_channel(b->data_type()) &&
+                           (a->data_type() == DataType::QASYMM8) && info.reshape_b_only_on_first_run();
+    if (flip_signedness)
     {
         const int32_t                 offset_correction = 128;
         const DataType                dt                = DataType::QASYMM8_SIGNED;
         const UniformQuantizationInfo iqinfo            = a_to_use->quantization_info().uniform();
 
-        signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
+        signed_a = a_to_use->clone()->set_data_type(dt).set_quantization_info(
+            QuantizationInfo(iqinfo.scale, iqinfo.offset + offset_correction));
         ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuConvertQuantizedSignednessKernel::validate(a_to_use, &signed_a));
         a_to_use = &signed_a;
         a_offset = signed_a.quantization_info().uniform().offset;
 
         const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();
-        signed_output                        = output->clone()->set_data_type(dt).set_quantization_info(QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
+        signed_output                        = output->clone()->set_data_type(dt).set_quantization_info(
+                                   QuantizationInfo(oqinfo.scale, oqinfo.offset - offset_correction));
 
         // Output stage correction
         GEMMLowpOutputStageInfo output_stage_corr = info.gemmlowp_output_stage();
@@ -374,25 +400,28 @@
     bool run_optimised             = false;
     bool run_optimised_requantized = false;
 
-    if(!(!b->are_values_constant() && b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
+    if (!(!b->are_values_constant() &&
+          b->tensor_shape().z() > 1)) // Disable batch matmul as optimized GeMM handles batching differently.
     {
-        if(is_data_type_quantized_asymmetric(a_to_use->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        if (is_data_type_quantized_asymmetric(a_to_use->data_type()) &&
+            info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
         {
             run_optimised             = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, c, output, asm_info));
             run_optimised_requantized = run_optimised;
         }
         else
         {
-            run_optimised = bool(CpuGemmAssemblyDispatch::validate(a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
+            run_optimised = bool(CpuGemmAssemblyDispatch::validate(
+                a_to_use, b, nullptr, fuse_output_stage ? &mm_result_s32_info : output, asm_info));
         }
     }
 
-    if(run_optimised)
+    if (run_optimised)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
-        if(info.depth_output_gemm3d() != 0)
+        if (info.depth_output_gemm3d() != 0)
         {
-            if(info.reinterpret_input_as_3d())
+            if (info.reinterpret_input_as_3d())
             {
                 ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != output->dimension(1));
                 ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(2) != output->dimension(2));
@@ -409,11 +438,13 @@
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(),
+                                        "NEGEMM cannot reinterpret the input tensor as 3D");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0,
+                                        "NEGEMM cannot reinterpret the output tensor as 3D");
 
         const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
-        if(!run_vector_matrix_multiplication)
+        if (!run_vector_matrix_multiplication)
         {
             matrix_a_info = &tmp_a_info;
             matrix_b_info = &tmp_b_info;
@@ -437,7 +468,7 @@
         }
     }
 
-    if(!run_optimised_requantized)
+    if (!run_optimised_requantized)
     {
         TensorInfo info_vector_sum_col{};
         TensorInfo info_vector_sum_row{};
@@ -445,62 +476,70 @@
         const GEMMLowpReductionKernelInfo reduction_info(a_to_use->dimension(0), false, 0, false);
 
         // Validate matrix B reduction kernel only if _a_offset is not equal to 0
-        if(a_offset != 0)
+        if (a_offset != 0)
         {
             info_vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
 
             // Configure Matrix B reduction kernel
-            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                kernels::CpuGemmLowpMatrixBReductionKernel::validate(b, &info_vector_sum_col, reduction_info));
         }
 
         // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
-        if(b_offset != 0)
+        if (b_offset != 0)
         {
             info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
 
             // Configure matrix A reduction kernel
-            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                kernels::CpuGemmLowpMatrixAReductionKernel::validate(a_to_use, &info_vector_sum_row, reduction_info));
         }
 
-        if(fuse_output_stage)
+        if (fuse_output_stage)
         {
-            if(!run_optimised)
+            if (!run_optimised)
             {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    info.reinterpret_input_as_3d(),
+                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    info.depth_output_gemm3d() != 0,
+                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
 
-                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info));
+                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(
+                    matrix_a_info, matrix_b_info, &mm_result_s32_info));
             }
 
             // Validate offset contribution kernel
-            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
-                                                                                                          a_offset == 0 ? nullptr : &info_vector_sum_col,
-                                                                                                          b_offset == 0 ? nullptr : &info_vector_sum_row,
-                                                                                                          c,
-                                                                                                          flip_signedness ? &signed_output : output,
-                                                                                                          a_offset, b_offset,
-                                                                                                          info.gemmlowp_output_stage()));
+            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionOutputStageKernel::validate(
+                &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+                b_offset == 0 ? nullptr : &info_vector_sum_row, c, flip_signedness ? &signed_output : output, a_offset,
+                b_offset, info.gemmlowp_output_stage()));
         }
         else
         {
-            if(!run_optimised)
+            if (!run_optimised)
             {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.reinterpret_input_as_3d(), "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.depth_output_gemm3d() != 0, "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    info.reinterpret_input_as_3d(),
+                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the input tensor as 3D");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    info.depth_output_gemm3d() != 0,
+                    "CpuGemmLowpMatrixMultiplyKernel cannot reinterpret the output tensor as 3D");
 
-                ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
+                ARM_COMPUTE_RETURN_ON_ERROR(
+                    kernels::CpuGemmLowpMatrixMultiplyKernel::validate(matrix_a_info, matrix_b_info, output));
             }
             // Validate offset contribution kernel
-            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(output,
-                                                                                               a_offset == 0 ? nullptr : &info_vector_sum_col,
-                                                                                               b_offset == 0 ? nullptr : &info_vector_sum_row,
-                                                                                               a_offset, b_offset));
+            ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuGemmLowpOffsetContributionKernel::validate(
+                output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row,
+                a_offset, b_offset));
         }
     }
 
     // Validate activation
     const ActivationLayerInfo &activation = gemm_info.activation_info();
-    if(activation.enabled())
+    if (activation.enabled())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CpuActivation::validate(output, nullptr, activation));
     }
@@ -529,24 +568,22 @@
     CpuAuxTensorHandler signed_output(offset_int_vec(SignedOutput), _signed_output, tensors, false);
 
     // Convert QASYMM8->QASYMM8_SIGNED
-    if(_flip_signedness)
+    if (_flip_signedness)
     {
-        ITensorPack pack =
-        {
-            { TensorType::ACL_SRC, a },
-            { TensorType::ACL_DST, signed_a.get() }
-        };
-        NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(), pack);
+        ITensorPack pack = {{TensorType::ACL_SRC, a}, {TensorType::ACL_DST, signed_a.get()}};
+        NEScheduler::get().schedule_op(_convert_to_signed_asymm.get(), Window::DimY, _convert_to_signed_asymm->window(),
+                                       pack);
         a_to_use = signed_a.get();
         matrix_a = signed_a.get();
     }
 
     // Run GEMM
-    if(_asm_glue->is_configured())
+    if (_asm_glue->is_configured())
     {
         ITensorPack asm_glue_tensors = tensors;
         auto        output_to_use    = (_fuse_output_stage ? mm_result_s32.get() : dst);
-        if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        if (is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) &&
+            _gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
         {
             asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
             asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
@@ -563,35 +600,25 @@
     }
     else
     {
-        if(!_run_vector_matrix_multiplication)
+        if (!_run_vector_matrix_multiplication)
         {
             matrix_a = tmp_a.get();
             matrix_b = tmp_b.get();
             // Run interleave kernel
-            ITensorPack pack_a =
-            {
-                { TensorType::ACL_SRC, a_to_use },
-                { TensorType::ACL_DST, tmp_a.get() }
-            };
-            NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(), pack_a);
+            ITensorPack pack_a = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, tmp_a.get()}};
+            NEScheduler::get().schedule_op(_mtx_a_reshape_kernel.get(), Window::DimY, _mtx_a_reshape_kernel->window(),
+                                           pack_a);
 
-            if(!_reshape_b_only_on_first_run)
+            if (!_reshape_b_only_on_first_run)
             {
-                ITensorPack pack_b =
-                {
-                    { TensorType::ACL_SRC, b },
-                    { TensorType::ACL_DST, tmp_b.get() }
-                };
+                ITensorPack pack_b = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, tmp_b.get()}};
                 // Run transpose kernel
-                NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack_b);
+                NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY,
+                                               _mtx_b_reshape_kernel->window(), pack_b);
             }
         }
-        ITensorPack pack_mm =
-        {
-            { TensorType::ACL_SRC_0, matrix_a },
-            { TensorType::ACL_SRC_1, matrix_b }
-        };
-        if(_fuse_output_stage)
+        ITensorPack pack_mm = {{TensorType::ACL_SRC_0, matrix_a}, {TensorType::ACL_SRC_1, matrix_b}};
+        if (_fuse_output_stage)
         {
             pack_mm.add_tensor(TensorType::ACL_DST, mm_result_s32.get());
         }
@@ -602,31 +629,25 @@
         NEScheduler::get().schedule_op(_mm_kernel.get(), Window::DimY, _mm_kernel->window(), pack_mm);
     }
 
-    if(!_fused_assembly_path)
+    if (!_fused_assembly_path)
     {
         // Run matrix A reduction kernel only if _b_offset is not equal to 0
-        if(_b_offset != 0)
+        if (_b_offset != 0)
         {
-            ITensorPack pack =
-            {
-                { TensorType::ACL_SRC, a_to_use },
-                { TensorType::ACL_DST, vector_sum_row.get() }
-            };
-            NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX, _mtx_a_reduction_kernel->window(), pack);
+            ITensorPack pack = {{TensorType::ACL_SRC, a_to_use}, {TensorType::ACL_DST, vector_sum_row.get()}};
+            NEScheduler::get().schedule_op(_mtx_a_reduction_kernel.get(), Window::DimX,
+                                           _mtx_a_reduction_kernel->window(), pack);
         }
 
         // Run matrix B reduction kernel only if _a_offset is not equal to 0
-        if(_a_offset != 0 && !_reshape_b_only_on_first_run)
+        if (_a_offset != 0 && !_reshape_b_only_on_first_run)
         {
-            ITensorPack pack =
-            {
-                { TensorType::ACL_SRC, b },
-                { TensorType::ACL_DST, vector_sum_col.get() }
-            };
-            NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
+            ITensorPack pack = {{TensorType::ACL_SRC, b}, {TensorType::ACL_DST, vector_sum_col.get()}};
+            NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
+                                           _mtx_b_reduction_kernel->window(), pack);
         }
 
-        if(_fuse_output_stage)
+        if (_fuse_output_stage)
         {
             ITensorPack pack;
             pack.add_tensor(TensorType::ACL_SRC_0, mm_result_s32.get());
@@ -636,7 +657,8 @@
             pack.add_tensor(TensorType::ACL_DST, _flip_signedness ? signed_output.get() : dst);
 
             // Run offset contribution kernel
-            NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY, _offset_contribution_output_stage_kernel->window(), pack);
+            NEScheduler::get().schedule_op(_offset_contribution_output_stage_kernel.get(), Window::DimY,
+                                           _offset_contribution_output_stage_kernel->window(), pack);
         }
         else
         {
@@ -646,68 +668,57 @@
             pack.add_tensor(TensorType::ACL_DST, dst);
 
             // Run offset contribution kernel
-            NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY, _offset_contribution_kernel->window(), pack);
+            NEScheduler::get().schedule_op(_offset_contribution_kernel.get(), Window::DimY,
+                                           _offset_contribution_kernel->window(), pack);
         }
     }
 
     // Convert QASYMM8_SIGNED->QASYMM8
-    if(!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
+    if (!_fused_assembly_path && _fuse_output_stage && _flip_signedness)
     {
-        ITensorPack pack =
-        {
-            { TensorType::ACL_SRC, signed_output.get() },
-            { TensorType::ACL_DST, dst }
-        };
-        NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY, _convert_from_signed_asymm->window(), pack);
+        ITensorPack pack = {{TensorType::ACL_SRC, signed_output.get()}, {TensorType::ACL_DST, dst}};
+        NEScheduler::get().schedule_op(_convert_from_signed_asymm.get(), Window::DimY,
+                                       _convert_from_signed_asymm->window(), pack);
     }
 
     // Run fused activation unless already run in the fused assembly
-    if(_run_activation)
+    if (_run_activation)
     {
-        ITensorPack pack =
-        {
-            { TensorType::ACL_SRC, dst },
-            { TensorType::ACL_DST, dst }
-        };
+        ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}};
         _activation_func->run(pack);
     }
 }
 
 void CpuGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         auto original_b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
         // Run assembly reshape
-        if(_asm_glue->is_configured())
+        if (_asm_glue->is_configured())
         {
             _asm_glue->prepare(tensors);
         }
         // Run non-assembly reshape
-        else if(_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
+        else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication && !_asm_glue->is_configured())
         {
             // Run reshape kernel and mark original weights tensor as unused
-            ITensor            *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
+            ITensor *tmp_b_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(TmpB)));
             CpuAuxTensorHandler tmp_b(_tmp_b, *tmp_b_p);
-            ITensorPack         pack =
-            {
-                { TensorType::ACL_SRC, original_b },
-                { TensorType::ACL_DST, tmp_b.get() }
-            };
-            NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(), pack);
+            ITensorPack         pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, tmp_b.get()}};
+            NEScheduler::get().schedule_op(_mtx_b_reshape_kernel.get(), Window::DimY, _mtx_b_reshape_kernel->window(),
+                                           pack);
         }
 
         // Run matrix B reduction kernel only if _a_offset is not equal to 0
-        if(!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
+        if (!_fused_assembly_path && _a_offset != 0 && _reshape_b_only_on_first_run)
         {
-            ITensor            *vector_sum_col_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
+            ITensor *vector_sum_col_p =
+                utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(VectorSumCol)));
             CpuAuxTensorHandler vector_sum_col(_vector_sum_col, *vector_sum_col_p);
-            ITensorPack         pack =
-            {
-                { TensorType::ACL_SRC, original_b },
-                { TensorType::ACL_DST, vector_sum_col.get() }
-            };
-            NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX, _mtx_b_reduction_kernel->window(), pack);
+            ITensorPack         pack = {{TensorType::ACL_SRC, original_b}, {TensorType::ACL_DST, vector_sum_col.get()}};
+            NEScheduler::get().schedule_op(_mtx_b_reduction_kernel.get(), Window::DimX,
+                                           _mtx_b_reduction_kernel->window(), pack);
         }
         _is_prepared = true;
     }
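
Aside: the prepare()/run() split reformatted above is easier to see in isolation. A minimal, illustrative sketch of the operator lifecycle (headers/namespaces assumed; `a`, `b` and `dst` are assumed pre-allocated tensors, and the auxiliary tensors reported by workspace() are omitted for brevity):

    using namespace arm_compute;

    cpu::CpuGemmLowpMatrixMultiplyCore gemm;
    // Shape/type checks first, then stateless configuration on ITensorInfo only
    // (signatures as in the header below).
    ARM_COMPUTE_ERROR_THROW_ON(
        cpu::CpuGemmLowpMatrixMultiplyCore::validate(a.info(), b.info(), nullptr, dst.info()));
    gemm.configure(a.info(), b.info(), nullptr, dst.info());

    // Tensors are bound late through an ITensorPack.
    ITensorPack pack = {{TensorType::ACL_SRC_0, &a}, {TensorType::ACL_SRC_1, &b}, {TensorType::ACL_DST, &dst}};
    gemm.prepare(pack); // reshapes/reduces B once when reshape-B-only-on-first-run applies
    gemm.run(pack);
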
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
index a1b3429..a779893 100644
--- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/function_info/GEMMInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuOperator.h"
 
@@ -108,18 +109,26 @@
      * @param[in]  gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
      *                       if the reshape of matrix B should be executed only for the first run
      */
-    void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
+    void configure(const ITensorInfo *a,
+                   const ITensorInfo *b,
+                   const ITensorInfo *c,
+                   ITensorInfo       *dst,
+                   const GEMMInfo    &gemm_info = GEMMInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuGemmLowpMatrixMultiplyCore::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *dst, const GEMMInfo &gemm_info = GEMMInfo());
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *dst,
+                           const GEMMInfo    &gemm_info = GEMMInfo());
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.cpp b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
index 58f98ac..4215eed 100644
--- a/src/cpu/operators/CpuGemmLowpOutputStage.cpp
+++ b/src/cpu/operators/CpuGemmLowpOutputStage.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ScaleKernel.h"
 #include "src/cpu/kernels/CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel.h"
@@ -36,36 +37,42 @@
 {
 namespace cpu
 {
-void CpuGemmLowpOutputStage::configure(ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+void CpuGemmLowpOutputStage::configure(ITensorInfo                   *src,
+                                       ITensorInfo                   *bias,
+                                       ITensorInfo                   *dst,
+                                       const GEMMLowpOutputStageInfo &info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(CpuGemmLowpOutputStage::validate(src, bias, dst, info));
     ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info);
 
-    switch(info.type)
+    switch (info.type)
     {
         case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
         {
-            switch(info.output_data_type)
+            switch (info.output_data_type)
             {
                 case DataType::QASYMM8:
                 {
                     auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel>();
-                    k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+                    k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset,
+                                 info.gemmlowp_min_bound, info.gemmlowp_max_bound);
                     _kernel = std::move(k);
                     break;
                 }
                 case DataType::QASYMM8_SIGNED:
                 {
                     auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel>();
-                    k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+                    k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_offset,
+                                 info.gemmlowp_min_bound, info.gemmlowp_max_bound);
                     _kernel = std::move(k);
                     break;
                 }
                 case DataType::QSYMM16:
                 {
                     auto k = std::make_unique<kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel>();
-                    k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+                    k->configure(src, bias, dst, info.gemmlowp_multiplier, info.gemmlowp_shift, info.gemmlowp_min_bound,
+                                 info.gemmlowp_max_bound);
                     _kernel = std::move(k);
                     break;
                 }
@@ -79,7 +86,7 @@
         }
         case GEMMLowpOutputStageType::QUANTIZE_DOWN:
         {
-            switch(info.output_data_type)
+            switch (info.output_data_type)
             {
                 case DataType::QASYMM8:
                 case DataType::QASYMM8_SIGNED:
@@ -102,32 +109,41 @@
     }
 }
 
-Status CpuGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+Status CpuGemmLowpOutputStage::validate(const ITensorInfo             *src,
+                                        const ITensorInfo             *bias,
+                                        const ITensorInfo             *dst,
+                                        const GEMMLowpOutputStageInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN, "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type.");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
-    ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) && (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::UNKNOWN,
+                                    "CpuGemmLowpOutputStage cannot be used with UNKNOWN output data type.");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM16);
+    ARM_COMPUTE_RETURN_ERROR_ON((info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN) &&
+                                (info.type != GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT));
 
-    switch(info.type)
+    switch (info.type)
     {
         case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
         {
-            switch(dst->data_type())
+            switch (dst->data_type())
             {
                 case DataType::QASYMM8:
-                    return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+                    return kernels::CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::validate(
+                        src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
                 case DataType::QASYMM8_SIGNED:
-                    return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+                    return kernels::CpuGemmLowpQuantizeDownInt32ToInt8ScaleByFixedPointKernel::validate(
+                        src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
                 case DataType::QSYMM16:
-                    return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
+                    return kernels::CpuGemmLowpQuantizeDownInt32ToInt16ScaleByFixedPointKernel::validate(
+                        src, bias, dst, info.gemmlowp_min_bound, info.gemmlowp_max_bound);
                 default:
                     return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported output data type.");
             }
         }
         case GEMMLowpOutputStageType::QUANTIZE_DOWN:
         {
-            switch(dst->data_type())
+            switch (dst->data_type())
             {
                 case DataType::QASYMM8:
                 case DataType::QASYMM8_SIGNED:
@@ -146,4 +162,4 @@
     NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
 }
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
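
Aside: the switch above keys kernel selection off GEMMLowpOutputStageInfo. An illustrative way to populate it for a QASYMM8 fixed-point requantization; `output_multiplier`/`output_shift` and `dst_qinfo` are assumed inputs (e.g. from quantization::calculate_quantized_multiplier(), as used in CpuMatMul below), not values taken from this diff:

    GEMMLowpOutputStageInfo info{};
    info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
    info.output_data_type    = DataType::QASYMM8;
    info.gemmlowp_multiplier = output_multiplier;
    info.gemmlowp_shift      = output_shift;
    info.gemmlowp_offset     = dst_qinfo.uniform().offset; // assumed destination quantization offset
    info.gemmlowp_min_bound  = 0;   // full QASYMM8 range when no fused activation clamps it
    info.gemmlowp_max_bound  = 255;

    // Routes to CpuGemmLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel above.
    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemmLowpOutputStage::validate(src.info(), bias.info(), dst.info(), info));
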
diff --git a/src/cpu/operators/CpuGemmLowpOutputStage.h b/src/cpu/operators/CpuGemmLowpOutputStage.h
index 39394f6..e5e2f41 100644
--- a/src/cpu/operators/CpuGemmLowpOutputStage.h
+++ b/src/cpu/operators/CpuGemmLowpOutputStage.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_GEMMLOWP_OUTPUT_STAGE_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/cpu/ICpuOperator.h"
 
 /** This file contains all available output stages for GEMMLowp.
@@ -76,7 +77,10 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+    static Status validate(const ITensorInfo             *src,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *dst,
+                           const GEMMLowpOutputStageInfo &info);
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
diff --git a/src/cpu/operators/CpuMatMul.cpp b/src/cpu/operators/CpuMatMul.cpp
index 8811a7e..8908712 100644
--- a/src/cpu/operators/CpuMatMul.cpp
+++ b/src/cpu/operators/CpuMatMul.cpp
@@ -23,14 +23,16 @@
  */
 
 #include "src/cpu/operators/CpuMatMul.h"
-#include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
+
 #include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/function_info/MatMulInfo.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NEMatMul.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -46,8 +48,11 @@
 {
 namespace
 {
-Status get_gemmlowp_output_stage_info(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act,
-                                      GEMMLowpOutputStageInfo &gemmlowp_output_stage_info)
+Status get_gemmlowp_output_stage_info(const ITensorInfo         *src,
+                                      const ITensorInfo         *weights,
+                                      const ITensorInfo         *dst,
+                                      const ActivationLayerInfo &act,
+                                      GEMMLowpOutputStageInfo   &gemmlowp_output_stage_info)
 {
     const auto                    data_type = src->data_type();
     const QuantizationInfo        oq_info   = dst->quantization_info();
@@ -59,10 +64,11 @@
     int32_t output_multiplier;
     int32_t output_shift;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
 
-    int32_t type_min = 0;
-    int32_t type_max = 0;
+    int32_t type_min             = 0;
+    int32_t type_max             = 0;
     std::tie(type_min, type_max) = quantization::get_quantized_asymmetric_output_min_max(oq_info, act, data_type);
 
     gemmlowp_output_stage_info.gemmlowp_multiplier = output_multiplier;
@@ -77,14 +83,27 @@
 } // namespace
 
 CpuMatMul::CpuMatMul()
-    : _transpose_kernel_lhs(), _transpose_kernel_rhs(), _asm_glue(), _lhs_transposed(), _rhs_transposed(), _original_lhs_shape(), _original_rhs_shape(), _original_dst_shape()
+    : _transpose_kernel_lhs(),
+      _transpose_kernel_rhs(),
+      _asm_glue(),
+      _lhs_transposed(),
+      _rhs_transposed(),
+      _original_lhs_shape(),
+      _original_rhs_shape(),
+      _original_dst_shape()
 {
 }
 
-Status CpuMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+Status CpuMatMul::validate(const ITensorInfo         *lhs,
+                           const ITensorInfo         *rhs,
+                           const ITensorInfo         *dst,
+                           const MatMulInfo          &info,
+                           const CpuMatMulSettings   &settings,
+                           const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->are_values_constant(), "LHS Tensor must be dynamic.");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs->are_values_constant(), "RHS Tensor must be dynamic.");
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(lhs);
@@ -103,34 +122,39 @@
     gemm_info.fast_mode       = settings.fast_math();
 
     // Validate and then permute a/b
-    if(adj_lhs)
+    if (adj_lhs)
     {
-        auto_init_if_empty(lhs_transposed, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs)));
+        auto_init_if_empty(lhs_transposed,
+                           lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*lhs)));
         ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(lhs_to_use, &lhs_transposed));
         // Assign lhs_to_use pointer to use transposed TensorInfo
         lhs_to_use = &lhs_transposed;
     }
-    if(adj_rhs)
+    if (adj_rhs)
     {
-        auto_init_if_empty(rhs_transposed, rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs)));
+        auto_init_if_empty(rhs_transposed,
+                           rhs->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*rhs)));
         ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuTransposeKernel::validate(rhs_to_use, &rhs_transposed));
         // Assign rhs_to_use pointer to use transposed TensorInfo
         rhs_to_use = &rhs_transposed;
     }
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(0) != rhs_to_use->dimension(1),
-                                    "The product AB is defined only if the number of columns in A is equal to the number of rows in B (after transpose)");
+                                    "The product AB is defined only if the number of columns in A is equal to the "
+                                    "number of rows in B (after transpose)");
 
     // Iterate over dimensions to be collapsed in operator - check dimensions are equivalent between tensors
-    for(unsigned int i = 2; i < Coordinates::num_max_dimensions; i++)
+    for (unsigned int i = 2; i < Coordinates::num_max_dimensions; i++)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i), "Broadcasting in Batch dimension is unsupported by this operator.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_to_use->dimension(i) != rhs_to_use->dimension(i),
+                                        "Broadcasting in Batch dimension is unsupported by this operator.");
     }
 
     // Quantized-specific configuration
-    if(is_data_type_quantized(lhs->data_type()))
+    if (is_data_type_quantized(lhs->data_type()))
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst, gemm_info.activation_info, gemm_info.output_stage));
+        ARM_COMPUTE_RETURN_ON_ERROR(get_gemmlowp_output_stage_info(lhs_to_use, rhs_to_use, dst,
+                                                                   gemm_info.activation_info, gemm_info.output_stage));
     }
 
     cpu::CpuGemmAssemblyDispatch::validate(lhs_to_use, rhs_to_use, nullptr, dst, gemm_info);
@@ -138,7 +162,12 @@
     return Status{};
 }
 
-void CpuMatMul::configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+void CpuMatMul::configure(ITensorInfo               *lhs,
+                          ITensorInfo               *rhs,
+                          ITensorInfo               *dst,
+                          const MatMulInfo          &info,
+                          const CpuMatMulSettings   &settings,
+                          const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
     ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, info, settings);
@@ -163,21 +192,23 @@
     _original_rhs_shape = rhs_to_use.tensor_shape();
 
     // Reshape lhs for use with assembly kernels.
-    lhs_to_use.set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z()));
-    dst_to_use.set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z()));
+    lhs_to_use.set_tensor_shape(
+        TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z()));
+    dst_to_use.set_tensor_shape(
+        TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z()));
     rhs_to_use.set_tensor_shape(_original_rhs_shape.collapsed_from(2));
 
     // 2.  Configuration for transpose of lhs/rhs
     // ------------------------------------------------------
     // Initialise transposed TensorInfo class for aux tensors (intermediate tensors)
-    if(_adj_lhs)
+    if (_adj_lhs)
     {
         // Setup transpose LHS
         _transpose_kernel_lhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
         _transpose_kernel_lhs->configure(&lhs_to_use, &_lhs_transposed);
     }
 
-    if(_adj_rhs)
+    if (_adj_rhs)
     {
         // Setup transpose RHS
         _transpose_kernel_rhs = std::make_unique<cpu::kernels::CpuTransposeKernel>();
@@ -196,20 +227,22 @@
     rhs_to_use = (_adj_rhs) ? _rhs_transposed : rhs_to_use;
 
     // Quantized-specific configuration
-    if(is_data_type_quantized(lhs->data_type()))
+    if (is_data_type_quantized(lhs->data_type()))
     {
-        get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info, _gemm_info.output_stage);
+        get_gemmlowp_output_stage_info(&lhs_to_use, &rhs_to_use, &dst_to_use, _gemm_info.activation_info,
+                                       _gemm_info.output_stage);
     }
 
     // Configure Asm Kernel
     _asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
-    _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use, _gemm_info); // c is nullptr as bias not supported in MatMul
+    _asm_glue->configure(&lhs_to_use, &rhs_to_use, nullptr, &dst_to_use,
+                         _gemm_info); // c is nullptr as bias not supported in MatMul
 
     // Specify memory requirements for intermediate tensors
     auto asm_mem_req = _asm_glue->workspace();
     // Specify memory required by gemm kernel
     int idx = 0;
-    for(const auto &aux : asm_mem_req)
+    for (const auto &aux : asm_mem_req)
     {
         _aux_mem[idx] = aux;
         idx++;
@@ -228,8 +261,12 @@
 
     // Reshape LHS and DST to ensure compatibility with GEMM asm kernel (Batch dimension is 4th for lhs and dst within asm)
     // Collapse RHS (necessary to support dimensions larger than 3 in gemm assembly)
-    lhs->info()->set_tensor_shape(TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1, _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
-    dst->info()->set_tensor_shape(TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1, _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
+    lhs->info()->set_tensor_shape(
+        TensorShape(_original_lhs_shape.x(), _original_lhs_shape.y(), 1,
+                    _original_lhs_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
+    dst->info()->set_tensor_shape(
+        TensorShape(_original_dst_shape.x(), _original_dst_shape.y(), 1,
+                    _original_dst_shape.collapsed_from(2).z())); // Collapsed 3+ dimensions into z
     rhs->info()->set_tensor_shape(_original_rhs_shape.collapsed_from(2));
 
     // Initialise object to handle stored transposed tensors in auxiliary memory
@@ -240,17 +277,19 @@
     ITensorPack asm_tensors(tensors);
 
     // Run transpose lhs if necessary
-    if(_adj_lhs)
+    if (_adj_lhs)
     {
-        ITensorPack lhs_transpose_pack = { { TensorType::ACL_SRC, lhs }, { TensorType::ACL_DST, lhs_transposed.get() } };
-        NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(), lhs_transpose_pack);
+        ITensorPack lhs_transpose_pack = {{TensorType::ACL_SRC, lhs}, {TensorType::ACL_DST, lhs_transposed.get()}};
+        NEScheduler::get().schedule_op(_transpose_kernel_lhs.get(), Window::DimY, _transpose_kernel_lhs->window(),
+                                       lhs_transpose_pack);
         asm_tensors.add_const_tensor(TensorType::ACL_SRC_0, lhs_transposed.get());
     }
     // Run transpose rhs if necessary
-    if(_adj_rhs)
+    if (_adj_rhs)
     {
-        ITensorPack rhs_transpose_pack = { { TensorType::ACL_SRC, rhs }, { TensorType::ACL_DST, rhs_transposed.get() } };
-        NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(), rhs_transpose_pack);
+        ITensorPack rhs_transpose_pack = {{TensorType::ACL_SRC, rhs}, {TensorType::ACL_DST, rhs_transposed.get()}};
+        NEScheduler::get().schedule_op(_transpose_kernel_rhs.get(), Window::DimY, _transpose_kernel_rhs->window(),
+                                       rhs_transpose_pack);
         asm_tensors.add_const_tensor(TensorType::ACL_SRC_1, rhs_transposed.get());
     }
     // Run asm kernel
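
Aside: a sketch of how the adj_lhs/adj_rhs transpose paths above get exercised. Illustrative only: `lhs`, `rhs`, `dst` are assumed dynamic, pre-allocated tensors, and the builder-style setters on MatMulInfo/CpuMatMulSettings are assumptions not shown in this diff:

    MatMulInfo        mm_info  = MatMulInfo().adj_rhs(true);          // operator transposes RHS internally
    CpuMatMulSettings settings = CpuMatMulSettings().fast_math(false);

    cpu::CpuMatMul matmul;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuMatMul::validate(lhs.info(), rhs.info(), dst.info(), mm_info, settings));
    matmul.configure(lhs.info(), rhs.info(), dst.info(), mm_info, settings);

    ITensorPack pack = {{TensorType::ACL_SRC_0, &lhs}, {TensorType::ACL_SRC_1, &rhs}, {TensorType::ACL_DST, &dst}};
    matmul.run(pack); // schedules CpuTransposeKernel for RHS, then the assembly GEMM
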
diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h
index 475c019..24db3da 100644
--- a/src/cpu/operators/CpuMatMul.h
+++ b/src/cpu/operators/CpuMatMul.h
@@ -25,6 +25,7 @@
 #define ACL_SRC_CPU_OPERATORS_CPUMATMUL
 
 #include "arm_compute/core/TensorInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuOperator.h"
 #include "src/cpu/kernels/CpuTransposeKernel.h"
@@ -66,18 +67,27 @@
      * @param[in]  settings The settings for the matmul operation (i.e. fast math)
      * @param[in]  act_info Class containing information about fused activation function.
      */
-    void configure(ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(ITensorInfo               *lhs,
+                   ITensorInfo               *rhs,
+                   ITensorInfo               *dst,
+                   const MatMulInfo          &info,
+                   const CpuMatMulSettings   &settings,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuMatMul::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &info, const CpuMatMulSettings &settings,
+    static Status validate(const ITensorInfo         *lhs,
+                           const ITensorInfo         *rhs,
+                           const ITensorInfo         *dst,
+                           const MatMulInfo          &info,
+                           const CpuMatMulSettings   &settings,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
@@ -91,9 +101,9 @@
     };
 
     // Define unique pointers to kernels/operators used by matmul
-    std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{ nullptr };
-    std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{ nullptr };
-    std::unique_ptr<CpuGemmAssemblyDispatch>     _asm_glue{ nullptr };
+    std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_lhs{nullptr};
+    std::unique_ptr<kernels::CpuTransposeKernel> _transpose_kernel_rhs{nullptr};
+    std::unique_ptr<CpuGemmAssemblyDispatch>     _asm_glue{nullptr};
 
     // TensorInfo for tensors stored in auxiliary memory
     TensorInfo _lhs_transposed{};
@@ -105,13 +115,13 @@
     TensorShape _original_dst_shape{};
 
     // Note: adj_lhs means the same as transposing lhs
-    bool                             _adj_lhs{ false };
-    bool                             _adj_rhs{ false };
-    bool                             _fast_math{ false };
+    bool                             _adj_lhs{false};
+    bool                             _adj_rhs{false};
+    bool                             _fast_math{false};
     AsmGemmInfo                      _gemm_info{};
-    experimental::MemoryRequirements _aux_mem{ Count };
+    experimental::MemoryRequirements _aux_mem{Count};
 };
-}
-}
+} // namespace cpu
+} // namespace arm_compute
 
 #endif /* ACL_SRC_CPU_OPERATORS_CPUMATMUL */
diff --git a/src/cpu/operators/CpuMaxUnpooling.cpp b/src/cpu/operators/CpuMaxUnpooling.cpp
index 24e9fd6..697fc40 100644
--- a/src/cpu/operators/CpuMaxUnpooling.cpp
+++ b/src/cpu/operators/CpuMaxUnpooling.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "src/cpu/operators/CpuMaxUnpooling.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
 
@@ -29,7 +30,10 @@
 {
 namespace cpu
 {
-void CpuMaxUnpooling::configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+void CpuMaxUnpooling::configure(const ITensorInfo      *src,
+                                const ITensorInfo      *indices,
+                                ITensorInfo            *dst,
+                                const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_LOG_PARAMS(src, indices, dst, pool_info);
     auto k = std::make_unique<kernels::CpuMaxUnpoolingLayerKernel>();
@@ -37,9 +41,12 @@
     _kernel = std::move(k);
 }
 
-Status CpuMaxUnpooling::validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info)
+Status CpuMaxUnpooling::validate(const ITensorInfo      *src,
+                                 const ITensorInfo      *indices,
+                                 const ITensorInfo      *dst,
+                                 const PoolingLayerInfo &pool_info)
 {
     return kernels::CpuMaxUnpoolingLayerKernel::validate(src, indices, dst, pool_info);
 }
-} // namesapce cpu
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuMaxUnpooling.h b/src/cpu/operators/CpuMaxUnpooling.h
index aa1f107..5dc00bc 100644
--- a/src/cpu/operators/CpuMaxUnpooling.h
+++ b/src/cpu/operators/CpuMaxUnpooling.h
@@ -44,14 +44,18 @@
      * @param[out] dst       Destination tensor. Data types supported: Same as @p src
      * @param[in]  pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      */
-    void configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+    void
+    configure(const ITensorInfo *src, const ITensorInfo *indices, ITensorInfo *dst, const PoolingLayerInfo &pool_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref CpuMaxUnpooling::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *indices, const ITensorInfo *dst, const PoolingLayerInfo &pool_info);
+    static Status validate(const ITensorInfo      *src,
+                           const ITensorInfo      *indices,
+                           const ITensorInfo      *dst,
+                           const PoolingLayerInfo &pool_info);
 };
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuMul.cpp b/src/cpu/operators/CpuMul.cpp
index 4c15015..ac98471 100644
--- a/src/cpu/operators/CpuMul.cpp
+++ b/src/cpu/operators/CpuMul.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/kernels/CpuMulKernel.h"
 
@@ -33,14 +34,24 @@
 {
 namespace cpu
 {
-Status CpuMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+Status CpuMul::validate(const ITensorInfo         *src1,
+                        const ITensorInfo         *src2,
+                        const ITensorInfo         *dst,
+                        float                      scale,
+                        ConvertPolicy              overflow_policy,
+                        RoundingPolicy             rounding_policy,
                         const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return kernels::CpuMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy);
 }
 
-void CpuMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+void CpuMul::configure(ITensorInfo               *src1,
+                       ITensorInfo               *src2,
+                       ITensorInfo               *dst,
+                       float                      scale,
+                       ConvertPolicy              overflow_policy,
+                       RoundingPolicy             rounding_policy,
                        const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
@@ -58,13 +69,19 @@
     NEScheduler::get().schedule_op(_kernel.get(), split_dimension, _kernel->window(), tensors);
 }
 
-Status CpuComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status CpuComplexMul::validate(const ITensorInfo         *src1,
+                               const ITensorInfo         *src2,
+                               const ITensorInfo         *dst,
+                               const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return kernels::CpuComplexMulKernel::validate(src1, src2, dst);
 }
 
-void CpuComplexMul::configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void CpuComplexMul::configure(ITensorInfo               *src1,
+                              ITensorInfo               *src2,
+                              ITensorInfo               *dst,
+                              const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
@@ -80,4 +97,4 @@
     NEScheduler::get().schedule_op(_kernel.get(), Window::DimY, _kernel->window(), tensors);
 }
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
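
Aside: the full parameter list reformatted above, in use. Illustrative only: `src1`, `src2`, `dst` are assumed allocated, and act_info is left defaulted since validate() rejects a fused activation:

    cpu::CpuMul mul;
    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuMul::validate(src1.info(), src2.info(), dst.info(), 1.f /* scale */,
                                                     ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
    mul.configure(src1.info(), src2.info(), dst.info(), 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);

    ITensorPack pack = {{TensorType::ACL_SRC_0, &src1}, {TensorType::ACL_SRC_1, &src2}, {TensorType::ACL_DST, &dst}};
    mul.run(pack); // split dimension is chosen internally (see run() above)
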
diff --git a/src/cpu/operators/CpuMul.h b/src/cpu/operators/CpuMul.h
index 3e0edbf..82b3098 100644
--- a/src/cpu/operators/CpuMul.h
+++ b/src/cpu/operators/CpuMul.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/cpu/ICpuOperator.h"
 
 namespace arm_compute
@@ -61,7 +62,12 @@
      * @param[in]      rounding_policy Rounding policy.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+    void configure(ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   float                      scale,
+                   ConvertPolicy              overflow_policy,
+                   RoundingPolicy             rounding_policy,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
@@ -69,7 +75,12 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           float                      scale,
+                           ConvertPolicy              overflow_policy,
+                           RoundingPolicy             rounding_policy,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
@@ -89,14 +100,20 @@
      * @param[out]     dst      The dst tensor. Data types supported: same as @p src1. Number of channels: same as @p src1.
      * @param[in]      act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref CpuComplexMul::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
diff --git a/src/cpu/operators/CpuPermute.cpp b/src/cpu/operators/CpuPermute.cpp
index babaf21..25acc92 100644
--- a/src/cpu/operators/CpuPermute.cpp
+++ b/src/cpu/operators/CpuPermute.cpp
@@ -23,9 +23,8 @@
  */
 #include "src/cpu/operators/CpuPermute.h"
 
-#include "src/cpu/kernels/CpuPermuteKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuPermuteKernel.h"
 
 namespace arm_compute
 {
@@ -43,5 +42,5 @@
 {
     return kernels::CpuPermuteKernel::validate(src, dst, perm);
 }
-} // namesapce cpu
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuPool2d.cpp b/src/cpu/operators/CpuPool2d.cpp
index 722cd36..b72bde6 100644
--- a/src/cpu/operators/CpuPool2d.cpp
+++ b/src/cpu/operators/CpuPool2d.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/kernels/CpuPool2dKernel.h"
 #include "src/cpu/kernels/internal/CpuPool2dAssemblyWrapperKernel.h"
@@ -53,7 +54,8 @@
     ARM_COMPUTE_LOG_PARAMS(src, dst, pool_info, indices);
 
     // Check if we can run assembly kernels. Currently, indices are not supported by those kernels
-    const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+    const bool run_optimised =
+        bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
 
     // Get data layout
     _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
@@ -61,10 +63,11 @@
     // Check if we have Global Pooling Layer
     const unsigned int idx_width  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
     const unsigned int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-    _is_global_pooling_layer      = (src->dimension(idx_width) == pool_info.pool_size.width) && (src->dimension(idx_height) == pool_info.pool_size.height);
-    _use_kernel_indices           = pool_info.use_kernel_indices;
+    _is_global_pooling_layer      = (src->dimension(idx_width) == pool_info.pool_size.width) &&
+                               (src->dimension(idx_height) == pool_info.pool_size.height);
+    _use_kernel_indices = pool_info.use_kernel_indices;
 
-    if(run_optimised)
+    if (run_optimised)
     {
         const CPUInfo     &ci          = NEScheduler::get().cpu_info();
         const unsigned int num_threads = NEScheduler::get().num_threads();
@@ -76,7 +79,7 @@
         // Get kernel's memory requirements
         constexpr size_t alignment      = 4096;
         const size_t     workspace_size = pooling_wrapper->get_working_size(num_threads);
-        _aux_mem[0]                     = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment);
+        _aux_mem[0] = MemoryInfo(TensorType::ACL_INT_0, MemoryLifetime::Temporary, workspace_size, alignment);
 
         _asm_glue = std::move(pooling_wrapper);
     }
@@ -89,11 +92,15 @@
     }
 }
 
-Status CpuPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CpuPool2d::validate(const ITensorInfo      *src,
+                           const ITensorInfo      *dst,
+                           const PoolingLayerInfo &pool_info,
+                           const ITensorInfo      *indices)
 {
-    const bool run_optimised = bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
+    const bool run_optimised =
+        bool(kernels::CpuPool2dAssemblyWrapperKernel::validate(src, dst, pool_info)) && (indices == nullptr);
 
-    if(run_optimised)
+    if (run_optimised)
     {
         return Status{};
     }
@@ -105,20 +112,24 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No tensors provided");
 
-    if(_asm_glue)
+    if (_asm_glue)
     {
         const auto hints = (_is_global_pooling_layer) ? Window::DimX : Window::DimY;
         NEScheduler::get().schedule_op(_asm_glue.get(), hints, _asm_glue->window(), tensors);
     }
     else
     {
-        switch(_data_layout)
+        switch (_data_layout)
         {
             case DataLayout::NCHW:
-                NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), _is_global_pooling_layer ? Window::DimZ : Window::DimY, _pooling_layer_kernel->window(), tensors);
+                NEScheduler::get().schedule_op(_pooling_layer_kernel.get(),
+                                               _is_global_pooling_layer ? Window::DimZ : Window::DimY,
+                                               _pooling_layer_kernel->window(), tensors);
                 break;
             case DataLayout::NHWC:
-                NEScheduler::get().schedule_op(_pooling_layer_kernel.get(), (_use_kernel_indices ? Window::DimY : Window::DimX), _pooling_layer_kernel->window(), tensors);
+                NEScheduler::get().schedule_op(_pooling_layer_kernel.get(),
+                                               (_use_kernel_indices ? Window::DimY : Window::DimX),
+                                               _pooling_layer_kernel->window(), tensors);
                 break;
             default:
                 ARM_COMPUTE_ERROR("Data layout not supported");
diff --git a/src/cpu/operators/CpuPool2d.h b/src/cpu/operators/CpuPool2d.h
index 5c571db..ea73e3f 100644
--- a/src/cpu/operators/CpuPool2d.h
+++ b/src/cpu/operators/CpuPool2d.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_POOL2D_H
 
 #include "arm_compute/core/experimental/Types.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuOperator.h"
 
@@ -58,17 +59,21 @@
      * @param[in]      pool_info Contains pooling operation information described in @ref PoolingLayerInfo.
      * @param[out]     indices   (optional) The indices of the maximal values. Data type supported: U32.
      */
-    void configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+    void
+    configure(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to CpuPool2d::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+    static Status validate(const ITensorInfo      *src,
+                           const ITensorInfo      *dst,
+                           const PoolingLayerInfo &pool_info,
+                           const ITensorInfo      *indices = nullptr);
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
diff --git a/src/cpu/operators/CpuPool3d.cpp b/src/cpu/operators/CpuPool3d.cpp
index 14e4ac6..7fa78c1 100644
--- a/src/cpu/operators/CpuPool3d.cpp
+++ b/src/cpu/operators/CpuPool3d.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/Scheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/kernels/CpuPool3dKernel.h"
 
@@ -35,8 +36,7 @@
 {
 namespace cpu
 {
-CpuPool3d::CpuPool3d()
-    : _aux_mem(1)
+CpuPool3d::CpuPool3d() : _aux_mem(1)
 {
 }
 
@@ -70,4 +70,4 @@
 }
 
 } // namespace cpu
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/cpu/operators/CpuPool3d.h b/src/cpu/operators/CpuPool3d.h
index 8a73f8a..235d798 100644
--- a/src/cpu/operators/CpuPool3d.h
+++ b/src/cpu/operators/CpuPool3d.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_POOL3D_H
 
 #include "arm_compute/core/experimental/Types.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuOperator.h"
 
@@ -61,7 +62,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
diff --git a/src/cpu/operators/CpuQuantize.cpp b/src/cpu/operators/CpuQuantize.cpp
index f9e14d1..4315499 100644
--- a/src/cpu/operators/CpuQuantize.cpp
+++ b/src/cpu/operators/CpuQuantize.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/kernels/CpuQuantizeKernel.h"
 
diff --git a/src/cpu/operators/CpuReshape.cpp b/src/cpu/operators/CpuReshape.cpp
index e6892a2..a423abb 100644
--- a/src/cpu/operators/CpuReshape.cpp
+++ b/src/cpu/operators/CpuReshape.cpp
@@ -23,11 +23,10 @@
  */
 #include "src/cpu/operators/CpuReshape.h"
 
-#include "src/cpu/kernels/CpuReshapeKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "src/common/utils/Log.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuReshapeKernel.h"
 
 namespace arm_compute
 {
@@ -49,7 +48,7 @@
 void CpuReshape::run(ITensorPack &tensors)
 {
     ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         static_cast<kernels::CpuReshapeKernel *>(_kernel.get())->prepare(tensors);
         _is_prepared = true;
diff --git a/src/cpu/operators/CpuReshape.h b/src/cpu/operators/CpuReshape.h
index 9bc43e7..33da792 100644
--- a/src/cpu/operators/CpuReshape.h
+++ b/src/cpu/operators/CpuReshape.h
@@ -24,9 +24,10 @@
 #ifndef ARM_COMPUTE_CPU_RESHAPE_H
 #define ARM_COMPUTE_CPU_RESHAPE_H
 
-#include "src/cpu/ICpuOperator.h"
 #include "arm_compute/core/Window.h"
 
+#include "src/cpu/ICpuOperator.h"
+
 namespace arm_compute
 {
 namespace cpu
@@ -53,7 +54,7 @@
     void run(ITensorPack &tensors) override;
 
 private:
-    bool    _is_prepared{ false } ;
+    bool _is_prepared{false};
 };
 } // namespace cpu
 } // namespace arm_compute
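The include reshuffling in CpuReshape.{cpp,h}, with "arm_compute/..." headers first, a blank line, then "src/..." headers, each group sorted case-insensitively, is what clang-format's include regrouping produces. A plausible configuration sketch; the regexes and priorities here are assumptions:

    SortIncludes:  CaseInsensitive  # e.g. 'utils/...' sorts before 'Validate.h', as seen later in this diff
    IncludeBlocks: Regroup          # merge all include blocks, then re-split them by category
    IncludeCategories:
      - Regex:    '^"arm_compute/'
        Priority: 1
      - Regex:    '^"src/'
        Priority: 2
      - Regex:    '.*'              # system headers such as <memory> end up last
        Priority: 3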
diff --git a/src/cpu/operators/CpuScale.cpp b/src/cpu/operators/CpuScale.cpp
index 8a712bf..7df9296 100644
--- a/src/cpu/operators/CpuScale.cpp
+++ b/src/cpu/operators/CpuScale.cpp
@@ -24,8 +24,9 @@
 #include "src/cpu/operators/CpuScale.h"
 
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "src/cpu/kernels/CpuScaleKernel.h"
@@ -37,11 +38,12 @@
 {
 namespace
 {
-void precompute_dx_dy_offsets(ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
+void precompute_dx_dy_offsets(
+    ITensor *dx, ITensor *dy, ITensor *offsets, float wr, float hr, SamplingPolicy sampling_policy, bool align_corners)
 {
     ARM_COMPUTE_ERROR_ON(offsets == nullptr);
     float sampling_offset = 0.0f;
-    if(sampling_policy == SamplingPolicy::CENTER)
+    if (sampling_policy == SamplingPolicy::CENTER)
     {
         sampling_offset = 0.5f;
     }
@@ -50,38 +52,44 @@
     win.set(Window::DimX, Window::Dimension(0, offsets->info()->dimension(0), 1));
     win.set(Window::DimY, Window::Dimension(0, offsets->info()->dimension(1), 1));
 
-    if(dx != nullptr && dy != nullptr)
+    if (dx != nullptr && dy != nullptr)
     {
         // Pre-compute the offset and pixel's distance for BILINEAR interpolation
         Iterator offsets_it(offsets, win);
         Iterator dx_it(dx, win);
         Iterator dy_it(dy, win);
 
-        execute_window_loop(win, [&](const Coordinates & id)
-        {
-            const float in_x  = (id.x() + sampling_offset) * wr - sampling_offset;
-            const float in_y  = (id.y() + sampling_offset) * hr - sampling_offset;
-            const int   in_xi = std::floor(in_x);
-            const int   in_yi = std::floor(in_y);
+        execute_window_loop(
+            win,
+            [&](const Coordinates &id)
+            {
+                const float in_x  = (id.x() + sampling_offset) * wr - sampling_offset;
+                const float in_y  = (id.y() + sampling_offset) * hr - sampling_offset;
+                const int   in_xi = std::floor(in_x);
+                const int   in_yi = std::floor(in_y);
 
-            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
-            *reinterpret_cast<float *>(dx_it.ptr())        = in_x - in_xi;
-            *reinterpret_cast<float *>(dy_it.ptr())        = in_y - in_yi;
-        },
-        offsets_it, dx_it, dy_it);
+                *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
+                *reinterpret_cast<float *>(dx_it.ptr())        = in_x - in_xi;
+                *reinterpret_cast<float *>(dy_it.ptr())        = in_y - in_yi;
+            },
+            offsets_it, dx_it, dy_it);
     }
     else
     {
         // Pre-compute the offset for NEAREST interpolation
         Iterator offsets_it(offsets, win);
 
-        execute_window_loop(win, [&](const Coordinates & id)
-        {
-            const float float_in_xi                        = (id.x() + sampling_offset) * wr;
-            const auto  in_xi                              = static_cast<size_t>(align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi) : std::floor(float_in_xi));
-            *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
-        },
-        offsets_it);
+        execute_window_loop(
+            win,
+            [&](const Coordinates &id)
+            {
+                const float float_in_xi = (id.x() + sampling_offset) * wr;
+                const auto  in_xi       = static_cast<size_t>(
+                    align_corners ? arm_compute::utils::rounding::round_half_away_from_zero(float_in_xi)
+                                         : std::floor(float_in_xi));
+                *reinterpret_cast<int32_t *>(offsets_it.ptr()) = in_xi;
+            },
+            offsets_it);
     }
 }
 } // namespace
@@ -96,20 +104,24 @@
     _is_prepared = false;
 
     // Get data layout and width/height indices
-    _data_layout         = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout;
-    const int idx_width  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    _data_layout        = _scale_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : _scale_info.data_layout;
+    const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
     const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
 
     // Compute the ratio between source width/height and destination width/height
-    const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
-    const auto wr                    = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
-    const auto hr                    = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
+    const bool is_align_corners_used =
+        _scale_info.align_corners &&
+        arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
+    const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width),
+                                                                     dst->dimension(idx_width), is_align_corners_used);
+    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height),
+                                                                     dst->dimension(idx_height), is_align_corners_used);
 
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
-    InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
-                                         && hr <= 1.f) ?
-                                        InterpolationPolicy::NEAREST_NEIGHBOR :
-                                        _scale_info.interpolation_policy;
+    InterpolationPolicy policy_to_use =
+        (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+            ? InterpolationPolicy::NEAREST_NEIGHBOR
+            : _scale_info.interpolation_policy;
 
     // Get the tensor shape
     TensorShape shape(dst->dimension(idx_width));
@@ -122,7 +134,7 @@
     auto dy           = std::make_unique<TensorInfo>(tensor_info_dxdy);
     auto offsets      = std::make_unique<TensorInfo>(tensor_info_offsets);
     auto scale_kernel = std::make_unique<kernels::CpuScaleKernel>();
-    switch(policy_to_use)
+    switch (policy_to_use)
     {
         case InterpolationPolicy::NEAREST_NEIGHBOR:
         {
@@ -148,7 +160,8 @@
 Status CpuScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER && info.sampling_policy != SamplingPolicy::TOP_LEFT);
+    ARM_COMPUTE_RETURN_ERROR_ON(info.sampling_policy != SamplingPolicy::CENTER &&
+                                info.sampling_policy != SamplingPolicy::TOP_LEFT);
 
     ITensorInfo *offsets = nullptr;
     ITensorInfo *dx      = nullptr;
@@ -160,19 +173,25 @@
     const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     // Compute the ratio between source width/height and destination width/height
-    const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
-    const auto wr                    = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width), dst->dimension(idx_width), is_align_corners_used);
-    const auto hr                    = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height), dst->dimension(idx_height), is_align_corners_used);
+    const bool is_align_corners_used =
+        info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
+    const auto wr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_width),
+                                                                     dst->dimension(idx_width), is_align_corners_used);
+    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(src->dimension(idx_height),
+                                                                     dst->dimension(idx_height), is_align_corners_used);
 
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
-    InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
+    InterpolationPolicy policy_to_use =
+        (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+            ? InterpolationPolicy::NEAREST_NEIGHBOR
+            : info.interpolation_policy;
 
     // Get the tensor shape of auxiliary buffers
     const TensorShape shape(dst->dimension(idx_width), dst->dimension(idx_height));
     TensorInfo        tensor_info_offsets(shape, Format::S32);
     TensorInfo        tensor_info_dx(shape, Format::F32);
     TensorInfo        tensor_info_dy(shape, Format::F32);
-    switch(policy_to_use)
+    switch (policy_to_use)
     {
         case InterpolationPolicy::NEAREST_NEIGHBOR:
             offsets = &tensor_info_offsets;
@@ -186,13 +205,14 @@
             break;
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        kernels::CpuScaleKernel::validate(src->clone().get(), dx, dy, offsets, dst->clone().get(), info));
     return Status{};
 }
 
 void CpuScale::prepare(ITensorPack &tensors)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         _is_prepared       = true;
         const auto src     = tensors.get_const_tensor(TensorType::ACL_SRC);
@@ -206,22 +226,27 @@
         const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
 
         // Compute the ratio between source width/height and destination width/height
-        const bool is_align_corners_used = _scale_info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
-        const auto wr                    = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used);
-        const auto hr                    = arm_compute::scale_utils::calculate_resize_ratio(src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used);
+        const bool is_align_corners_used =
+            _scale_info.align_corners &&
+            arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(_scale_info.sampling_policy);
+        const auto wr = arm_compute::scale_utils::calculate_resize_ratio(
+            src->info()->dimension(idx_width), dst->info()->dimension(idx_width), is_align_corners_used);
+        const auto hr = arm_compute::scale_utils::calculate_resize_ratio(
+            src->info()->dimension(idx_height), dst->info()->dimension(idx_height), is_align_corners_used);
 
         // Area interpolation behaves as Nearest Neighbour in case of up-sampling
-        InterpolationPolicy policy_to_use = (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f
-                                             && hr <= 1.f) ?
-                                            InterpolationPolicy::NEAREST_NEIGHBOR :
-                                            _scale_info.interpolation_policy;
+        InterpolationPolicy policy_to_use =
+            (_scale_info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+                ? InterpolationPolicy::NEAREST_NEIGHBOR
+                : _scale_info.interpolation_policy;
         const SamplingPolicy sampling_policy = _scale_info.sampling_policy;
 
-        bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(_data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode);
+        bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(
+            _data_layout, src->info()->data_type(), policy_to_use, _scale_info.border_mode);
 
-        if(precompute_indices_weights)
+        if (precompute_indices_weights)
         {
-            switch(policy_to_use)
+            switch (policy_to_use)
             {
                 case InterpolationPolicy::NEAREST_NEIGHBOR:
                 {
@@ -245,7 +270,8 @@
         }
         else
         {
-            if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA)
+            if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR &&
+                policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA)
             {
                 ARM_COMPUTE_ERROR("Unsupported interpolation mode");
             }
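CpuScale.cpp above shows the line-breaking conventions for long expressions: '&&' stays at the end of the broken line, '?' and ':' open the continuation lines of a wrapped ternary, and multi-line lambdas passed to execute_window_loop move onto their own indented block. Options that plausibly drive this (assumed values):

    BreakBeforeBinaryOperators:  None # 'cond &&' ends the line; the next operand continues below
    BreakBeforeTernaryOperators: true # '? value_if_true' and ': value_if_false' start their lines
    ContinuationIndentWidth:     4    # indent applied to the wrapped operands and lambda arguments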
diff --git a/src/cpu/operators/CpuScale.h b/src/cpu/operators/CpuScale.h
index ee7c523..c12a8e7 100644
--- a/src/cpu/operators/CpuScale.h
+++ b/src/cpu/operators/CpuScale.h
@@ -24,9 +24,10 @@
 #ifndef ARM_COMPUTE_CPU_SCALE_H
 #define ARM_COMPUTE_CPU_SCALE_H
 
+#include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/experimental/Types.h"
+
 #include "src/cpu/ICpuKernel.h"
 #include "src/cpu/ICpuOperator.h"
 
@@ -62,9 +63,9 @@
     void run(ITensorPack &tensors) override;
 
 private:
-    ScaleKernelInfo _scale_info{ InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED };
-    DataLayout      _data_layout{ DataLayout::UNKNOWN };
-    bool            _is_prepared{ false };
+    ScaleKernelInfo _scale_info{InterpolationPolicy::NEAREST_NEIGHBOR, BorderMode::UNDEFINED};
+    DataLayout      _data_layout{DataLayout::UNKNOWN};
+    bool            _is_prepared{false};
 };
 } // namespace cpu
 } // namespace arm_compute
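The member-initializer changes in CpuScale.h, '{ false }' becoming '{false}', and the rewritten ITensorPack initializers later in the diff both follow from the C++11 braced-list style, under which braced lists are formatted like function-call parentheses with no padding spaces inside. Presumably:

    Cpp11BracedListStyle:       true  # '{DataLayout::UNKNOWN}', '{false}': no inner spaces
    SpaceBeforeCpp11BracedList: false # no space between a name and its '{...}' initializer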
diff --git a/src/cpu/operators/CpuSoftmax.cpp b/src/cpu/operators/CpuSoftmax.cpp
index bf4c2fa..e55d7f9 100644
--- a/src/cpu/operators/CpuSoftmax.cpp
+++ b/src/cpu/operators/CpuSoftmax.cpp
@@ -25,9 +25,10 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/core/helpers/SoftmaxHelpers.h"
@@ -63,13 +64,15 @@
     ARM_COMPUTE_ERROR_THROW_ON(CpuSoftmaxGeneric::validate(src, dst, beta, axis));
     ARM_COMPUTE_LOG_PARAMS(src, dst, beta, axis);
 
-    const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+    const unsigned int actual_axis =
+        static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
 
     _needs_permute = actual_axis > 0;
 
-    if(_needs_permute)
+    if (_needs_permute)
     {
-        _permute_input.configure(src, &_input_permuted, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+        _permute_input.configure(src, &_input_permuted,
+                                 softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
     }
 
     // We want to deal with a 2D input. Either it is the permuted version of the original input (4D case)
@@ -79,10 +82,11 @@
     // Create intermediate tensors shapes
     TensorShape max_sum_shape = tmp_input->tensor_shape();
     max_sum_shape.set(0, 1);
-    const TensorInfo input_info    = tmp_input->clone()->reset_padding().set_is_resizable(true);
-    DataType         tmp_data_type = is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
-    TensorInfo       tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
-    TensorInfo       max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
+    const TensorInfo input_info = tmp_input->clone()->reset_padding().set_is_resizable(true);
+    DataType         tmp_data_type =
+        is_data_type_quantized_asymmetric(tmp_input->data_type()) ? DataType::F32 : tmp_input->data_type();
+    TensorInfo tensor_info_tmp(input_info.clone()->set_data_type(tmp_data_type));
+    TensorInfo max_info(tmp_input->clone()->set_tensor_shape(max_sum_shape));
 
     // Init intermediate tensors
     _max = TensorInfo(max_info);
@@ -94,13 +98,14 @@
     _max_kernel = std::move(mk);
 
     auto sm = std::make_unique<kernels::CpuLogits1DSoftmaxKernel<IS_LOG>>();
-    if(_needs_permute)
+    if (_needs_permute)
     {
         // The normalization kernel stores the result in a permuted output tensor
         sm->configure(tmp_input, &_max, &_output_permuted, beta, &_tmp);
 
         // Re-permute the permuted output into the requested (4D) output
-        _permute_output.configure(&_output_permuted, dst, softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
+        _permute_output.configure(&_output_permuted, dst,
+                                  softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis));
     }
     else
     {
@@ -109,11 +114,15 @@
     }
     _softmax_kernel = std::move(sm);
 
-    _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size());
-    _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
+    _aux_mem[InternalTensorIdx::MAX] =
+        MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max.total_size());
+    _aux_mem[InternalTensorIdx::TMP] =
+        MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp.total_size());
 
-    _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _input_permuted.total_size());
-    _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _output_permuted.total_size());
+    _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC),
+                                                           MemoryLifetime::Temporary, _input_permuted.total_size());
+    _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST),
+                                                           MemoryLifetime::Temporary, _output_permuted.total_size());
 }
 
 template <bool IS_LOG>
@@ -123,7 +132,8 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported");
     ARM_COMPUTE_UNUSED(beta);
-    ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) || static_cast<int32_t>(src->num_dimensions()) <= axis);
+    ARM_COMPUTE_RETURN_ERROR_ON(axis < static_cast<int32_t>(-src->num_dimensions()) ||
+                                static_cast<int32_t>(src->num_dimensions()) <= axis);
 
     // Create intermediate tensor info
     DataType         tmp_data_type = src->data_type();
@@ -131,25 +141,33 @@
 
     TensorShape max_sum_shape = src->tensor_shape();
     max_sum_shape.set(0, 1);
-    const TensorInfo tensor_info_max_sum(src->clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(src->quantization_info()).set_is_resizable(true));
+    const TensorInfo tensor_info_max_sum(src->clone()
+                                             ->set_tensor_shape(max_sum_shape)
+                                             .set_data_type(tmp_data_type)
+                                             .set_quantization_info(src->quantization_info())
+                                             .set_is_resizable(true));
     const TensorInfo dont_care;
 
-    const unsigned int actual_axis = static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
+    const unsigned int actual_axis =
+        static_cast<unsigned int>(wrap_around(axis, static_cast<int32_t>(src->num_dimensions())));
 
     const bool needs_permute = actual_axis > 0;
 
-    if(needs_permute)
+    if (needs_permute)
     {
-        const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
-        const TensorShape       permuted_shape     = misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector);
-        TensorInfo              input_permuted(src->clone()->set_tensor_shape(permuted_shape));
+        const PermutationVector permutation_vector =
+            softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+        const TensorShape permuted_shape =
+            misc::shape_calculator::compute_permutation_output_shape(*src, permutation_vector);
+        TensorInfo input_permuted(src->clone()->set_tensor_shape(permuted_shape));
         ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(src, &input_permuted, permutation_vector));
         TensorInfo output_permuted(dst->clone()->set_tensor_shape(permuted_shape));
         ARM_COMPUTE_RETURN_ON_ERROR(CpuPermute::validate(&output_permuted, dst, permutation_vector));
     }
 
     ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DMaxKernel::validate(src, &tensor_info_max_sum));
-    ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(&tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuLogits1DSoftmaxKernel<IS_LOG>::validate(
+        &tensor_info_tmp, &tensor_info_max_sum, dst, beta, &dont_care));
 
     return Status{};
 }
@@ -166,43 +184,38 @@
     CpuAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max, tensors, true);
 
     CpuAuxTensorHandler input_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _input_permuted, tensors, true);
-    CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors, true);
+    CpuAuxTensorHandler output_permuted(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _output_permuted, tensors,
+                                        true);
 
     ITensorPack max_pack;
     ITensorPack softmax_pack;
 
-    if(_needs_permute)
+    if (_needs_permute)
     {
-        ITensorPack permute_in_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, input_permuted.get() } };
+        ITensorPack permute_in_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, input_permuted.get()}};
         _permute_input.run(permute_in_pack);
 
-        max_pack = { { TensorType::ACL_SRC, input_permuted.get() }, { TensorType::ACL_DST, max.get() } };
+        max_pack = {{TensorType::ACL_SRC, input_permuted.get()}, {TensorType::ACL_DST, max.get()}};
 
-        softmax_pack =
-        {
-            { TensorType::ACL_SRC_0, input_permuted.get() },
-            { TensorType::ACL_SRC_1, max.get() },
-            { TensorType::ACL_DST_0, output_permuted.get() },
-            { TensorType::ACL_DST_1, tmp.get() }
-        };
+        softmax_pack = {{TensorType::ACL_SRC_0, input_permuted.get()},
+                        {TensorType::ACL_SRC_1, max.get()},
+                        {TensorType::ACL_DST_0, output_permuted.get()},
+                        {TensorType::ACL_DST_1, tmp.get()}};
     }
     else
     {
-        max_pack = { { TensorType::ACL_SRC, src }, { TensorType::ACL_DST, max.get() } };
+        max_pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, max.get()}};
 
-        softmax_pack =
-        {
-            { TensorType::ACL_SRC_0, src },
-            { TensorType::ACL_SRC_1, max.get() },
-            { TensorType::ACL_DST_0, dst },
-            { TensorType::ACL_DST_1, tmp.get() }
-        };
+        softmax_pack = {{TensorType::ACL_SRC_0, src},
+                        {TensorType::ACL_SRC_1, max.get()},
+                        {TensorType::ACL_DST_0, dst},
+                        {TensorType::ACL_DST_1, tmp.get()}};
     }
 
     NEScheduler::get().schedule_op(_max_kernel.get(), Window::DimY, _max_kernel->window(), max_pack);
     NEScheduler::get().schedule_op(_softmax_kernel.get(), Window::DimY, _softmax_kernel->window(), softmax_pack);
 
-    if(_needs_permute)
+    if (_needs_permute)
     {
         ITensorPack permute_out_pack;
         permute_out_pack.add_tensor(TensorType::ACL_SRC, output_permuted.get());
@@ -211,7 +224,7 @@
     }
 }
 
-template <bool                   IS_LOG>
+template <bool IS_LOG>
 experimental::MemoryRequirements CpuSoftmaxGeneric<IS_LOG>::workspace() const
 {
     return _aux_mem;
diff --git a/src/cpu/operators/CpuSoftmax.h b/src/cpu/operators/CpuSoftmax.h
index 64df870..8cab70e 100644
--- a/src/cpu/operators/CpuSoftmax.h
+++ b/src/cpu/operators/CpuSoftmax.h
@@ -24,11 +24,13 @@
 #ifndef ARM_COMPUTE_CPU_SOFTMAX_H
 #define ARM_COMPUTE_CPU_SOFTMAX_H
 
-#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/TensorInfo.h"
+
 #include "src/cpu/ICpuKernel.h"
 #include "src/cpu/ICpuOperator.h"
 #include "src/cpu/operators/CpuPermute.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -77,7 +79,7 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, float beta = 1.0f, int32_t axis = 0);
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
diff --git a/src/cpu/operators/CpuSub.cpp b/src/cpu/operators/CpuSub.cpp
index 91a5b6e..7d27efb 100644
--- a/src/cpu/operators/CpuSub.cpp
+++ b/src/cpu/operators/CpuSub.cpp
@@ -23,17 +23,20 @@
  */
 #include "src/cpu/operators/CpuSub.h"
 
-#include "src/cpu/kernels/CpuSubKernel.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "src/common/utils/Log.h"
-
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/cpu/kernels/CpuSubKernel.h"
 
 namespace arm_compute
 {
 namespace cpu
 {
-void CpuSub::configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CpuSub::configure(const ITensorInfo         *src0,
+                       const ITensorInfo         *src1,
+                       ITensorInfo               *dst,
+                       ConvertPolicy              policy,
+                       const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     ARM_COMPUTE_LOG_PARAMS(src0, src1, dst, policy);
@@ -42,7 +45,11 @@
     _kernel = std::move(k);
 }
 
-Status CpuSub::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CpuSub::validate(const ITensorInfo         *src0,
+                        const ITensorInfo         *src1,
+                        const ITensorInfo         *dst,
+                        ConvertPolicy              policy,
+                        const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return kernels::CpuSubKernel::validate(src0, src1, dst, policy);
diff --git a/src/cpu/operators/CpuSub.h b/src/cpu/operators/CpuSub.h
index 8890863..d1782a1 100644
--- a/src/cpu/operators/CpuSub.h
+++ b/src/cpu/operators/CpuSub.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_SUB_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/cpu/ICpuOperator.h"
 
 namespace arm_compute
@@ -53,14 +54,22 @@
      * @param[in]  policy   Policy to use to handle overflow. Convert policy cannot be WRAP if datatype is quantized.
      * @param[in]  act_info (Optional) Activation layer information in case of a fused activation. Currently not supported.
      */
-    void configure(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const ITensorInfo         *src0,
+                   const ITensorInfo         *src1,
+                   ITensorInfo               *dst,
+                   ConvertPolicy              policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref CpuSub::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src0,
+                           const ITensorInfo         *src1,
+                           const ITensorInfo         *dst,
+                           ConvertPolicy              policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
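CpuSub's configure()/validate() signatures show the treatment of long parameter lists: once a declaration exceeds the column limit, every parameter moves to its own line, and together with the declaration alignment noted earlier this yields the padded 'const ITensorInfo         *src0' column. A sketch of the assumed settings:

    BinPackParameters:                         false # each parameter on its own line once the declaration breaks
    AllowAllParametersOfDeclarationOnNextLine: false # never collapse them onto a single continuation line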
diff --git a/src/cpu/operators/CpuTranspose.cpp b/src/cpu/operators/CpuTranspose.cpp
index 4e7854f..ea548e0 100644
--- a/src/cpu/operators/CpuTranspose.cpp
+++ b/src/cpu/operators/CpuTranspose.cpp
@@ -23,9 +23,8 @@
  */
 #include "src/cpu/operators/CpuTranspose.h"
 
-#include "src/cpu/kernels/CpuTransposeKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/cpu/kernels/CpuTransposeKernel.h"
 
 namespace arm_compute
 {
@@ -43,5 +42,5 @@
 {
     return kernels::CpuTransposeKernel::validate(src, dst);
 }
-} // namesapce cpu
+} // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp
index c4edd89..9d07736 100644
--- a/src/cpu/operators/CpuWinogradConv2d.cpp
+++ b/src/cpu/operators/CpuWinogradConv2d.cpp
@@ -22,23 +22,25 @@
  * SOFTWARE.
  */
 #include "src/cpu/operators/CpuWinogradConv2d.h"
+
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/CPP/Validate.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/helpers/WindowHelpers.h"
 #include "src/core/NEON/kernels/assembly/winograd.hpp"
 #include "src/core/NEON/kernels/convolution/common/tensor.hpp"
 #include "src/core/NEON/kernels/convolution/common/utils.hpp"
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/AssemblyUtils.h"
-#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
 #include "src/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
 #include "src/cpu/operators/CpuActivation.h"
 #include "src/cpu/operators/CpuPermute.h"
 #include "src/cpu/utils/CpuAuxTensorHandler.h"
@@ -56,21 +58,26 @@
 inline Tensor4DShape internal_get_shape(const ITensorInfo *in)
 {
     const DataLayout data_layout = in->data_layout();
-    const int        in_width    = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
-    const int        in_height   = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
-    const int        in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
-    const int        in_batches  = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES));
+    const int        in_width = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
+    const int in_height   = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
+    const int in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
+    const int in_batches  = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES));
 
-    return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
+    return Tensor4DShape{in_batches, in_height, in_width, in_channels};
 }
 
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info)
+Status validate_arguments(const ITensorInfo   *src,
+                          const ITensorInfo   *weights,
+                          const ITensorInfo   *biases,
+                          const ITensorInfo   *dst,
+                          const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_UNUSED(dst, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
-    if(biases != nullptr)
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1,
+                                    "Winograd layer only supports unit strides.");
+    if (biases != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
@@ -80,43 +87,46 @@
     return Status{};
 }
 
-bool get_winograd_kernel_implementation(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst,
-                                        const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math,
-                                        arm_conv::winograd::WinogradImpl *winograd_impl, std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args)
+bool get_winograd_kernel_implementation(const ITensorInfo                          *src,
+                                        const ITensorInfo                          *weights,
+                                        const ITensorInfo                          *dst,
+                                        const PadStrideInfo                        &conv_info,
+                                        const ActivationLayerInfo                  &act_info,
+                                        bool                                        enable_fast_math,
+                                        arm_conv::winograd::WinogradImpl           *winograd_impl,
+                                        std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args)
 {
     arm_conv::winograd::WinogradConfig winograd_cfg;
     arm_gemm::GemmConfig               cfg;
 
     const DataType data_type = src->data_type();
-    Tensor4DShape  in_shape{ internal_get_shape(src) };
-    Tensor4DShape  out_shape{ internal_get_shape(dst) };
-    Tensor4DShape  kernel_shape{ internal_get_shape(weights) };
+    Tensor4DShape  in_shape{internal_get_shape(src)};
+    Tensor4DShape  out_shape{internal_get_shape(dst)};
+    Tensor4DShape  kernel_shape{internal_get_shape(weights)};
     uint32_t       nthreads = NEScheduler::get().num_threads();
     // Get configuration arguments for Winograd
     winograd_cfg.output_rows = 0;
     winograd_cfg.output_cols = 0;
     conv_args                = std::make_unique<arm_conv::ConvolutionArgs>(
-                                   in_shape.n_batches,
-                                   arm_conv::Shape2D{ static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols) },
-                                   in_shape.n_channels,
-                                   conv_info.pad_top(),
-                                   conv_info.pad_left(),
-                                   arm_conv::Shape2D{ static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols) },
-                                   out_shape.n_channels,
-                                   arm_conv::Shape2D{ static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols) },
-                                   assembly_utils::map_to_arm_gemm_activation(act_info));
+        in_shape.n_batches,
+        arm_conv::Shape2D{static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols)},
+        in_shape.n_channels, conv_info.pad_top(), conv_info.pad_left(),
+        arm_conv::Shape2D{static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols)},
+        out_shape.n_channels,
+        arm_conv::Shape2D{static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols)},
+        assembly_utils::map_to_arm_gemm_activation(act_info));
 
     bool success = false;
-    if(data_type == DataType::F32)
+    if (data_type == DataType::F32)
     {
-        success = arm_conv::winograd::get_implementation<float>(
-                      *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr);
+        success = arm_conv::winograd::get_implementation<float>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
+                                                                enable_fast_math, &winograd_cfg, nullptr);
     }
 #if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
-    else if(data_type == DataType::F16)
+    else if (data_type == DataType::F16)
     {
-        success = arm_conv::winograd::get_implementation<__fp16>(
-                      *winograd_impl, &CPUInfo::get(), *conv_args, nthreads, enable_fast_math, &winograd_cfg, nullptr);
+        success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
+                                                                 enable_fast_math, &winograd_cfg, nullptr);
     }
 #endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
     else
@@ -127,7 +137,8 @@
 }
 inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
 {
-    return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
+    return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ||
+           act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
 }
 } // namespace
 
@@ -141,7 +152,7 @@
       _permute_output(std::make_unique<CpuPermute>()),
       _permute_weights(std::make_unique<CpuPermute>()),
       _aux_mem(AuxTensorIdx::Count),
-      _conv_args{ nullptr },
+      _conv_args{nullptr},
       _winograd_impl{},
       _data_layout(),
       _winograd_transformed_input{},
@@ -152,15 +163,20 @@
       _weights_hwio(),
       _input_nhwc(),
       _output_nhwc(),
-      _is_prepared{ false },
-      _run_activation{ false }
+      _is_prepared{false},
+      _run_activation{false}
 {
 }
 
 CpuWinogradConv2d::~CpuWinogradConv2d() = default;
 
-void CpuWinogradConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst,
-                                  const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CpuWinogradConv2d::configure(const ITensorInfo         *src,
+                                  const ITensorInfo         *weights,
+                                  const ITensorInfo         *biases,
+                                  ITensorInfo               *dst,
+                                  const PadStrideInfo       &conv_info,
+                                  const ActivationLayerInfo &act_info,
+                                  bool                       enable_fast_math)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
@@ -169,21 +185,29 @@
     const DataType data_type = src->data_type();
     uint32_t       nthreads  = NEScheduler::get().num_threads();
     _data_layout             = src->data_layout();
-    const Tensor4DShape kernel_shape{ internal_get_shape(weights) };
+    const Tensor4DShape kernel_shape{internal_get_shape(weights)};
 
-    bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &_winograd_impl, _conv_args);
+    bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
+                                                      &_winograd_impl, _conv_args);
 
-    ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols);
-    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", _winograd_impl.input_transform->get_name().c_str());
-    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", _winograd_impl.input_transform->get_name().c_str());
-    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", _winograd_impl.input_transform->get_name().c_str());
+    ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
+                                kernel_shape.n_cols);
+    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n",
+                                        _winograd_impl.input_transform->get_name().c_str());
+    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n",
+                                        _winograd_impl.input_transform->get_name().c_str());
+    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n",
+                                        _winograd_impl.input_transform->get_name().c_str());
 
-    const bool has_impl = ((_winograd_impl.input_transform != nullptr) && (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr));
-    if(has_impl)
+    const bool has_impl = ((_winograd_impl.input_transform != nullptr) &&
+                           (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr));
+    if (has_impl)
     {
         // Determine how much working space is required, allocate it.
-        const size_t input_workspace_size  = _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads);
-        const size_t output_workspace_size = _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads);
+        const size_t input_workspace_size =
+            _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads);
+        const size_t output_workspace_size =
+            _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads);
 
         TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8);
         TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8);
@@ -232,7 +256,7 @@
         PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
 
         // Configure the kernel to transform the input tensor from NCHW -> NHWC
-        if(_data_layout == DataLayout::NCHW)
+        if (_data_layout == DataLayout::NCHW)
         {
             _permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U));
             weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
@@ -242,28 +266,30 @@
         _permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector);
 
         // Reorder the convoluted output to ACL's ordering NCHW
-        if(_data_layout == DataLayout::NCHW)
+        if (_data_layout == DataLayout::NCHW)
         {
             // configure and allocate dst tensor to be used to convert from winograd domain to spatial domain when calling to reshape_output()
-            TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0),
-                                        dst->dimension(1), dst->dimension(3)),
-                            1, dst->data_type());
+            TensorInfo info(TensorShape(dst->dimension(2), dst->dimension(0), dst->dimension(1), dst->dimension(3)), 1,
+                            dst->data_type());
             _output_nhwc = info;
             _permute_output->configure(&_output_nhwc, dst, PermutationVector(1U, 2U, 0U));
         }
 
         // Configure input transform kernel
-        _transform_input_kernel = std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads);
+        _transform_input_kernel =
+            std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads);
 
         // Configure GEMM function
-        _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr, &_winograd_transformed_output, 1.0f, 0.f);
+        _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr,
+                                  &_winograd_transformed_output, 1.0f, 0.f);
 
         // Configure output transform kernel
-        _transform_output_kernel = std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads);
+        _transform_output_kernel =
+            std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads);
 
         //Configure Activation Layer
         _run_activation = act_info.enabled() && !fuse_function_supported(act_info);
-        if(_run_activation)
+        if (_run_activation)
         {
             _activation_func->configure(dst, nullptr, act_info);
         }
@@ -276,40 +302,55 @@
         _aux_mem[TempResult]     = asm_mem_req[TempResult];
 
         // Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps.
-        _aux_mem[TransformedInput]   = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary, wds.input_matrix_size_bytes, storage_alignment);
-        _aux_mem[TransformedOutput]  = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary, wds.output_matrix_size_bytes, storage_alignment);
-        _aux_mem[WorkspaceIO]        = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary, std::max(input_workspace_size, output_workspace_size));
-        _aux_mem[PermutedWeights]    = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
-        _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent, wds.weight_matrix_size_bytes, storage_alignment);
-        if(_data_layout == DataLayout::NCHW)
+        _aux_mem[TransformedInput]  = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary,
+                                                 wds.input_matrix_size_bytes, storage_alignment);
+        _aux_mem[TransformedOutput] = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary,
+                                                 wds.output_matrix_size_bytes, storage_alignment);
+        _aux_mem[WorkspaceIO]       = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary,
+                                                 std::max(input_workspace_size, output_workspace_size));
+        _aux_mem[PermutedWeights] =
+            MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare, _weights_hwio.total_size());
+        _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent,
+                                                  wds.weight_matrix_size_bytes, storage_alignment);
+        if (_data_layout == DataLayout::NCHW)
         {
             _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size());
             _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size());
         }
     }
 }
-Status CpuWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                   const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CpuWinogradConv2d::validate(const ITensorInfo         *src,
+                                   const ITensorInfo         *weights,
+                                   const ITensorInfo         *biases,
+                                   const ITensorInfo         *dst,
+                                   const PadStrideInfo       &conv_info,
+                                   const ActivationLayerInfo &act_info,
+                                   bool                       enable_fast_math)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info));
 
     // Disable winograd for fp16 if fast math is false.
-    if(!enable_fast_math)
+    if (!enable_fast_math)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32);
     }
 
-    const Tensor4DShape              kernel_shape{ internal_get_shape(weights) };
+    const Tensor4DShape              kernel_shape{internal_get_shape(weights)};
     arm_conv::winograd::WinogradImpl winograd_impl{};
 
     std::unique_ptr<arm_conv::ConvolutionArgs> conv_args;
-    const bool                                 success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math, &winograd_impl, conv_args);
+    const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
+                                                            &winograd_impl, conv_args);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows, kernel_shape.n_cols);
-    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n", winograd_impl.input_transform->get_name().c_str());
-    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n", winograd_impl.input_transform->get_name().c_str());
-    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n", winograd_impl.input_transform->get_name().c_str());
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(success == false, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
+                                        kernel_shape.n_cols);
+    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using input transform: %s\n",
+                                        winograd_impl.input_transform->get_name().c_str());
+    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using weight transform: %s\n",
+                                        winograd_impl.input_transform->get_name().c_str());
+    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "Using output transform: %s\n",
+                                        winograd_impl.input_transform->get_name().c_str());
     return Status{};
 }
 
@@ -328,24 +369,29 @@
 
     // Wrap the winograd-domain tensorInfos created in configuration in tensors and allocate the required memory.
     CpuAuxTensorHandler input_nhwc(offset_int_vec(PermutedInput), _input_nhwc, tensors, true);
-    CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input, tensors, true);
+    CpuAuxTensorHandler winograd_input_transformed(offset_int_vec(TransformedInput), _winograd_transformed_input,
+                                                   tensors, true);
     CpuAuxTensorHandler input_workspace(offset_int_vec(WorkspaceIO), _input_workspace, tensors, true);
     const bool          is_nchw = _data_layout == DataLayout::NCHW;
-    if(is_nchw)
+    if (is_nchw)
     {
         //Bring channels to the front as Winograd code expects the tensor to be in the format NHWC
-        ITensorPack pack{ { ACL_SRC, src }, { ACL_DST, input_nhwc.get() } };
+        ITensorPack pack{{ACL_SRC, src}, {ACL_DST, input_nhwc.get()}};
         _permute_input->run(pack);
     }
 
-    CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output, tensors, true);
+    CpuAuxTensorHandler winograd_output_transformed(offset_int_vec(TransformedOutput), _winograd_transformed_output,
+                                                    tensors, true);
     CpuAuxTensorHandler output_workspace(offset_int_vec(WorkspaceIO), _output_workspace, tensors, true);
     CpuAuxTensorHandler output_nhwc(offset_int_vec(PermutedOutput), _output_nhwc, tensors, true);
 
-    ITensorPack transform_input_pack{ { ACL_SRC, is_nchw ? input_nhwc.get() : src }, { ACL_DST, winograd_input_transformed.get() }, { ACL_INT, input_workspace.get() } };
+    ITensorPack transform_input_pack{{ACL_SRC, is_nchw ? input_nhwc.get() : src},
+                                     {ACL_DST, winograd_input_transformed.get()},
+                                     {ACL_INT, input_workspace.get()}};
     NEScheduler::get().schedule_op(_transform_input_kernel.get(), Window::DimX, win, transform_input_pack);
 
-    CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights, tensors, true);
+    CpuAuxTensorHandler winograd_weights_transformed(offset_int_vec(TransformedWeights), _winograd_transformed_weights,
+                                                     tensors, true);
 
     // Run 16 GEMMs in multiple threads; each kernel runs one or more GEMMs
     ITensorPack gemm_pack = tensors;
@@ -356,30 +402,34 @@
     _gemm_function->run(gemm_pack);
 
     // Output transform
-    ITensorPack transform_output_pack{ { ACL_SRC_0, winograd_output_transformed.get() }, { ACL_DST, is_nchw ? output_nhwc.get() : output }, { ACL_SRC_1, biases }, { ACL_INT, output_workspace.get() } };
+    ITensorPack transform_output_pack{{ACL_SRC_0, winograd_output_transformed.get()},
+                                      {ACL_DST, is_nchw ? output_nhwc.get() : output},
+                                      {ACL_SRC_1, biases},
+                                      {ACL_INT, output_workspace.get()}};
     NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, win, transform_output_pack);
-    if(is_nchw)
+    if (is_nchw)
     {
         // Reorder the convolved output to ACL's NCHW ordering
-        ITensorPack pack{ { ACL_SRC, output_nhwc.get() }, { ACL_DST, output } };
+        ITensorPack pack{{ACL_SRC, output_nhwc.get()}, {ACL_DST, output}};
         _permute_output->run(pack);
     }
-    if(_run_activation)
+    if (_run_activation)
     {
-        ITensorPack pack{ { ACL_SRC, output }, { ACL_DST, output } };
+        ITensorPack pack{{ACL_SRC, output}, {ACL_DST, output}};
         _activation_func->run(pack);
     }
 }
 
 void CpuWinogradConv2d::prepare(ITensorPack &tensors)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
-        const ITensor *weights     = tensors.get_const_tensor(ACL_SRC_1);
-        ITensor       *weights_aux = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
+        const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
+        ITensor       *weights_aux =
+            utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));
 
         CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux);
-        ITensorPack         permute_tensors{ { ACL_SRC, weights }, { ACL_DST, permuted_weights.get() } };
+        ITensorPack         permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}};
         _permute_weights->run(permute_tensors);
         const int element_size_in_bytes = permuted_weights.get()->info()->element_size();
         // Weights were in OHWI format before being permuted into "permuted_weights", which is in HWIO format.
@@ -387,31 +437,32 @@
         const unsigned int width_idx   = 2; // W in HWIO
         const unsigned int channel_idx = 1; // I in HWIO
 
-        const int permuted_weight_row_stride     = permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes;
-        const int permuted_weight_col_stride     = permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes;
-        const int permuted_weight_channel_stride = permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes;
+        const int permuted_weight_row_stride =
+            permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes;
+        const int permuted_weight_col_stride =
+            permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes;
+        const int permuted_weight_channel_stride =
+            permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes;
 
         // Wrap the winograd-domain transformed weight TensorInfo in an auxiliary tensor and allocate the required memory.
-        ITensor *weights_transf = utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
+        ITensor *weights_transf =
+            utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
         ARM_COMPUTE_ERROR_ON_NULLPTR(weights_transf);
         CpuAuxTensorHandler winograd_transformed_weights(_winograd_transformed_weights, *weights_transf);
 
         const void *permuted_weights_ptr;
         void       *win_wght_transf_ptr;
 
-        permuted_weights_ptr = reinterpret_cast<const void *>(permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes());
-        win_wght_transf_ptr  = reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() + winograd_transformed_weights.get()->info()->offset_first_element_in_bytes());
+        permuted_weights_ptr = reinterpret_cast<const void *>(
+            permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes());
+        win_wght_transf_ptr =
+            reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() +
+                                     winograd_transformed_weights.get()->info()->offset_first_element_in_bytes());
 
         // Prepare Weights
         _winograd_impl.weight_transform->execute(
-            *_conv_args,
-            permuted_weights_ptr,
-            permuted_weight_row_stride,
-            permuted_weight_col_stride,
-            permuted_weight_channel_stride,
-            win_wght_transf_ptr,
-            _winograd_impl.winograd_spec,
-            0, 1 // Thread 1 of 1
+            *_conv_args, permuted_weights_ptr, permuted_weight_row_stride, permuted_weight_col_stride,
+            permuted_weight_channel_stride, win_wght_transf_ptr, _winograd_impl.winograd_spec, 0, 1 // Thread 1 of 1
         );
         ITensorPack gemm_pack = tensors;
         gemm_pack.add_const_tensor(ACL_SRC_1, winograd_transformed_weights.get());
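
prepare() above converts the permuted weights' byte strides into element strides before handing pointers to the weight transform. A standalone sketch of that arithmetic, assuming the HWIO index order used above (I at index 1, W at 2, and, by inference from the same pattern, H at 3):

    // Hedged sketch: byte strides -> element strides for the HWIO-permuted weights.
    void element_strides(const int strides_in_bytes[4], int element_size_in_bytes,
                         int &row_stride, int &col_stride, int &channel_stride)
    {
        // Dividing a byte stride by the element size yields the stride in elements.
        row_stride     = strides_in_bytes[3] / element_size_in_bytes; // H in HWIO (assumed index)
        col_stride     = strides_in_bytes[2] / element_size_in_bytes; // W in HWIO
        channel_stride = strides_in_bytes[1] / element_size_in_bytes; // I in HWIO
    }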
diff --git a/src/cpu/operators/CpuWinogradConv2d.h b/src/cpu/operators/CpuWinogradConv2d.h
index e0df34e..7e1d952 100644
--- a/src/cpu/operators/CpuWinogradConv2d.h
+++ b/src/cpu/operators/CpuWinogradConv2d.h
@@ -26,10 +26,11 @@
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuOperator.h"
-#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
 #include "src/cpu/kernels/assembly/gemm_common.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
 #include "src/cpu/operators/CpuActivation.h"
 #include "src/cpu/operators/CpuGemm.h"
 #include "src/cpu/operators/CpuPermute.h"
@@ -73,7 +74,11 @@
      * @param[in]  enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest
      *                              implementation available, which can introduce a loss of accuracy. Default is false
      */
-    void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+    void configure(const ITensorInfo         *src,
+                   const ITensorInfo         *weights,
+                   const ITensorInfo         *biases,
+                   ITensorInfo               *dst,
+                   const PadStrideInfo       &conv_info,
                    const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
                    bool                       enable_fast_math = false);
     /** Static function to check if given info will lead to a valid configuration of @ref CpuWinogradConv2d
@@ -82,13 +87,17 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *dst,
+                           const PadStrideInfo       &conv_info,
                            const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
                            bool                       enable_fast_math = false);
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &constants) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &constants) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
@@ -108,27 +117,28 @@
         PermutedOutput     = TransformedInput,
         Count              = 10
     };
-    std::unique_ptr<CpuGemm>                   _gemm_function;
-    std::unique_ptr<CpuActivation>             _activation_func;
-    std::unique_ptr<ICPPKernel>                _transform_input_kernel;
-    std::unique_ptr<ICPPKernel>                _transform_output_kernel;
-    std::unique_ptr<CpuPermute>                _permute_input;
-    std::unique_ptr<CpuPermute>                _permute_output;
-    std::unique_ptr<CpuPermute>                _permute_weights;
-    experimental::MemoryRequirements           _aux_mem{ Count };
-    std::unique_ptr<arm_conv::ConvolutionArgs> _conv_args; // Make it unique ptr because this type does not have a default constructor
-    arm_conv::winograd::WinogradImpl           _winograd_impl;
-    DataLayout                                 _data_layout;
-    TensorInfo                                 _winograd_transformed_input;
-    TensorInfo                                 _winograd_transformed_output;
-    TensorInfo                                 _winograd_transformed_weights;
-    TensorInfo                                 _input_workspace;
-    TensorInfo                                 _output_workspace;
-    TensorInfo                                 _weights_hwio;
-    TensorInfo                                 _input_nhwc;
-    TensorInfo                                 _output_nhwc;
-    bool                                       _is_prepared;
-    bool                                       _run_activation;
+    std::unique_ptr<CpuGemm>         _gemm_function;
+    std::unique_ptr<CpuActivation>   _activation_func;
+    std::unique_ptr<ICPPKernel>      _transform_input_kernel;
+    std::unique_ptr<ICPPKernel>      _transform_output_kernel;
+    std::unique_ptr<CpuPermute>      _permute_input;
+    std::unique_ptr<CpuPermute>      _permute_output;
+    std::unique_ptr<CpuPermute>      _permute_weights;
+    experimental::MemoryRequirements _aux_mem{Count};
+    std::unique_ptr<arm_conv::ConvolutionArgs>
+        _conv_args; // Held in a unique_ptr because this type does not have a default constructor
+    arm_conv::winograd::WinogradImpl _winograd_impl;
+    DataLayout                       _data_layout;
+    TensorInfo                       _winograd_transformed_input;
+    TensorInfo                       _winograd_transformed_output;
+    TensorInfo                       _winograd_transformed_weights;
+    TensorInfo                       _input_workspace;
+    TensorInfo                       _output_workspace;
+    TensorInfo                       _weights_hwio;
+    TensorInfo                       _input_nhwc;
+    TensorInfo                       _output_nhwc;
+    bool                             _is_prepared;
+    bool                             _run_activation;
 };
 } // namespace cpu
 } // namespace arm_compute
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 3069d6b..343ef21 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -24,12 +24,13 @@
 #include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
 #include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/NEON/kernels/arm_gemm/utils.hpp"
 #include "src/core/utils/AssemblyUtils.h"
-#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
 #include "src/cpu/kernels/assembly/arm_gemm.hpp"
+#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
 #include "src/cpu/utils/CpuAuxTensorHandler.h"
 
 #include <arm_neon.h>
@@ -53,7 +54,12 @@
  * @param[in] num_threads      Number of threads to run this method. Must be >= 1
  */
 template <typename TypeInput, typename TypeOutput>
-void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm, ITensor *dst, const TypeInput *src, int src_ld, int src_multi_stride, unsigned int num_threads)
+void run_parallel_pretranspose_B_array(arm_gemm::GemmCommon<TypeInput, TypeOutput> *gemm_asm,
+                                       ITensor                                     *dst,
+                                       const TypeInput                             *src,
+                                       int                                          src_ld,
+                                       int                                          src_multi_stride,
+                                       unsigned int                                 num_threads)
 {
     ARM_COMPUTE_ERROR_ON(gemm_asm == nullptr);
     ARM_COMPUTE_ERROR_ON(num_threads == 0);
@@ -61,14 +67,14 @@
     const unsigned int wsize = gemm_asm->get_B_pretranspose_window_size();
 
     std::vector<IScheduler::Workload> workloads(num_threads);
-    for(unsigned int t = 0; t < num_threads; ++t)
+    for (unsigned int t = 0; t < num_threads; ++t)
     {
-        workloads[t] = [ = ](const ThreadInfo & info)
+        workloads[t] = [=](const ThreadInfo &info)
         {
             const unsigned int start = (info.thread_id * wsize) / num_threads;
             const unsigned int end   = ((info.thread_id + 1) * wsize) / num_threads;
 
-            if(start < end)
+            if (start < end)
             {
                 gemm_asm->pretranspose_B_array_part(dst->buffer(), src, src_ld, src_multi_stride, start, end);
             }
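
The start/end arithmetic in the lambda above partitions the pretranspose window across threads with no gaps and no overlap, since consecutive thread ranges share their boundaries. A worked sketch with made-up sizes:

    // Hedged sketch: even window split; wsize = 10 and num_threads = 4 are example values
    // and yield the ranges [0,2) [2,5) [5,7) [7,10).
    #include <cstdio>

    int main()
    {
        const unsigned int wsize = 10, num_threads = 4;
        for (unsigned int t = 0; t < num_threads; ++t)
        {
            const unsigned int start = (t * wsize) / num_threads;
            const unsigned int end   = ((t + 1) * wsize) / num_threads;
            std::printf("thread %u: [%u, %u)\n", t, start, end);
        }
        return 0;
    }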
@@ -113,7 +119,7 @@
     p.sections = 1;
     p.indirect = false;
 
-    if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
+    if (info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
     {
         p.indirect = true;
         p.sections = b->tensor_shape()[2] * b->tensor_shape()[3];
@@ -125,7 +131,7 @@
     }
 
     // Update M in case of GEMM3D for output
-    if(info.depth_output_gemm3d != 0)
+    if (info.depth_output_gemm3d != 0)
     {
         p.M       = d->tensor_shape().y() * d->tensor_shape().z();
         p.batches = d->tensor_shape().total_size_upper(3) / p.multis;
@@ -139,19 +145,24 @@
     // Schedule assembly kernel
     const int         granule_threshold = 200;
     IScheduler::Hints scheduling_hint   = IScheduler::Hints(Window::DimX);
-    if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
+    if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && data_type == DataType::F32)
     {
         scheduling_hint = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
     }
-    else if(method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 || data_type == DataType::S8))
+    else if (method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D &&
+             (data_type == DataType::F32 || data_type == DataType::F16 || data_type == DataType::U8 ||
+              data_type == DataType::S8))
     {
         // GEMM_INTERLEAVED supports 2D parallelism; IScheduler::split_dimensions_all signals parallelisation over all window dimensions
-        scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+        scheduling_hint =
+            IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
     }
-    else if(method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D && (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED))
+    else if (method == arm_gemm::GemmMethod::QUANTIZE_WRAPPER_2D &&
+             (data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED))
     {
         // Special case for QASYMM8 to support 2D parallelism; the scheduler here may be tuned differently from the FP32 case
-        scheduling_hint = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
+        scheduling_hint =
+            IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
     }
 
     return scheduling_hint;
@@ -175,8 +186,12 @@
      * @param[in]  gemm_info GEMM meta-data
      * @param[in]  os        Output stage meta-data.
      */
-    void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
-                   arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
+    void configure(const ITensorInfo *a,
+                   const ITensorInfo *b,
+                   const ITensorInfo *c,
+                   ITensorInfo       *d,
+                   arm_gemm::GemmArgs args,
+                   const AsmGemmInfo &gemm_info,
                    const OutputStage &os = {});
 
     /** Set requantization shifts to be used
@@ -193,19 +208,20 @@
       *
       * @return A tuple with the pointers to the shift and multiplier data respectively
       */
-    std::tuple<bool, const int32_t *, const int32_t *, const int32_t *> set_requantize_data(const std::vector<int32_t> &shifts,
-                                                                                            const std::vector<int32_t> &multipliers);
+    std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
+    set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers);
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
     bool                             is_configured() const override;
     experimental::MemoryRequirements workspace() const override;
     bool                             isVarWeightsKernel() const override
     {
-        if(!_gemm_kernel_asm)
+        if (!_gemm_kernel_asm)
             return false;
-        const arm_compute::WeightFormat wf = assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format);
+        const arm_compute::WeightFormat wf =
+            assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format);
         return wf != arm_compute::WeightFormat::UNSPECIFIED && wf != arm_compute::WeightFormat::ANY;
     }
 
@@ -229,15 +245,15 @@
     void prepare_indirect_buffer(ITensorPack &tensors);
 
     /** Assembly Gemm kernel */
-    std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
+    std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{nullptr};
     /** Optimised Arm® Neon™ kernel */
-    std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
+    std::unique_ptr<INEKernel> _optimised_kernel{nullptr};
     /** Assembly GEMM workspace tensor info */
     TensorInfo _workspace_info{};
     /** Pre-transpose tensor info */
     TensorInfo _pretranspose_info{};
     /** Prepared flag */
-    bool _is_prepared{ false };
+    bool _is_prepared{false};
     /** GEMM meta-data */
     AsmGemmInfo _gemm_info{};
     /** GEMM kernel description */
@@ -251,26 +267,27 @@
     /** Indirect buffer */
     std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
     std::unique_ptr<const TypeInput *, free_delete>        _indirect_buf{};
-    std::vector<TypeInput>           _indirect_pad{};
-    arm_gemm::ConvolutionParameters  _cp{};
-    experimental::MemoryRequirements _aux_mem{ Count };
-    bool                             _B_pretranspose_required{ false };
-    bool                             _is_b_constant{ true };
-    bool                             _is_c_constant{ true };
+    std::vector<TypeInput>                                 _indirect_pad{};
+    arm_gemm::ConvolutionParameters                        _cp{};
+    experimental::MemoryRequirements                       _aux_mem{Count};
+    bool                                                   _B_pretranspose_required{false};
+    bool                                                   _is_b_constant{true};
+    bool                                                   _is_c_constant{true};
 };
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
 std::tuple<bool, const int32_t *, const int32_t *, const int32_t *>
-Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts, const std::vector<int32_t> &multipliers)
+Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vector<int32_t> &shifts,
+                                                                  const std::vector<int32_t> &multipliers)
 {
     _multipliers   = multipliers;
     _shifts        = shifts;
     bool need_left = false;
-    for(const auto s : _shifts)
+    for (const auto s : _shifts)
     {
         left_shifts.push_back(std::max(-s, int32_t(0)));
         right_shifts.push_back(std::min(-s, int32_t(0)));
-        if(s < 0 && !need_left)
+        if (s < 0 && !need_left)
         {
             need_left = true;
         }
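
set_requantize_data() above splits each signed shift into a non-negative left shift and a non-positive right shift, flagging need_left when any source shift is negative. A standalone sketch of the same decomposition; the shift values in the comments are examples:

    // Hedged sketch: decomposing signed requantization shifts, mirroring the loop above.
    #include <algorithm>
    #include <cstdint>
    #include <utility>
    #include <vector>

    std::pair<std::vector<int32_t>, std::vector<int32_t>> split_shifts(const std::vector<int32_t> &shifts)
    {
        std::vector<int32_t> left, right;
        for (const auto s : shifts)
        {
            left.push_back(std::max(-s, int32_t(0)));  // s = -2 -> 2, s = 3 -> 0
            right.push_back(std::min(-s, int32_t(0))); // s = -2 -> 0, s = 3 -> -3
        }
        return {left, right};
    }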
@@ -295,32 +312,35 @@
     const int    multi_size   = batch_size * batches;
     const size_t multi_stride = multi_size / sizeof(TypeInput);
 
-    for(int64_t m = 0; m < multis; m++)
+    for (int64_t m = 0; m < multis; m++)
     {
-        for(int64_t b = 0; b < batches; b++)
+        for (int64_t b = 0; b < batches; b++)
         {
-            for(int64_t output_y = 0; output_y < _cp.output_height; output_y++)
+            for (int64_t output_y = 0; output_y < _cp.output_height; output_y++)
             {
-                for(int64_t output_x = 0; output_x < _cp.output_width; output_x++)
+                for (int64_t output_x = 0; output_x < _cp.output_width; output_x++)
                 {
                     int64_t output_xy = (output_y * _cp.output_width) + output_x;
 
-                    for(int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
+                    for (int64_t kernel_y = 0; kernel_y < _cp.kernel_height; kernel_y++)
                     {
-                        for(int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
+                        for (int64_t kernel_x = 0; kernel_x < _cp.kernel_width; kernel_x++)
                         {
                             int64_t input_x   = (output_x * _cp.output_stride_w) + kernel_x - _cp.padding_left;
                             int64_t input_y   = (output_y * _cp.output_stride_h) + kernel_y - _cp.padding_top;
                             int64_t kernel_xy = (kernel_y * _cp.kernel_width) + kernel_x;
                             int64_t input_xy  = (input_y * _cp.input_width) + input_x;
 
-                            if(input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
+                            if (input_x < 0 || input_x >= _cp.input_width || input_y < 0 || input_y >= _cp.input_height)
                             {
-                                _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] = _indirect_pad.data();
+                                _indirect_buf
+                                    .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+                                    _indirect_pad.data();
                             }
                             else
                             {
-                                _indirect_buf.get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
+                                _indirect_buf
+                                    .get()[m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy] =
                                     A_ptr + (m * multi_stride_A + b * batch_stride_A + input_xy * stride_A);
                             }
                         }
@@ -332,12 +352,15 @@
 }
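
The subscript reflowed above flattens (multi, batch, kernel tap, output pixel) into a single slot index in the indirect buffer; taps that fall into padding receive the shared _indirect_pad pointer instead of an input address. The offset computation in isolation:

    // Hedged sketch: slot index into the indirect buffer, mirroring the expression above.
    #include <cstdint>

    int64_t indirect_slot(int64_t m, int64_t b, int64_t kernel_xy, int64_t output_xy,
                          int64_t multi_stride, int64_t batch_stride, int64_t output_hw)
    {
        // Each (kernel tap, output pixel) pair owns exactly one input-pointer slot.
        return m * multi_stride + b * batch_stride + kernel_xy * output_hw + output_xy;
    }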
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITensorInfo *a,
+                                                                      const ITensorInfo *b,
+                                                                      const ITensorInfo *d,
+                                                                      const AsmGemmInfo &info)
 {
     ARM_COMPUTE_ERROR_ON(!(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect));
 
     float zeropad = 0.f;
-    if(is_data_type_quantized(a->data_type()))
+    if (is_data_type_quantized(a->data_type()))
     {
         zeropad = a->quantization_info().uniform().offset;
     }
@@ -350,16 +373,25 @@
     const int64_t output_width   = static_cast<int64_t>(d->tensor_shape()[1]);
     const int64_t output_height  = static_cast<int64_t>(d->tensor_shape()[2]);
 
-    _cp = { input_width, input_height, input_channels, kernel_width, kernel_height, output_width, output_height,
-            info.ps_info.stride().first, info.ps_info.stride().second, info.padding_top, info.padding_left, zeropad
-          };
+    _cp = {input_width,
+           input_height,
+           input_channels,
+           kernel_width,
+           kernel_height,
+           output_width,
+           output_height,
+           info.ps_info.stride().first,
+           info.ps_info.stride().second,
+           info.padding_top,
+           info.padding_left,
+           zeropad};
 
-    if(info.method == AsmConvMethod::Conv)
+    if (info.method == AsmConvMethod::Conv)
     {
         _gemm_kernel_asm->set_convolution_parameters(_cp);
     }
 
-    if(info.method == AsmConvMethod::Indirect)
+    if (info.method == AsmConvMethod::Indirect)
     {
         const unsigned int multis    = 1;
         const unsigned int batches   = a->tensor_shape().total_size_upper(3);
@@ -372,19 +404,22 @@
         const int    multi_size   = batch_size * batches;
         const size_t multi_stride = multi_size / sizeof(TypeInputPtr);
 
-        _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
-        _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
+        _indirect_buf = std::unique_ptr<const TypeInput *, free_delete>(
+            reinterpret_cast<const TypeInput **>(malloc(multi_size * multis)));
+        _indirect_arg = std::unique_ptr<const TypeInput *const *, free_delete>(
+            reinterpret_cast<const TypeInput *const **>(malloc(sizeof(TypeInput **) * kernel_hw * multis * batches)));
         _indirect_pad = std::vector<TypeInput>(_cp.input_channels, TypeInput(zeropad));
 
         // Set indirect argument
         int64_t pos = 0;
-        for(int64_t m = 0; m < multis; m++)
+        for (int64_t m = 0; m < multis; m++)
         {
-            for(int64_t b = 0; b < batches; b++)
+            for (int64_t b = 0; b < batches; b++)
             {
-                for(int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
+                for (int64_t kernel_xy = 0; kernel_xy < kernel_hw; kernel_xy++)
                 {
-                    (_indirect_arg.get())[pos++] = _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
+                    (_indirect_arg.get())[pos++] =
+                        _indirect_buf.get() + m * multi_stride + b * batch_stride + kernel_xy * output_hw;
                 }
             }
         }
@@ -394,8 +429,12 @@
 }
 
 template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
-                                                             arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a,
+                                                             const ITensorInfo *b,
+                                                             const ITensorInfo *c,
+                                                             ITensorInfo       *d,
+                                                             arm_gemm::GemmArgs args,
+                                                             const AsmGemmInfo &gemm_info,
                                                              const OutputStage &os)
 {
     ARM_COMPUTE_UNUSED(c);
@@ -404,7 +443,7 @@
     _is_c_constant = c ? c->are_values_constant() : true;
 
     _gemm_kernel_asm = arm_gemm::gemm<TypeInput, TypeOutput, OutputStage>(args, os);
-    if(_gemm_kernel_asm == nullptr)
+    if (_gemm_kernel_asm == nullptr)
     {
         // Configuration not supported: leave the function unconfigured.
         return;
@@ -419,13 +458,14 @@
     const size_t       workspace_size = _gemm_kernel_asm->get_working_size();
     const unsigned int alignment      = 4096;
     _workspace_info                   = TensorInfo(TensorShape(workspace_size), 1, DataType::U8);
-    _aux_mem[AsmGemmWorkspace]        = MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
+    _aux_mem[AsmGemmWorkspace] =
+        MemoryInfo(offset_int_vec(AsmGemmWorkspace), MemoryLifetime::Temporary, workspace_size, alignment);
 
     // If the braced code below is disabled, ConvLayer deadlocks when threads > 1 and
     // the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
     {
         const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
-        if(window_size < static_cast<unsigned int>(args._maxthreads))
+        if (window_size < static_cast<unsigned int>(args._maxthreads))
         {
             _gemm_kernel_asm->set_nthreads(window_size);
         }
@@ -434,18 +474,19 @@
     _optimised_kernel = std::move(acl_gemm_wrapper);
     _gemm_info        = gemm_info;
     // Check for pre-transposed support
-    if(_gemm_kernel_asm->B_pretranspose_required())
+    if (_gemm_kernel_asm->B_pretranspose_required())
     {
         // Forcing 128-byte alignment (required by 32-bit kernels)
         const unsigned int alignment           = 128;
         const size_t       B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
         _pretranspose_info                     = TensorInfo(TensorShape(B_pretranspose_size), 1, DataType::U8);
-        _aux_mem[Pretranspose]                 = MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
-        _B_pretranspose_required               = true;
+        _aux_mem[Pretranspose] =
+            MemoryInfo(offset_int_vec(Pretranspose), MemoryLifetime::Persistent, B_pretranspose_size, alignment);
+        _B_pretranspose_required = true;
     }
 
     // Handle indirect GEMM convolution
-    if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
+    if (gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
     {
         configure_indirect(a, b, d, gemm_info);
     }
@@ -454,34 +495,39 @@
 template <typename TypeInput, typename TypeOutput, class OutputStage>
 void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
         auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
 
         // Set up the matrix bias in the assembly kernel; it is just a pointer to matrix C.
-        if(c && c->info()->data_type() == DataType::S32)
+        if (c && c->info()->data_type() == DataType::S32)
         {
-            _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
+            _gemm_kernel_asm->set_quantized_bias(
+                reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
         }
 
         // Pretranspose B if required
-        if(_gemm_kernel_asm->B_pretranspose_required())
+        if (_gemm_kernel_asm->B_pretranspose_required())
         {
             // Fixed format kernels need no pretranspose.
-            ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
-            const int  ldb            = b->info()->strides_in_bytes().y() / b->info()->element_size();
-            const auto in1_ptr        = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
-            const int  multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
+            ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
+                assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
+            const int  ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
+            const auto in1_ptr =
+                reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+            const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
 
             CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
             ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
-            run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), in1_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
+            run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(),
+                                                                     in1_ptr, ldb, multi_stride_b,
+                                                                     NEScheduler::get().num_threads());
 
             b->mark_as_unused();
         }
 
-        if(_gemm_info.method == AsmConvMethod::Indirect)
+        if (_gemm_info.method == AsmConvMethod::Indirect)
         {
             prepare_indirect_buffer(tensors);
         }
@@ -526,12 +572,12 @@
     int       multi_stride_b = 0;
     const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / d->info()->element_size();
 
-    auto             in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
+    auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
     const TypeInput *in1_ptr = nullptr;
     auto             out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
 
     // Check if B is pre-transposed and de-reference it if not
-    if(!_gemm_kernel_asm->B_is_pretransposed())
+    if (!_gemm_kernel_asm->B_is_pretransposed())
     {
         ldb            = b->info()->strides_in_bytes().y() / b->info()->element_size();
         multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
@@ -539,30 +585,34 @@
     }
 
     // Run the pretranspose every time if either the weights or the biases are non-constant
-    if((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
+    if ((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
     {
-        if(c && c->info()->data_type() == DataType::S32)
+        if (c && c->info()->data_type() == DataType::S32)
         {
-            _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
+            _gemm_kernel_asm->set_quantized_bias(
+                reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
         }
 
         // Pretranspose B if required
-        if(_B_pretranspose_required)
+        if (_B_pretranspose_required)
         {
-            const int  ldb            = b->info()->strides_in_bytes().y() / b->info()->element_size();
-            const auto b_ptr          = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
-            const int  multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
+            const int  ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
+            const auto b_ptr =
+                reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+            const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
 
             CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true);
             ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
 
-            if(_is_b_constant)
+            if (_is_b_constant)
             {
                 _gemm_kernel_asm->requantize_bias(pretranspose.get()->buffer(), b_ptr, ldb, multi_stride_b);
             }
             else
             {
-                run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(), b_ptr, ldb, multi_stride_b, NEScheduler::get().num_threads());
+                run_parallel_pretranspose_B_array<TypeInput, TypeOutput>(_gemm_kernel_asm.get(), pretranspose.get(),
+                                                                         b_ptr, ldb, multi_stride_b,
+                                                                         NEScheduler::get().num_threads());
             }
         }
     }
@@ -571,17 +621,17 @@
 
     // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
     CpuAuxTensorHandler workspace(offset_int_vec(AsmGemmWorkspace), _workspace_info, tensors, false);
-    if(workspace.get()->buffer() != nullptr)
+    if (workspace.get()->buffer() != nullptr)
     {
         _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(workspace.get()->buffer()));
         const unsigned int split_dim   = scheduling_hint.split_dimension();
         const unsigned int window_size = _gemm_kernel_asm->get_window_size().total_size();
         unsigned int       num_threads = NEScheduler::get().num_threads();
-        if(window_size < num_threads)
+        if (window_size < num_threads)
         {
             num_threads = window_size;
         }
-        if(split_dim != IScheduler::split_dimensions_all)
+        if (split_dim != IScheduler::split_dimensions_all)
         {
             // Make sure the kernel does not expect more threads than we can actually spawn
             const unsigned int num_iterations = _optimised_kernel.get()->window().num_iterations(split_dim);
@@ -595,12 +645,12 @@
 
     // Set up the matrix bias in the assembly kernel; it is just a pointer to matrix C.
     TypeOutput *bias = nullptr;
-    if(c && c->info()->data_type() != DataType::S32)
+    if (c && c->info()->data_type() != DataType::S32)
     {
         bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes());
     }
 
-    if(_gemm_info.method == AsmConvMethod::Indirect)
+    if (_gemm_info.method == AsmConvMethod::Indirect)
     {
         in0_ptr        = nullptr;
         lda            = 0;
@@ -609,18 +659,20 @@
     }
 
     // Set gemm parameters
-    _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a,
-                                 in1_ptr, ldb, multi_stride_b,
-                                 out_ptr, ldd, batch_stride_d, multi_stride_d,
-                                 bias, 0);
+    _gemm_kernel_asm->set_arrays(in0_ptr, lda, batch_stride_a, multi_stride_a, in1_ptr, ldb, multi_stride_b, out_ptr,
+                                 ldd, batch_stride_d, multi_stride_d, bias, 0);
     // Schedule
     NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
 }
 
 template <typename TypeInput, typename TypeOutput>
 void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
-                     const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
-                     arm_gemm::Activation activation, const AsmGemmInfo &info)
+                     const ITensorInfo                                   *a,
+                     const ITensorInfo                                   *b,
+                     const ITensorInfo                                   *c,
+                     ITensorInfo                                         *d,
+                     arm_gemm::Activation                                 activation,
+                     const AsmGemmInfo                                   &info)
 {
     Params         p           = extract_parameters(a, b, d, info);
     const CPUInfo &ci          = NEScheduler::get().cpu_info();
@@ -628,7 +680,8 @@
 
     arm_gemm::GemmConfig cfg;
     cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
-    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg);
+    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
+                            info.fixed_format, info.fast_mode, &cfg);
 
     // Create arm_gemm fallback
     auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput>>();
@@ -638,8 +691,12 @@
 
 template <typename TypeInput, typename TypeOutput>
 void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm,
-                           const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
-                           arm_gemm::Activation activation, const AsmGemmInfo &info)
+                           const ITensorInfo                                   *a,
+                           const ITensorInfo                                   *b,
+                           const ITensorInfo                                   *c,
+                           ITensorInfo                                         *d,
+                           arm_gemm::Activation                                 activation,
+                           const AsmGemmInfo                                   &info)
 {
     ARM_COMPUTE_UNUSED(activation);
     Params             p           = extract_parameters(a, b, d, info);
@@ -648,7 +705,8 @@
 
     arm_gemm::GemmConfig cfg;
     cfg.weight_format = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
-    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads, info.fixed_format, info.fast_mode, &cfg);
+    arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, activation, num_threads,
+                            info.fixed_format, info.fast_mode, &cfg);
 
     // Create arm_gemm fallback
     auto fallback = std::make_unique<Fallback<TypeInput, TypeOutput, arm_gemm::Requantize32>>();
@@ -660,22 +718,20 @@
     const GEMMLowpOutputStageInfo os_info  = info.output_stage;
 
     arm_gemm::Requantize32 gemm_requant_info{};
-    if(os_info.gemmlowp_shifts.size() > 1)
+    if (os_info.gemmlowp_shifts.size() > 1)
     {
-        const auto requantize_data = fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
-        gemm_requant_info          = arm_gemm::Requantize32(nullptr, 0,
-                                                            a_offset, b_offset, os_info.gemmlowp_offset,
-                                                            (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr,
-                                                            std::get<2>(requantize_data),
-                                                            std::get<3>(requantize_data),
-                                                            os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+        const auto requantize_data =
+            fallback->set_requantize_data(os_info.gemmlowp_shifts, os_info.gemmlowp_multipliers);
+        gemm_requant_info = arm_gemm::Requantize32(
+            nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset,
+            (std::get<0>(requantize_data)) ? std::get<1>(requantize_data) : nullptr, std::get<2>(requantize_data),
+            std::get<3>(requantize_data), os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
     }
     else
     {
-        gemm_requant_info = arm_gemm::Requantize32(nullptr, 0,
-                                                   a_offset, b_offset, os_info.gemmlowp_offset,
-                                                   -os_info.gemmlowp_shift, os_info.gemmlowp_multiplier,
-                                                   os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
+        gemm_requant_info =
+            arm_gemm::Requantize32(nullptr, 0, a_offset, b_offset, os_info.gemmlowp_offset, -os_info.gemmlowp_shift,
+                                   os_info.gemmlowp_multiplier, os_info.gemmlowp_min_bound, os_info.gemmlowp_max_bound);
     }
 
     // Configure fallback
@@ -684,13 +740,16 @@
 }
 } // namespace
 
-CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch()
-    : _arm_gemm(nullptr)
+CpuGemmAssemblyDispatch::CpuGemmAssemblyDispatch() : _arm_gemm(nullptr)
 {
 }
 
-Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d,
-                                             const AsmGemmInfo &info)
+Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+                                             const ITensorInfo         *a,
+                                             const ITensorInfo         *b,
+                                             const ITensorInfo         *c,
+                                             const ITensorInfo         *d,
+                                             const AsmGemmInfo         &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
     ARM_COMPUTE_UNUSED(c);
@@ -701,53 +760,61 @@
     arm_gemm::GemmConfig cfg;
     cfg.weight_format                           = assembly_utils::map_to_arm_gemm_weight_format(info.weight_format);
     arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
-    arm_gemm::GemmArgs     args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads, info.fixed_format, info.fast_mode, &cfg);
-    switch(a->data_type())
+    arm_gemm::GemmArgs     args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads,
+                                info.fixed_format, info.fast_mode, &cfg);
+    switch (a->data_type())
     {
         case DataType::F32:
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
-                                            "We could not find an optimized kernel for F32 input");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                !(arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                "We could not find an optimized kernel for F32 input");
             break;
 #ifdef __aarch64__
         case DataType::U8:
         case DataType::QASYMM8:
-            if(d->data_type() == DataType::S32)
+            if (d->data_type() == DataType::S32)
             {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
-                                                "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    !(arm_gemm::has_opt_gemm<uint8_t, uint32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                    "We could not find an optimized kernel for U8/QASYMM8 input and U32 output");
             }
             else
             {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
-                                                "We could not find an optimized kernel for U8 input and U8 output");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    !(arm_gemm::has_opt_gemm<uint8_t, uint8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+                    "We could not find an optimized kernel for U8 input and U8 output");
             }
             break;
         case DataType::S8:
         case DataType::QASYMM8_SIGNED:
-            if(d->data_type() == DataType::S32)
+            if (d->data_type() == DataType::S32)
             {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
-                                                "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    !(arm_gemm::has_opt_gemm<int8_t, int32_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                    "We could not find an optimized kernel for S8/QASYMM8_SIGNED input and S32 output");
             }
             else
             {
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
-                                                "We could not find an optimized kernel for S8 input and S8 output");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                    !(arm_gemm::has_opt_gemm<int8_t, int8_t, arm_gemm::Requantize32>(arm_gemm_expected_wf, args, {})),
+                    "We could not find an optimized kernel for S8 input and S8 output");
             }
             break;
 #endif /* __aarch64__ */
 #if defined(ARM_COMPUTE_ENABLE_BF16)
         case DataType::BFLOAT16:
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
-                                            "We could not find an optimized kernel for BFLOAT16 input and F32 output");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                !(arm_gemm::has_opt_gemm<bfloat16, float, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                "We could not find an optimized kernel for BFLOAT16 input and F32 output");
             break;
         }
 #endif /* defined(ARM_COMPUTE_ENABLE_BF16) */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
         case DataType::F16:
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
-                                            "We could not find an optimized kernel for F16 input and F16 output");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                !(arm_gemm::has_opt_gemm<float16_t, float16_t, arm_gemm::Nothing>(arm_gemm_expected_wf, args, {})),
+                "We could not find an optimized kernel for F16 input and F16 output");
             break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
         default:
@@ -759,26 +826,30 @@
     return Status{};
 }
 
-Status CpuGemmAssemblyDispatch::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
+Status CpuGemmAssemblyDispatch::validate(
+    const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info)
 {
     ARM_COMPUTE_UNUSED(c, info);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, d);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run), "Assembly kernel will not be executed when reshape_b_only_on_first_run is false");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(info.reshape_b_only_on_first_run),
+                                    "Assembly kernel will not be executed when reshape_b_only_on_first_run is false");
 
 #ifndef __aarch64__
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->element_size() == 1, "8bit integer types only supported for aarch64");
 #endif /* __aarch64__ */
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S8,
-                                                         DataType::BFLOAT16, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
-                                                         DataType::BFLOAT16, DataType::F16, DataType::F32);
-    if(is_data_type_quantized_per_channel(b->data_type()))
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S8, DataType::BFLOAT16,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+        b, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S8,
+        DataType::BFLOAT16, DataType::F16, DataType::F32);
+    if (is_data_type_quantized_per_channel(b->data_type()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8_SIGNED, DataType::S8);
     }
-    else if(is_fixed_format_fast_math(info.weight_format))
+    else if (is_fixed_format_fast_math(info.weight_format))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
@@ -787,22 +858,29 @@
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
     }
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32, "Only F32 output supported for F32 input");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16, "Only F16 output supported for F16 input");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32, "Only F32 output supported for BFLOAT16 input");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32, "Only U32 output supported for U8 input");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32, "Only S32 output supported for S8 input");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 && (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F32 && d->data_type() != DataType::F32,
+                                    "Only F32 output supported for F32 input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::F16 && d->data_type() != DataType::F16,
+                                    "Only F16 output supported for F16 input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::BFLOAT16 && d->data_type() != DataType::F32,
+                                    "Only F32 output supported for BFLOAT16 input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::U8 && d->data_type() != DataType::U32,
+                                    "Only U32 output supported for U8 input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::S8 && d->data_type() != DataType::S32,
+                                    "Only S32 output supported for S8 input");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->data_type() == DataType::QASYMM8 &&
+                                        (d->data_type() != DataType::QASYMM8 && d->data_type() != DataType::S32),
                                     "Only QASYMM8/S32 output supported for QASYMM8 input");
     arm_compute::WeightFormat expected_weight_format = arm_compute::WeightFormat::UNSPECIFIED;
     const Status              ret = CpuGemmAssemblyDispatch::has_opt_impl(expected_weight_format, a, b, c, d, info);
-    if((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY)
+    if ((bool)ret && expected_weight_format != arm_compute::WeightFormat::ANY)
     {
         // Correctness check: if the format expected by the kernel is
         // not "any", make sure that the one found matches the format
         // intended by the caller.
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((expected_weight_format != info.weight_format),
-                                        "The format expected by the kernel does not correspond with the one requested by the user.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            (expected_weight_format != info.weight_format),
+            "The format expected by the kernel does not correspond with the one requested by the user.");
     }
     return ret;
 }
@@ -813,18 +891,19 @@
     return act.type != arm_gemm::Activation::Type::None;
 }
 
-void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
+void CpuGemmAssemblyDispatch::configure(
+    const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
     arm_gemm::Activation act = assembly_utils::map_to_arm_gemm_activation(info.activation_info);
 
     //If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
-    if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
+    if (!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
     {
         return;
     }
 
-    switch(a->data_type())
+    switch (a->data_type())
     {
         case DataType::F32:
             create_arm_gemm<float, float>(_arm_gemm, a, b, c, d, act, info);
@@ -832,7 +911,7 @@
 #ifdef __aarch64__
         case DataType::U8:
         case DataType::QASYMM8:
-            if(d->data_type() == DataType::S32)
+            if (d->data_type() == DataType::S32)
             {
                 create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, a, b, c, d, act, info);
             }
@@ -843,7 +922,7 @@
             break;
         case DataType::S8:
         case DataType::QASYMM8_SIGNED:
-            if(d->data_type() == DataType::S32)
+            if (d->data_type() == DataType::S32)
             {
                 create_arm_gemm<int8_t, int32_t>(_arm_gemm, a, b, c, d, act, info);
             }
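
For reference, a minimal caller-side sketch of the contract enforced above: validate() gates the dispatch, configure() returns silently on unsupported type combinations, and is_configured() is how the caller confirms success. The function name and namespace qualification are illustrative assumptions, not part of this patch.

    #include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"

    using namespace arm_compute;

    bool try_asm_gemm(cpu::CpuGemmAssemblyDispatch &dispatch,
                      const ITensorInfo *a, const ITensorInfo *b, ITensorInfo *d)
    {
        cpu::AsmGemmInfo info{}; // header defaults: Im2Col, reshape_b_only_on_first_run = true

        // validate() returns a failing Status for unsupported combinations,
        // e.g. F32 input with a non-F32 output.
        if (!static_cast<bool>(cpu::CpuGemmAssemblyDispatch::validate(a, b, nullptr, d, info)))
        {
            return false;
        }

        dispatch.configure(a, b, nullptr, d, info); // no-ops if validate() would fail
        return dispatch.is_configured();
    }
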
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index ceb7a3f..5be39a5 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/cpu/ICpuOperator.h"
 
@@ -42,20 +43,20 @@
 
 struct AsmGemmInfo
 {
-    AsmConvMethod             method{ AsmConvMethod::Im2Col };
+    AsmConvMethod             method{AsmConvMethod::Im2Col};
     PadStrideInfo             ps_info{};
     ActivationLayerInfo       activation_info{};
     GEMMLowpOutputStageInfo   output_stage{};
-    bool                      negated_offsets{ true };
-    bool                      reinterpret_input_as_3d{ false };
-    bool                      depth_output_gemm3d{ false };
-    int64_t                   padding_top{ 0 };
-    int64_t                   padding_left{ 0 };
-    float                     padding_value{ 0.f };
-    bool                      fast_mode{ false };
-    bool                      fixed_format{ false };
-    arm_compute::WeightFormat weight_format{ arm_compute::WeightFormat::UNSPECIFIED };
-    bool                      reshape_b_only_on_first_run{ true };
+    bool                      negated_offsets{true};
+    bool                      reinterpret_input_as_3d{false};
+    bool                      depth_output_gemm3d{false};
+    int64_t                   padding_top{0};
+    int64_t                   padding_left{0};
+    float                     padding_value{0.f};
+    bool                      fast_mode{false};
+    bool                      fixed_format{false};
+    arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED};
+    bool                      reshape_b_only_on_first_run{true};
 };
 
 /** Assembly kernel glue */
@@ -72,12 +73,12 @@
     class IFallback
     {
     public:
-        virtual void run(ITensorPack &tensors)                              = 0;
-        virtual void prepare(ITensorPack &tensors)                          = 0;
-        virtual experimental::MemoryRequirements workspace() const          = 0;
-        virtual bool                             is_configured() const      = 0;
-        virtual bool                             isVarWeightsKernel() const = 0;
-        virtual ~IFallback()                                                = default;
+        virtual void                             run(ITensorPack &tensors)     = 0;
+        virtual void                             prepare(ITensorPack &tensors) = 0;
+        virtual experimental::MemoryRequirements workspace() const             = 0;
+        virtual bool                             is_configured() const         = 0;
+        virtual bool                             isVarWeightsKernel() const    = 0;
+        virtual ~IFallback()                                                   = default;
     };
 
 public:
@@ -121,7 +122,8 @@
      * @param[out] d    Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
      * @param[in]  info GEMM meta-data
      */
-    void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
+    void configure(
+        const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
 
     /** Indicates whether or not this function can be used to process the given parameters.
      *
@@ -133,7 +135,11 @@
      *
      * @return a status.
      */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *d,
+                           const AsmGemmInfo &info);
 
     /** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
      *
@@ -144,7 +150,12 @@
      *
      * @return a status.
      */
-    static Status has_opt_impl(arm_compute::WeightFormat &weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *d, const AsmGemmInfo &info);
+    static Status has_opt_impl(arm_compute::WeightFormat &weight_format,
+                               const ITensorInfo         *a,
+                               const ITensorInfo         *b,
+                               const ITensorInfo         *c,
+                               const ITensorInfo         *d,
+                               const AsmGemmInfo         &info);
     /** Checks if activation is supported by the gemm assembly dispatcher
      *
      * @param[in] activation Activation to check
@@ -167,8 +178,8 @@
     }
 
     // Inherited methods overridden:
-    void prepare(ITensorPack &tensors) override;
-    void run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
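
Because every field of AsmGemmInfo is brace-initialised above, a default-constructed instance is immediately usable; a short sketch (function name and namespace are assumptions) of overriding only the fields a caller cares about:

    arm_compute::cpu::AsmGemmInfo make_fast_math_info(const arm_compute::ActivationLayerInfo &act)
    {
        arm_compute::cpu::AsmGemmInfo info{}; // method = Im2Col, fast_mode = false, ...
        info.activation_info = act;
        info.fast_mode       = true; // opt into fast-math (e.g. BF16) kernels where available
        info.weight_format   = arm_compute::WeightFormat::ANY;
        return info;
    }
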
diff --git a/src/cpu/utils/CpuAuxTensorHandler.h b/src/cpu/utils/CpuAuxTensorHandler.h
index ae1cffb..e23b88a 100644
--- a/src/cpu/utils/CpuAuxTensorHandler.h
+++ b/src/cpu/utils/CpuAuxTensorHandler.h
@@ -39,25 +39,26 @@
 class CpuAuxTensorHandler
 {
 public:
-    CpuAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
+    CpuAuxTensorHandler(
+        int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
         : _tensor()
     {
-        if(info.total_size() == 0)
+        if (info.total_size() == 0)
         {
             return;
         }
         _tensor.allocator()->soft_init(info);
 
         ITensor *packed_tensor = utils::cast::polymorphic_downcast<ITensor *>(pack.get_tensor(slot_id));
-        if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
+        if ((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
         {
-            if(!bypass_alloc)
+            if (!bypass_alloc)
             {
                 _tensor.allocator()->allocate();
                 ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
             }
 
-            if(pack_inject)
+            if (pack_inject)
             {
                 pack.add_tensor(slot_id, &_tensor);
                 _injected_tensor_pack = &pack;
@@ -70,22 +71,21 @@
         }
     }
 
-    CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor)
-        : _tensor()
+    CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor) : _tensor()
     {
         _tensor.allocator()->soft_init(info);
-        if(info.total_size() <= tensor.info()->total_size())
+        if (info.total_size() <= tensor.info()->total_size())
         {
             _tensor.allocator()->import_memory(tensor.buffer());
         }
     }
 
-    CpuAuxTensorHandler(const CpuAuxTensorHandler &) = delete;
+    CpuAuxTensorHandler(const CpuAuxTensorHandler &)          = delete;
     CpuAuxTensorHandler &operator=(const CpuAuxTensorHandler) = delete;
 
     ~CpuAuxTensorHandler()
     {
-        if(_injected_tensor_pack)
+        if (_injected_tensor_pack)
         {
             _injected_tensor_pack->remove_tensor(_injected_slot_id);
         }
@@ -103,9 +103,9 @@
 
 private:
     Tensor       _tensor{};
-    ITensorPack *_injected_tensor_pack{ nullptr };
-    int          _injected_slot_id{ TensorType::ACL_UNKNOWN };
+    ITensorPack *_injected_tensor_pack{nullptr};
+    int          _injected_slot_id{TensorType::ACL_UNKNOWN};
 };
 } // namespace cpu
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */
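
A hedged usage sketch of the handler's RAII behaviour (the slot id, function name and workspace info are illustrative): if the pack already carries a large-enough tensor in the slot it is reused, otherwise a scratch tensor is allocated and, with pack_inject = true, injected into the pack for the handler's lifetime.

    void run_with_workspace(arm_compute::ITensorPack &pack, const arm_compute::TensorInfo &workspace_info)
    {
        using namespace arm_compute;

        TensorInfo               info = workspace_info; // the handler soft-inits from a mutable info
        cpu::CpuAuxTensorHandler workspace(TensorType::ACL_INT_0, info, pack, /* pack_inject */ true);

        // ... run kernels that fetch ACL_INT_0 from `pack` ...

    } // ~CpuAuxTensorHandler removes the injected tensor from the pack again
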
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
index 15a5632..9ca20fa 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.cpp
@@ -22,14 +22,15 @@
  * SOFTWARE.
  */
 #include "ClKernelRuntime.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/core/CL/CLUtils.h"
 #ifdef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h"
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
 #include "src/gpu/cl/ClKernelLibrary.h"
-
 #include "support/Cast.h"
 namespace arm_compute
 {
@@ -43,13 +44,12 @@
 {
     // Create kernel from kernel source string
     opencl::ClKernelLibrary &klib = opencl::ClKernelLibrary::get();
-    _kernel                       = static_cast<cl::Kernel>(compile_ctx.create_kernel(code.name(),
-                                                                                      code.name(), // program name has to be provided to differentiate between different unfusable components' kernels.
-                                                                                      // Each program contains exactly one kernel
-                                                                                      code.code(),
-                                                                                      klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */,
-                                                                                      code.build_options().options(),
-                                                                                      false /* Is source binary */));
+    _kernel                       = static_cast<cl::Kernel>(compile_ctx.create_kernel(
+                              code.name(),
+                              code.name(), // program name has to be provided to differentiate between different unfusable components' kernels.
+                              // Each program contains exactly one kernel
+                              code.code(), klib.kernel_path() /* Kernel path: Used in cases of embedded kernels */,
+                              code.build_options().options(), false /* Is source binary */));
 
     // Configure execution window
     IClKernel::configure_internal(code.window());
@@ -63,11 +63,15 @@
 
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 
-inline void ClKernelRuntime::add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images)
+inline void ClKernelRuntime::add_tensor_argument(unsigned int                &idx,
+                                                 const GpuKernelArgumentInfo &arg,
+                                                 const ICLTensor             *tensor,
+                                                 const Window                &arg_slice,
+                                                 std::vector<cl::Image2D>    &cl_images)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
 
-    switch(arg.type)
+    switch (arg.type)
     {
         case GpuKernelArgumentInfo::Type::Scalar:
         {
@@ -95,9 +99,13 @@
         }
         case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
         {
-            const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
+            const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) *
+                                                                            tensor->info()->dimension(2) *
+                                                                            tensor->info()->dimension(3));
             const size_t      image_row_pitch = tensor->info()->strides_in_bytes()[1];
-            cl::Image2D       tensor_image2d  = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+            cl::Image2D       tensor_image2d =
+                create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d,
+                                           tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
             cl_images.push_back(tensor_image2d);
             _kernel.setArg(idx++, tensor_image2d);
             break;
@@ -111,9 +119,13 @@
         }
         case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
         {
-            const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) * tensor->info()->dimension(2) * tensor->info()->dimension(3));
+            const TensorShape shape2d(tensor->info()->dimension(0) / 4, tensor->info()->dimension(1) *
+                                                                            tensor->info()->dimension(2) *
+                                                                            tensor->info()->dimension(3));
             const size_t      image_row_pitch = tensor->info()->strides_in_bytes()[1];
-            cl::Image2D       tensor_image2d  = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d, tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+            cl::Image2D       tensor_image2d =
+                create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d,
+                                           tensor->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
             cl_images.push_back(tensor_image2d);
             _kernel.setArg(idx++, tensor_image2d);
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(tensor->info()->strides_in_bytes()[2]));
@@ -142,8 +154,9 @@
             const size_t image_h        = tensor->info()->tensor_shape().total_size_upper(1);
             const size_t image_stride_y = tensor->info()->strides_in_bytes()[1];
 
-            cl::Image2D tensor_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(),
-                                                                    TensorShape(image_w, image_h), tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly);
+            cl::Image2D tensor_image2d = create_image2d_from_buffer(
+                CLKernelLibrary::get().context(), tensor->cl_buffer(), TensorShape(image_w, image_h),
+                tensor->info()->data_type(), image_stride_y, CLImage2DType::ReadOnly);
             cl_images.push_back(tensor_image2d);
 
             _kernel.setArg(idx++, tensor_image2d);
@@ -170,13 +183,16 @@
 }
 
 #else // ACL_INTERNAL_TEST_CKW_IN_DF
-inline void ClKernelRuntime::add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector<cl::Image2D> &cl_images)
+inline void ClKernelRuntime::add_kernel_argument(unsigned int                   &idx,
+                                                 const GpuKernelArgumentBinding &arg,
+                                                 const ICLTensor                *tensor,
+                                                 std::vector<cl::Image2D>       &cl_images)
 {
-    switch(arg.type())
+    switch (arg.type())
     {
         case GpuKernelArgumentBinding::Type::TensorStorage:
         {
-            switch(arg.tensor_storage_type())
+            switch (arg.tensor_storage_type())
             {
                 case TensorStorageType::ClBufferUint8Ptr:
                 {
@@ -238,7 +254,7 @@
         // CLImages created from tensor arguments. Need to be retained until enqueue
         std::vector<cl::Image2D> cl_images;
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-        for(auto id_arg : _arguments)
+        for (auto id_arg : _arguments)
         {
             const auto arg    = id_arg.second;
             auto       tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(id_arg.first));
@@ -248,7 +264,7 @@
         }
 
 #else  // ACL_INTERNAL_TEST_CKW_IN_DF
-        for(const auto &arg : _arguments)
+        for (const auto &arg : _arguments)
         {
             auto tensor = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(arg.id()));
             ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
@@ -259,8 +275,7 @@
 
         // Dispatch kernel
         enqueue(queue, *this, slice, lws_hint(), use_dummy_work_items);
-    }
-    while(skip_sliding_window && window.slide_window_slice_3D(slice));
+    } while (skip_sliding_window && window.slide_window_slice_3D(slice));
 }
 
 } // namespace dynamic_fusion
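
The two Image_Export_To_ClImage2D cases above repeat one buffer-to-image computation; as a standalone sketch (the helper name is hypothetical, every call is taken from the code above): the image width is dimension 0 divided by four, since one RGBA texel packs four elements, and the height collapses dimensions 1 to 3.

    cl::Image2D export_to_cl_image(const arm_compute::ICLTensor *tensor)
    {
        using namespace arm_compute;

        const ITensorInfo *info = tensor->info();
        const TensorShape  shape2d(info->dimension(0) / 4,
                                   info->dimension(1) * info->dimension(2) * info->dimension(3));
        const size_t       image_row_pitch = info->strides_in_bytes()[1];

        return create_image2d_from_buffer(CLKernelLibrary::get().context(), tensor->cl_buffer(), shape2d,
                                          info->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
    }
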
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
index 92e7350..e78567e 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h
@@ -68,7 +68,11 @@
      * @param[in]     arg_slice Window the kernel will be run on
      * @param[out]    cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
      */
-    inline void add_tensor_argument(unsigned int &idx, const GpuKernelArgumentInfo &arg, const ICLTensor *tensor, const Window &arg_slice, std::vector<cl::Image2D> &cl_images);
+    inline void add_tensor_argument(unsigned int                &idx,
+                                    const GpuKernelArgumentInfo &arg,
+                                    const ICLTensor             *tensor,
+                                    const Window                &arg_slice,
+                                    std::vector<cl::Image2D>    &cl_images);
 #else  // ACL_INTERNAL_TEST_CKW_IN_DF
     /** Set a kernel argument as part of a tensor
      *
@@ -77,7 +81,10 @@
      * @param[in]     tensor    Tensor of which the kernel argument @p arg is a part of
      * @param[out]    cl_images Extra cl images created from the tensor (will need to be retained until the kernel is enqueued)
      */
-    inline void add_kernel_argument(unsigned int &idx, const GpuKernelArgumentBinding &arg, const ICLTensor *tensor, std::vector<cl::Image2D> &cl_images);
+    inline void add_kernel_argument(unsigned int                   &idx,
+                                    const GpuKernelArgumentBinding &arg,
+                                    const ICLTensor                *tensor,
+                                    std::vector<cl::Image2D>       &cl_images);
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
 private:
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
index cd21b10..ba39ff4 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
+++ b/src/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
+
 #include "src/dynamic_fusion/runtime/gpu/cl/ClKernelRuntime.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
@@ -55,14 +56,14 @@
     {
         DataView() = default;
         DataView(CLTensor *tensor, const TensorInfo &tensor_info, const AuxMemoryInfo &memory_info)
-            : tensor{ tensor }, tensor_info{ tensor_info }, memory_info{ memory_info }
+            : tensor{tensor}, tensor_info{tensor_info}, memory_info{memory_info}
         {
         }
-        ~DataView()                     = default;
-        DataView(const DataView &other) = default;
+        ~DataView()                                = default;
+        DataView(const DataView &other)            = default;
         DataView &operator=(const DataView &other) = default;
         DataView(DataView &&other)                 = default;
-        DataView &operator=(DataView &&other) = default;
+        DataView     &operator=(DataView &&other)  = default;
         CLTensor     *tensor{};      /**< Pointer to the auxiliary tensor */
         TensorInfo    tensor_info{}; /**< Associated tensor info */
         AuxMemoryInfo memory_info{}; /**< Memory requirement */
@@ -92,7 +93,7 @@
     {
         const auto t_id             = tensor_info.id();
         auto       find_tensor_pair = _owned_tensors.find(t_id);
-        if(find_tensor_pair != _owned_tensors.end())
+        if (find_tensor_pair != _owned_tensors.end())
         {
             return find_tensor_pair->second.get();
         }
@@ -107,7 +108,7 @@
     }
 
     std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
-    std::vector<DataView> _tensors{};
+    std::vector<DataView>                                _tensors{};
 };
 /** Construct auxiliary tensors required by @ref GpuWorkloadSourceCode
  *
@@ -120,12 +121,12 @@
  */
 Status create_aux_tensors(ClAuxTensors *aux_tensors, const GpuWorkloadSourceCode &code)
 {
-    for(auto t_id : code.tensors())
+    for (auto t_id : code.tensors())
     {
         // Get tensor object
         const auto workload_arg  = code.query_tensor(t_id);
         ICLTensor *tensor_object = nullptr;
-        if(workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
+        if (workload_arg->memory_descriptor()->memory_type == MemoryType::Auxiliary)
         {
             // Create aux tensor CLTensor object
             const TensorInfo tensor_info = *workload_arg->tensor_info();
@@ -133,7 +134,7 @@
             const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
             tensor_object              = aux_tensors->add_aux_tensor(tensor_info, aux_memory_info);
 
-            if(tensor_object == nullptr)
+            if (tensor_object == nullptr)
             {
                 return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Failed to construct an auxiliary tensor");
             }
@@ -156,7 +157,7 @@
     ITensorPack *find_tensor_pack(UnitWorkloadId uwk_id)
     {
         auto tensor_pack = _tensor_packs.find(uwk_id);
-        if(tensor_pack != _tensor_packs.end())
+        if (tensor_pack != _tensor_packs.end())
         {
             return &(tensor_pack->second);
         }
@@ -173,7 +174,10 @@
         return _tensor_packs.at(uwk_id);
     }
 
-    friend Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors);
+    friend Status create_tensor_lut(ClTensorLUT                   *tensor_lut,
+                                    const GpuWorkloadSourceCode   &code,
+                                    const std::vector<CLTensor *> &user_tensors,
+                                    const ClAuxTensors            &aux_tensors);
 
 private:
     /** Add a tensor pack and associate it with @ref UnitWorkloadId @p uwk_id
@@ -197,19 +201,22 @@
  *
  * @return Status
  */
-Status create_tensor_lut(ClTensorLUT *tensor_lut, const GpuWorkloadSourceCode &code, const std::vector<CLTensor *> &user_tensors, const ClAuxTensors &aux_tensors)
+Status create_tensor_lut(ClTensorLUT                   *tensor_lut,
+                         const GpuWorkloadSourceCode   &code,
+                         const std::vector<CLTensor *> &user_tensors,
+                         const ClAuxTensors            &aux_tensors)
 {
     // Combine user tensors and aux tensors
     std::map<ITensorInfo::Id, CLTensor *> tensor_map;
-    for(auto tensor : user_tensors)
+    for (auto tensor : user_tensors)
     {
         const auto t_id = tensor->info()->id();
 
-        if(tensor_map.find(t_id) != tensor_map.end())
+        if (tensor_map.find(t_id) != tensor_map.end())
         {
             // In case of elementwise in-place: give another Id to the In/Out tensor when passed again
             std::vector<ITensorInfo::Id> ids;
-            for(auto &t : tensor_map)
+            for (auto &t : tensor_map)
             {
                 ids.push_back(t.first);
             }
@@ -221,11 +228,11 @@
             tensor_map[t_id] = tensor;
         }
     }
-    for(const auto &data : aux_tensors.get_tensors())
+    for (const auto &data : aux_tensors.get_tensors())
     {
         const auto t_id   = data.tensor_info.id();
         const auto tensor = data.tensor;
-        if(tensor_map.find(t_id) != tensor_map.end())
+        if (tensor_map.find(t_id) != tensor_map.end())
         {
             return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Clashing tensor ids");
         }
@@ -233,25 +240,25 @@
     }
 
     // Add tensor objects into corresponding tensor packs
-    for(auto id_tensor : tensor_map)
+    for (auto id_tensor : tensor_map)
     {
         const auto t_id          = id_tensor.first;
         const auto tensor_object = id_tensor.second;
-        if(tensor_object == nullptr)
+        if (tensor_object == nullptr)
         {
             return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Trying to add a nullptr into the tensor packs");
         }
-        if(tensor_object->allocator()->info().total_size() == 0U)
+        if (tensor_object->allocator()->info().total_size() == 0U)
         {
             return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "No allocated memory found in tensor");
         }
 
-        for(auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
+        for (auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
         {
             ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
-            if(tensor_pack == nullptr)
+            if (tensor_pack == nullptr)
             {
-                tensor_lut->add_tensor_pack(uwk_id, ITensorPack{ { t_id, tensor_object } });
+                tensor_lut->add_tensor_pack(uwk_id, ITensorPack{{t_id, tensor_object}});
             }
             else
             {
@@ -269,15 +276,14 @@
 {
     std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
     std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
-    bool                  _is_configured{ false };
-    bool                  _is_prepared{ false };
-    ClTensorLUT           _tensor_lut{};
-    ClAuxTensors          _aux_tensors{};
-    GpuWorkloadSourceCode _source_code{};
+    bool                                                       _is_configured{false};
+    bool                                                       _is_prepared{false};
+    ClTensorLUT                                                _tensor_lut{};
+    ClAuxTensors                                               _aux_tensors{};
+    GpuWorkloadSourceCode                                      _source_code{};
 };
 
-ClWorkloadRuntime::ClWorkloadRuntime()
-    : _impl{ std::make_unique<Implementation>() }
+ClWorkloadRuntime::ClWorkloadRuntime() : _impl{std::make_unique<Implementation>()}
 {
 }
 
@@ -286,18 +292,19 @@
 Status ClWorkloadRuntime::configure(const GpuWorkloadSketch &sketch)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(_impl->_is_configured, "ClWorkloadRuntime cannot be re-configured");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL, "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(sketch.gpu_context()->gpu_language() != GpuLanguage::OpenCL,
+                                    "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
     // Generate source code
     _impl->_source_code = sketch.implementation().generate_source_code();
     // Configure unit workload from source code
-    for(auto uwk_id : _impl->_source_code.unit_workloads())
+    for (auto uwk_id : _impl->_source_code.unit_workloads())
     {
         const auto work  = _impl->_source_code.query_unit_workload(uwk_id);
         const auto stage = work.stage().stage;
         auto       k     = std::make_unique<ClKernelRuntime>();
         k->configure(*sketch.gpu_context()->cl_compile_context(), work.code());
 
-        switch(stage)
+        switch (stage)
         {
             case UnitWorkloadStage::Stage::Run:
             {
@@ -323,9 +330,9 @@
 
 void ClWorkloadRuntime::prepare()
 {
-    if(!_impl->_is_prepared)
+    if (!_impl->_is_prepared)
     {
-        for(auto &id_kernel_pair : _impl->_kernels_prep)
+        for (auto &id_kernel_pair : _impl->_kernels_prep)
         {
             const bool flush_queue = false;
             const auto uwk_id      = id_kernel_pair.first;
@@ -344,7 +351,7 @@
     const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
     ARM_COMPUTE_RETURN_ON_ERROR(st);
     prepare();
-    for(auto &id_kernel_pair : _impl->_kernels)
+    for (auto &id_kernel_pair : _impl->_kernels)
     {
         // Flush the command queue on the last kernel
         const bool flush_queue = false;
@@ -358,7 +365,7 @@
 std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> ClWorkloadRuntime::get_auxiliary_tensors()
 {
     std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> aux_tensors;
-    for(const auto &data : _impl->_aux_tensors.get_tensors())
+    for (const auto &data : _impl->_aux_tensors.get_tensors())
     {
         aux_tensors.emplace_back(data.tensor, data.tensor_info, data.memory_info);
     }
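
A sketch of the runtime lifecycle implied above, assuming a GpuWorkloadSketch built elsewhere and a run(std::vector<CLTensor *>) entry point matching the body shown in this hunk; the helper name and allocation policy are illustrative.

    arm_compute::Status execute(arm_compute::experimental::dynamic_fusion::ClWorkloadRuntime &runtime,
                                const arm_compute::experimental::dynamic_fusion::GpuWorkloadSketch &sketch,
                                const std::vector<arm_compute::CLTensor *> &user_tensors)
    {
        ARM_COMPUTE_RETURN_ON_ERROR(runtime.configure(sketch));

        // Auxiliary tensors are created during configure() and need backing
        // memory before the first run.
        for (auto &data : runtime.get_auxiliary_tensors())
        {
            arm_compute::CLTensor  *tensor = std::get<0>(data);
            arm_compute::TensorInfo info   = std::get<1>(data);
            tensor->allocator()->init(info);
            tensor->allocator()->allocate();
        }
        return runtime.run(user_tensors);
    }
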
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
index 84fb279..7044b0e 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
+++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.cpp
@@ -30,14 +30,17 @@
 {
 namespace dynamic_fusion
 {
-void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component)
+void cl_add_tensor_component_argument(cl::Kernel         &kernel,
+                                      unsigned int       &idx,
+                                      const ICLTensor    *tensor,
+                                      TensorComponentType component)
 {
     ARM_COMPUTE_ERROR_ON(tensor == nullptr);
 
     const auto *info    = tensor->info();
     const auto &strides = info->strides_in_bytes();
 
-    switch(component)
+    switch (component)
     {
         case TensorComponentType::OffsetFirstElement:
             kernel.setArg<cl_uint>(idx++, info->offset_first_element_in_bytes());
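
A caller-side sketch (hypothetical wrapper; the component enumerator is taken from the switch above) of pushing tensor metadata through this helper, each call advancing idx by one; other TensorComponentType values follow the same pattern.

    void add_offset_argument(cl::Kernel &kernel, unsigned int &idx, const arm_compute::ICLTensor *tensor)
    {
        // Namespace assumed from the file layout of this patch.
        using namespace arm_compute::experimental::dynamic_fusion;

        cl_add_tensor_component_argument(kernel, idx, tensor, TensorComponentType::OffsetFirstElement);
    }
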
diff --git a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
index 4cbb157..306d547 100644
--- a/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
+++ b/src/dynamic_fusion/runtime/gpu/cl/ckw_driver/GpuCkwKernelArgumentsHelpers.h
@@ -42,7 +42,10 @@
  * @param[in]     tensor    Tensor from which to access the tensor component.
  * @param[in]     component Tensor component to select such as tensor dimensions, strides, etc.
  */
-void cl_add_tensor_component_argument(cl::Kernel &kernel, unsigned int &idx, const ICLTensor *tensor, TensorComponentType component);
+void cl_add_tensor_component_argument(cl::Kernel         &kernel,
+                                      unsigned int       &idx,
+                                      const ICLTensor    *tensor,
+                                      TensorComponentType component);
 
 /** Add an OpenCL buffer object to the kernel's arguments at the specified index @p idx.
  *
diff --git a/src/dynamic_fusion/sketch/ArgumentPack.h b/src/dynamic_fusion/sketch/ArgumentPack.h
index f118d7d..3bf380b 100644
--- a/src/dynamic_fusion/sketch/ArgumentPack.h
+++ b/src/dynamic_fusion/sketch/ArgumentPack.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_ARGUMENTPACK
 
 #include "arm_compute/core/experimental/Types.h"
+
 #include <unordered_map>
 #include <vector>
 
@@ -52,26 +53,21 @@
      */
     struct PackElement
     {
-        PackElement()                        = default;
-        PackElement(const PackElement &elem) = default;
+        PackElement()                                   = default;
+        PackElement(const PackElement &elem)            = default;
         PackElement &operator=(const PackElement &elem) = default;
         PackElement(PackElement &&elem)                 = default;
-        PackElement &operator=(PackElement &&elem) = default;
-        PackElement(Id id, T *tensor)
-            : id(id), tensor(tensor), ctensor(nullptr)
+        PackElement &operator=(PackElement &&elem)      = default;
+        PackElement(Id id, T *tensor) : id(id), tensor(tensor), ctensor(nullptr)
         {
         }
-        PackElement(Id id, const T *ctensor)
-            : id(id), tensor(nullptr), ctensor(ctensor)
+        PackElement(Id id, const T *ctensor) : id(id), tensor(nullptr), ctensor(ctensor)
         {
         }
 
-        Id       id{ ACL_UNKNOWN }; /**< Argument id within the pack */
-        T       *tensor{ nullptr }; /**< Non-const pointer to tensor-related object */
-        const T *ctensor
-        {
-            nullptr
-        }; /**< Const pointer to tensor-related object */
+        Id       id{ACL_UNKNOWN};  /**< Argument id within the pack */
+        T       *tensor{nullptr};  /**< Non-const pointer to tensor-related object */
+        const T *ctensor{nullptr}; /**< Const pointer to tensor-related object */
     };
 
 public:
@@ -88,10 +84,9 @@
     /** Allow instances of this class to be moved */
     ArgumentPack<T> &operator=(ArgumentPack<T> &&other) = default;
     /** Initializer list Constructor */
-    ArgumentPack(const std::initializer_list<PackElement> &l)
-        : _pack{}
+    ArgumentPack(const std::initializer_list<PackElement> &l) : _pack{}
     {
-        for(const auto &e : l)
+        for (const auto &e : l)
         {
             _pack[e.id] = e;
         }
@@ -134,7 +129,7 @@
     const T *get_const_tensor(Id id) const
     {
         auto it = _pack.find(id);
-        if(it != _pack.end())
+        if (it != _pack.end())
         {
             return it->second.ctensor != nullptr ? it->second.ctensor : it->second.tensor;
         }
@@ -171,10 +166,10 @@
     std::vector<T *> get_src_tensors()
     {
         std::vector<T *> src_tensors{};
-        for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+        for (int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
         {
             auto tensor = get_tensor(static_cast<TensorType>(id));
-            if(tensor != nullptr)
+            if (tensor != nullptr)
             {
                 src_tensors.push_back(tensor);
             }
@@ -188,10 +183,10 @@
     std::vector<const T *> get_const_src_tensors() const
     {
         std::vector<const T *> src_tensors{};
-        for(int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
+        for (int id = static_cast<int>(TensorType::ACL_SRC); id <= static_cast<int>(TensorType::ACL_SRC_END); ++id)
         {
             auto tensor = get_const_tensor(static_cast<TensorType>(id));
-            if(tensor != nullptr)
+            if (tensor != nullptr)
             {
                 src_tensors.push_back(tensor);
             }
@@ -205,10 +200,10 @@
     std::vector<T *> get_dst_tensors()
     {
         std::vector<T *> dst_tensors{};
-        for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+        for (int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
         {
             auto tensor = get_tensor(static_cast<TensorType>(id));
-            if(tensor != nullptr)
+            if (tensor != nullptr)
             {
                 dst_tensors.push_back(tensor);
             }
@@ -222,10 +217,10 @@
     std::vector<const T *> get_const_dst_tensors() const
     {
         std::vector<const T *> dst_tensors{};
-        for(int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
+        for (int id = static_cast<int>(TensorType::ACL_DST); id <= static_cast<int>(TensorType::ACL_DST_END); ++id)
         {
             auto tensor = get_const_tensor(static_cast<TensorType>(id));
-            if(tensor != nullptr)
+            if (tensor != nullptr)
             {
                 dst_tensors.push_back(tensor);
             }
diff --git a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
index 3a5657e..6f38165 100644
--- a/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
+++ b/src/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.cpp
@@ -69,7 +69,8 @@
     return _depth_multiplier;
 }
 
-DepthwiseConv2dAttributes &DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type)
+DepthwiseConv2dAttributes &
+DepthwiseConv2dAttributes::dimension_rounding_type(const DimensionRoundingType &dimension_rounding_type)
 {
     _dimension_rounding_type = dimension_rounding_type;
     return *this;
diff --git a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp
index c28791f..80f65f9 100644
--- a/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp
+++ b/src/dynamic_fusion/sketch/attributes/Pool2dAttributes.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
 #include "arm_compute/core/Size2D.h"
 
 namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
index 226e1a2..0381717 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h
@@ -61,11 +61,10 @@
     /** Default constructor */
     GpuKernelArgumentInfo() = default;
     /** Constructor */
-    GpuKernelArgumentInfo(Type type)
-        : type{ type }
+    GpuKernelArgumentInfo(Type type) : type{type}
     {
     }
-    Type type{ Type::Tensor_4D_t_Buffer };
+    Type type{Type::Tensor_4D_t_Buffer};
 };
 bool operator==(const GpuKernelArgumentInfo &info0, const GpuKernelArgumentInfo &info1);
 /** Kernel argument information linked with its corresponding @ref ITensorInfo
@@ -79,10 +78,8 @@
      * @param[in] tensor_info     Associated @ref ITensorInfo
      * @param[in] kernel_arg_info Associated @ref GpuKernelArgumentInfo
      */
-    GpuKernelArgument(const ITensorInfo           &tensor_info,
-                      const GpuKernelArgumentInfo &kernel_arg_info)
-        : _tensor_info{ tensor_info },
-          _kernel_arg_info{ kernel_arg_info }
+    GpuKernelArgument(const ITensorInfo &tensor_info, const GpuKernelArgumentInfo &kernel_arg_info)
+        : _tensor_info{tensor_info}, _kernel_arg_info{kernel_arg_info}
     {
     }
     /** Get workload tensor id */
@@ -200,12 +197,12 @@
         TensorComponent /** @ref TensorComponentType */
     };
     GpuKernelArgumentBinding(ITensorInfo::Id id, TensorStorageType storage)
-        : _type{ Type::TensorStorage }, _id{ id }, _value{}
+        : _type{Type::TensorStorage}, _id{id}, _value{}
     {
         _value.tensor_storage_type = storage;
     }
     GpuKernelArgumentBinding(ITensorInfo::Id id, TensorComponentType component)
-        : _type{ Type::TensorComponent }, _id{ id }, _value{}
+        : _type{Type::TensorComponent}, _id{id}, _value{}
     {
         _value.tensor_component_type = component;
     }
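
A hedged sketch of the two tagged constructors above: one binding carries the tensor's storage (here the raw buffer pointer) and the other a scalar metadata component. The function name and the particular enumerators are illustrative; both enumerators appear elsewhere in this patch.

    std::vector<arm_compute::experimental::dynamic_fusion::GpuKernelArgumentBinding>
    bind_buffer_and_offset(arm_compute::ITensorInfo::Id id)
    {
        using namespace arm_compute::experimental::dynamic_fusion;

        return {
            GpuKernelArgumentBinding(id, TensorStorageType::ClBufferUint8Ptr),     // storage binding
            GpuKernelArgumentBinding(id, TensorComponentType::OffsetFirstElement), // component binding
        };
    }
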
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
index 5a65ede..1a458c9 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.cpp
@@ -31,35 +31,31 @@
 {
 namespace dynamic_fusion
 {
-std::vector<DependencyGraph::TensorId> GpuKernelComponentGraph::get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
+std::vector<DependencyGraph::TensorId>
+GpuKernelComponentGraph::get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
 {
     std::vector<DependencyGraph::TensorId> tensor_ids{};
-    std::transform(
-        std::begin(tensors), std::end(tensors),
-        std::back_inserter(tensor_ids),
-        [](const auto & t)
-    {
-        return t->id();
-    });
+    std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids),
+                   [](const auto &t) { return t->id(); });
     return tensor_ids;
 }
 
 GpuKernelComponentGraph::GpuKernelComponentGraph(GpuWorkloadContext *context, GpuComponentServices *services)
-    : _context{ context }, _services{ services }, _components{}, _tensors{}, _dependency_graph{}
+    : _context{context}, _services{services}, _components{}, _tensors{}, _dependency_graph{}
 {
 }
 
 GpuKernelComponentStream GpuKernelComponentGraph::fuse(const MemoryDescriptorMap &mem_map) const
 {
-    GpuKernelComponentStream stream{ _context, _services, mem_map };
+    GpuKernelComponentStream stream{_context, _services, mem_map};
     const auto               op_seq = _dependency_graph.build_operators_sequence();
 
     stream.new_component_group();
-    for(auto op : op_seq)
+    for (auto op : op_seq)
     {
         const auto component = _components.at(op.op).get();
         const auto success   = stream.add_component(component);
-        if(!success) // Assume first failure was because the root component is unfusable
+        if (!success) // Assume first failure was because the root component is unfusable
         {
             stream.new_component_group();
             const auto success = stream.add_component(component);
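
The get_tensor_ids() helper above is a standard transform-into-ids pattern; a self-contained version with a plain struct standing in for ITensorInfo:

    #include <algorithm>
    #include <iterator>
    #include <vector>

    struct Info
    {
        int id;
    };

    std::vector<int> collect_ids(const std::vector<const Info *> &tensors)
    {
        std::vector<int> ids{};
        std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(ids),
                       [](const Info *t) { return t->id; });
        return ids;
    }
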
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
index 85c9b45..6f871a3 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h
@@ -70,21 +70,21 @@
      * @param[in] args Component arguments except for component id, which is auto-allocated
      */
     template <typename T, typename... Args>
-    void add_new_component(Args &&... args)
+    void add_new_component(Args &&...args)
     {
-        auto                      comp           = _services->component_factory().create<T>(std::forward<Args>(args)...);
-        ArgumentPack<ITensorInfo> tensors        = comp->tensors();
+        auto                      comp    = _services->component_factory().create<T>(std::forward<Args>(args)...);
+        ArgumentPack<ITensorInfo> tensors = comp->tensors();
         const auto                src_tensor_ids = get_tensor_ids(tensors.get_const_src_tensors());
         const auto                dst_tensor_ids = get_tensor_ids(tensors.get_const_dst_tensors());
-        bool                      success        = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids);
+        bool                      success = _dependency_graph.add_operator(comp->id(), src_tensor_ids, dst_tensor_ids);
         ARM_COMPUTE_UNUSED(success);
         ARM_COMPUTE_ERROR_ON(!success);
         _components[comp->id()] = std::move(comp);
-        for(auto t : tensors.get_const_src_tensors())
+        for (auto t : tensors.get_const_src_tensors())
         {
             _tensors[t->id()] = t;
         }
-        for(auto t : tensors.get_const_dst_tensors())
+        for (auto t : tensors.get_const_dst_tensors())
         {
             _tensors[t->id()] = t;
         }
@@ -99,11 +99,11 @@
 
 private:
     static std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors);
-    GpuWorkloadContext   *_context;
-    GpuComponentServices *_services;
+    GpuWorkloadContext                           *_context;
+    GpuComponentServices                         *_services;
     std::map<ComponentId, std::unique_ptr<IGpuKernelComponent>> _components;
     std::map<ITensorInfo::Id, const ITensorInfo *>              _tensors;
-    DependencyGraph _dependency_graph{};
+    DependencyGraph                                             _dependency_graph{};
 };
 } // namespace dynamic_fusion
 } // namespace experimental
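
add_new_component(Args &&...args) above forwards its arguments untouched to the component factory; the same perfect-forwarding pattern in a self-contained form:

    #include <memory>
    #include <utility>

    template <typename T, typename... Args>
    std::unique_ptr<T> create_component(Args &&...args)
    {
        // std::forward preserves each argument's value category (lvalue vs rvalue)
        // when handing it to T's constructor.
        return std::make_unique<T>(std::forward<Args>(args)...);
    }
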
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
index 81c3f0c..5a6d125 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 
 #include <algorithm>
@@ -37,86 +38,87 @@
 {
 bool GpuKernelComponentGroup::add_component(ComponentPtr component)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(
-        _finalized, "The component group has been finalized and cannot be altered.");
+    ARM_COMPUTE_ERROR_ON_MSG(_finalized, "The component group has been finalized and cannot be altered.");
 
     // note: Constraint 1 is guaranteed as a precondition
     // Constraint 2
-    if(component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
+    if (component->type() != GpuComponentType::Output && _components.size() >= max_fused_components)
     {
         return false;
     }
     // Constraint 3.1: Pattern: (Unfusable + Output)
-    if(!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable && component->type() != GpuComponentType::Output)
+    if (!_components.empty() && get_root_component()->type() == GpuComponentType::Unfusable &&
+        component->type() != GpuComponentType::Output)
     {
         return false;
     }
     // Constraint 3.2
-    if(!_components.empty() && (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output))
+    if (!_components.empty() &&
+        (component->type() != GpuComponentType::Simple && component->type() != GpuComponentType::Output))
     {
         return false;
     }
     // Constraint 4
-    if(component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
+    if (component->type() != GpuComponentType::Unfusable && component->tensors().get_const_dst_tensors().size() != 1U)
     {
         return false;
     }
     // Constraint 5
-    if(!_components.empty() && !(get_root_component()->properties() == component->properties()))
+    if (!_components.empty() && !(get_root_component()->properties() == component->properties()))
     {
         return false;
     }
     // Constraint 7
-    if(!_components.empty())
+    if (!_components.empty())
     {
         const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
         ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
         const auto first_dst_tensor = root_dst_tensors[0];
         const auto dst_tensors      = component->tensors().get_const_dst_tensors();
-        for(const auto &t : root_dst_tensors)
+        for (const auto &t : root_dst_tensors)
         {
-            if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+            if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
             {
                 return false;
             }
         }
-        for(const auto &t : dst_tensors)
+        for (const auto &t : dst_tensors)
         {
-            if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+            if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
             {
                 return false;
             }
         }
     }
     // Constraint 8
-    if(!_components.empty())
+    if (!_components.empty())
     {
         const auto root_dst_tensors = get_root_component()->tensors().get_const_dst_tensors();
         ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
         const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
         const auto dst_tensors             = component->tensors().get_const_dst_tensors();
-        for(const auto &t : root_dst_tensors)
+        for (const auto &t : root_dst_tensors)
         {
-            if(t->data_layout() != first_dst_tensor_layout)
+            if (t->data_layout() != first_dst_tensor_layout)
             {
                 return false;
             }
         }
-        for(const auto &t : dst_tensors)
+        for (const auto &t : dst_tensors)
         {
-            if(t->data_layout() != first_dst_tensor_layout)
+            if (t->data_layout() != first_dst_tensor_layout)
             {
                 return false;
             }
         }
     }
     // Constraint 9
-    if(component->tensors().get_const_dst_tensors().size() >= max_dst_tensors)
+    if (component->tensors().get_const_dst_tensors().size() >= max_dst_tensors)
     {
         return false;
     }
     // Constraint 9 corollary
-    if(component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors)
+    if (component->type() == GpuComponentType::Output && _components.size() >= max_fused_components + max_dst_tensors)
     {
         return false;
     }
@@ -126,36 +128,36 @@
 
 void GpuKernelComponentGroup::finalize()
 {
-    if(_finalized)
+    if (_finalized)
     {
         return;
     }
 
     _finalized = true;
 
-    std::set<const ITensorInfo *> output_tensors;
+    std::set<const ITensorInfo *>                                   output_tensors;
     std::map<const ITensorInfo *, std::vector<const ITensorInfo *>> possible_tile_map;
-    std::map<const ITensorInfo *, int32_t> tile_usages;
+    std::map<const ITensorInfo *, int32_t>                          tile_usages;
 
-    for(auto component : _components)
+    for (auto component : _components)
     {
-        const auto tensors = component->tensors();
+        const auto tensors     = component->tensors();
         const auto src_tensors = tensors.get_const_src_tensors();
         const auto dst_tensors = tensors.get_const_dst_tensors();
 
         // Detect input, output and intermediate tensors.
-        for(auto tensor : src_tensors)
+        for (auto tensor : src_tensors)
         {
             const auto output_tensors_it = output_tensors.find(tensor);
 
-            if(output_tensors_it != output_tensors.end())
+            if (output_tensors_it != output_tensors.end())
             {
                 // This tensor is the output of another operator.
                 // It must be marked as intermediate tensor.
                 output_tensors.erase(output_tensors_it);
                 _interm_tensors.insert(tensor);
             }
-            else if(_interm_tensors.find(tensor) == _interm_tensors.end())
+            else if (_interm_tensors.find(tensor) == _interm_tensors.end())
             {
                 _input_tensors.insert(tensor);
 
@@ -164,7 +166,7 @@
             }
         }
 
-        for(auto tensor : dst_tensors)
+        for (auto tensor : dst_tensors)
         {
             ARM_COMPUTE_ERROR_ON(_input_tensors.find(tensor) != _input_tensors.end());
             ARM_COMPUTE_ERROR_ON(output_tensors.find(tensor) != output_tensors.end());
@@ -177,27 +179,27 @@
 
         // Check if the output can overwrite the input tile.
         const auto component_type = component->type();
-        if(component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output)
+        if (component_type == GpuComponentType::Simple || component_type == GpuComponentType::Output)
         {
             ARM_COMPUTE_ERROR_ON(dst_tensors.size() != 1);
 
-            const auto dst_tensor = dst_tensors[0];
-            const auto &dst_shape = dst_tensor->tensor_shape();
-            const auto &dst_type = dst_tensor->data_type();
+            const auto  dst_tensor = dst_tensors[0];
+            const auto &dst_shape  = dst_tensor->tensor_shape();
+            const auto &dst_type   = dst_tensor->data_type();
 
             tile_usages[dst_tensor] = 0;
 
-            for(auto src_tensor : src_tensors)
+            for (auto src_tensor : src_tensors)
             {
                 const auto &src_shape = src_tensor->tensor_shape();
-                const auto &src_type = src_tensor->data_type();
+                const auto &src_type  = src_tensor->data_type();
 
-                if(src_shape == dst_shape && src_type == dst_type)
+                if (src_shape == dst_shape && src_type == dst_type)
                 {
                     const auto tile_usages_it = tile_usages.find(src_tensor);
                     ARM_COMPUTE_ERROR_ON(tile_usages_it == tile_usages.end());
 
-                    if(component_type == GpuComponentType::Simple || tile_usages_it->second > 0)
+                    if (component_type == GpuComponentType::Simple || tile_usages_it->second > 0)
                     {
                         // Increase the number of tile usages unless this component is an output
                         // and the tile has not been shared with any component.
@@ -212,7 +214,7 @@
         else
         {
             // Outputs of complex and unfusable components need dedicated tile.
-            for(auto tensor : dst_tensors)
+            for (auto tensor : dst_tensors)
             {
                 tile_usages[tensor] = 0;
             }
@@ -220,25 +222,25 @@
     }
 
     // Find the smallest list of tiles that the intermediate tensors need to write to.
-    for(auto tensor : _input_tensors)
+    for (auto tensor : _input_tensors)
     {
         _tile_map[tensor] = tensor;
     }
 
-    for(auto component : _components)
+    for (auto component : _components)
     {
         const auto dst_tensors = component->tensors().get_const_dst_tensors();
 
-        for(auto tensor : dst_tensors)
+        for (auto tensor : dst_tensors)
         {
             const auto target_tiles = possible_tile_map.at(tensor);
-            _tile_map[tensor] = tensor;
+            _tile_map[tensor]       = tensor;
 
-            for(auto target : target_tiles)
+            for (auto target : target_tiles)
             {
                 const auto num_usage = tile_usages[target];
 
-                if(num_usage <= 1)
+                if (num_usage <= 1)
                 {
                     // The target tile is consumed by only this operator, so we can reuse it
                     // for the destination tensor data.
@@ -249,26 +251,23 @@
         }
     }
 
-    for(auto tensor : output_tensors)
+    for (auto tensor : output_tensors)
     {
         _tile_map[tensor] = tensor;
     }
 
     // All intermediate tensors that cannot be shared with any previous tensor
     // will need to be declared as tile variable.
-    for(auto tensor_tile : _tile_map)
+    for (auto tensor_tile : _tile_map)
     {
-        if(tensor_tile.first == tensor_tile.second &&
-           _interm_tensors.find(tensor_tile.first) != _interm_tensors.end())
+        if (tensor_tile.first == tensor_tile.second && _interm_tensors.find(tensor_tile.first) != _interm_tensors.end())
         {
             _tiles.push_back(tensor_tile.first);
         }
     }
 
-    std::set_union(
-        _input_tensors.begin(), _input_tensors.end(),
-        output_tensors.begin(), output_tensors.end(),
-        std::back_inserter(_argument_tensors));
+    std::set_union(_input_tensors.begin(), _input_tensors.end(), output_tensors.begin(), output_tensors.end(),
+                   std::back_inserter(_argument_tensors));
     _any_output_tensor = *output_tensors.begin();
 }
 
@@ -282,7 +281,7 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(!_finalized, "The component group must have been finalized.");
 
-    if(_tile_map.find(tensor) != _tile_map.end())
+    if (_tile_map.find(tensor) != _tile_map.end())
     {
         return _tile_map.at(tensor);
     }
@@ -304,7 +303,7 @@
 
 GpuKernelComponentGroup::ComponentPtr GpuKernelComponentGroup::get_root_component() const
 {
-    if(empty())
+    if (empty())
     {
         return nullptr;
     }
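
The finalize() hunks above implement tile reuse: for every destination tensor, the group either overwrites a source tile (allowed only when that tile has at most one remaining consumer) or assigns a dedicated tile. Below is a minimal sketch of that bookkeeping under simplified assumptions — plain integer ids in place of ITensorInfo pointers, and hand-seeded usage counts; none of these names are the library's API:

    #include <cstdio>
    #include <map>
    #include <vector>

    // Simplified sketch of the tile-reuse pass in finalize().
    // TensorId stands in for ITensorInfo*; all data is hand-seeded for illustration.
    using TensorId = int;

    int main()
    {
        // Number of components still reading the tile that backs each tensor.
        std::map<TensorId, int> tile_usages{{0, 1}, {1, 2}};
        // For each destination tensor: source tiles of matching shape/type it could overwrite.
        std::map<TensorId, std::vector<TensorId>> possible_tile_map{{2, {0}}, {3, {1}}};

        // Input tensors map to themselves, exactly like "_tile_map[tensor] = tensor" above.
        std::map<TensorId, TensorId> tile_map{{0, 0}, {1, 1}};

        for (const auto &entry : possible_tile_map)
        {
            const TensorId dst = entry.first;
            tile_map[dst]      = dst; // default: dedicated tile
            for (TensorId target : entry.second)
            {
                if (tile_usages[target] <= 1) // sole consumer: safe to write in place
                {
                    tile_map[dst] = tile_map[target];
                    break;
                }
            }
        }

        for (const auto &tt : tile_map)
        {
            std::printf("tensor %d writes tile %d\n", tt.first, tt.second);
        }
        return 0;
    }

Tensor 2 reuses tile 0 (its only consumer is this operator), while tensor 3 keeps its own tile because tile 1 still has two readers — the same outcome the "num_usage <= 1" branch above produces.
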
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
index c939aec..6ad71ab 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h
@@ -25,12 +25,11 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTGROUP
 
 #include "components/Types.h"
-
 #include <cstdint>
 #include <cstdlib>
-#include <vector>
-#include <set>
 #include <map>
+#include <set>
+#include <vector>
 
 namespace arm_compute
 {
@@ -129,9 +128,9 @@
     /** Get the number of components within the group */
     size_t size() const;
     /** Check if the component group is empty */
-    bool empty() const;
-    ComponentPtr &operator[](size_t index);
-    const ComponentPtr &operator[](size_t index) const;
+    bool                                               empty() const;
+    ComponentPtr                                      &operator[](size_t index);
+    const ComponentPtr                                &operator[](size_t index) const;
     typename std::vector<ComponentPtr>::iterator       begin();
     typename std::vector<ComponentPtr>::iterator       end();
     typename std::vector<ComponentPtr>::const_iterator begin() const;
@@ -142,13 +141,13 @@
 private:
     std::vector<ComponentPtr> _components{};
 
-    bool _finalized{ false };
+    bool _finalized{false};
 
-    std::vector<const ITensorInfo *> _argument_tensors{};
-    std::set<const ITensorInfo *> _input_tensors{};
-    std::set<const ITensorInfo *> _interm_tensors{};
-    const ITensorInfo *_any_output_tensor{ nullptr };
-    std::vector<const ITensorInfo *> _tiles{};
+    std::vector<const ITensorInfo *>                   _argument_tensors{};
+    std::set<const ITensorInfo *>                      _input_tensors{};
+    std::set<const ITensorInfo *>                      _interm_tensors{};
+    const ITensorInfo                                 *_any_output_tensor{nullptr};
+    std::vector<const ITensorInfo *>                   _tiles{};
     std::map<const ITensorInfo *, const ITensorInfo *> _tile_map{};
 };
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
index a2b6623..8042e3d 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.cpp
@@ -23,9 +23,9 @@
  */
 #include "GpuKernelComponentStream.h"
 
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 
 namespace arm_compute
 {
@@ -33,8 +33,10 @@
 {
 namespace dynamic_fusion
 {
-GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map)
-    : _context{ context }, _services{ services }, _component_groups{}, _mem_map{ mem_map }
+GpuKernelComponentStream::GpuKernelComponentStream(GpuWorkloadContext        *context,
+                                                   GpuComponentServices      *services,
+                                                   const MemoryDescriptorMap &mem_map)
+    : _context{context}, _services{services}, _component_groups{}, _mem_map{mem_map}
 {
 }
 
@@ -42,7 +44,7 @@
 {
     GpuWorkloadSourceCode source_code;
     // Traverse through component groups and assemble workload together
-    for(auto && group : _component_groups)
+    for (auto &&group : _component_groups)
     {
         group.finalize();
 
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
index ba2503a..ef8a8a1 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelComponentStream.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUKERNELCOMPONENTSTREAM
 
 #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
 
@@ -53,7 +54,9 @@
      * @param[in] services @ref GpuComponentServices to be used throughout the stream
      * @param[in] mem_map  @ref MemoryDescriptor map used to assemble the @ref GpuWorkloadSourceCode
      */
-    GpuKernelComponentStream(GpuWorkloadContext *context, GpuComponentServices *services, const MemoryDescriptorMap &mem_map);
+    GpuKernelComponentStream(GpuWorkloadContext        *context,
+                             GpuComponentServices      *services,
+                             const MemoryDescriptorMap &mem_map);
     /** Allow instances of this class to be copy constructed */
     GpuKernelComponentStream(const GpuKernelComponentStream &stream) = default;
     /** Allow instances of this class to be copied */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
index 64e1cdc..24812cd 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
diff --git a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
index c99984f..502ceab 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuLogicalKernel.cpp
@@ -26,9 +26,9 @@
 #include "arm_compute/core/experimental/Types.h"
 
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.h"
 #else // ACL_INTERNAL_TEST_CKW_IN_DF
@@ -42,7 +42,7 @@
 namespace dynamic_fusion
 {
 GpuLogicalKernel::GpuLogicalKernel(GpuComponentServices *services, const GpuKernelComponentGroup &components)
-    : _comp_group{ components }, _store_components{}
+    : _comp_group{components}, _store_components{}
 {
     ARM_COMPUTE_UNUSED(services);
 }
@@ -51,9 +51,9 @@
 {
     GpuKernelSourceCode code;
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    ClTemplateWriter writer { _comp_group };
+    ClTemplateWriter writer{_comp_group};
 #else  // ACL_INTERNAL_TEST_CKW_IN_DF
-    GpuCkwDriver writer { _comp_group };
+    GpuCkwDriver writer{_comp_group};
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
     code.name(writer.get_name());
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
index 7bb14c8..aec8b9d 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.cpp
@@ -36,20 +36,15 @@
 std::vector<DependencyGraph::TensorId> get_tensor_ids(const std::vector<const ITensorInfo *> tensors)
 {
     std::vector<DependencyGraph::TensorId> tensor_ids{};
-    std::transform(
-        std::begin(tensors), std::end(tensors),
-        std::back_inserter(tensor_ids),
-        [](const auto & t)
-    {
-        return t->id();
-    });
+    std::transform(std::begin(tensors), std::end(tensors), std::back_inserter(tensor_ids),
+                   [](const auto &t) { return t->id(); });
     return tensor_ids;
 }
 
 } // namespace
 
 Operator::Operator(OperatorId id, GpuOperatorType operator_type, const ArgumentPack<ITensorInfo> &tensors)
-    : _id{ id }, _operator_type{ operator_type }, _tensors{ tensors }
+    : _id{id}, _operator_type{operator_type}, _tensors{tensors}
 {
 }
 
@@ -73,69 +68,69 @@
     const auto src_tensor_ids = get_tensor_ids(op.tensors().get_const_src_tensors());
     const auto dst_tensor_ids = get_tensor_ids(op.tensors().get_const_dst_tensors());
     // Constraint 1
-    if(!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output))
+    if (!_graph.try_add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output))
     {
         return false;
     }
     // Constraint 2
-    if(_operators.size() >= max_fused_operators)
+    if (_operators.size() >= max_fused_operators)
     {
         return false;
     }
     // Constraint 3.1: Pattern: (Unfusable)
-    if(_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable)
+    if (_operators.size() > 0 && get_root_operator()->operator_type() == GpuOperatorType::Unfusable)
     {
         return false;
     }
     // Constraint 3.2
-    if(_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple))
+    if (_operators.size() > 0 && (op.operator_type() != GpuOperatorType::Simple))
     {
         return false;
     }
     // Constraint 4
-    if(op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U)
+    if (op.operator_type() != GpuOperatorType::Unfusable && op.tensors().get_const_dst_tensors().size() != 1U)
     {
         return false;
     }
     // Constraint 5
-    if(_operators.size() > 0)
+    if (_operators.size() > 0)
     {
         const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
         ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
         const auto first_dst_tensor = root_dst_tensors[0];
         const auto dst_tensors      = op.tensors().get_const_dst_tensors();
-        for(const auto &t : root_dst_tensors)
+        for (const auto &t : root_dst_tensors)
         {
-            if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+            if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
             {
                 return false;
             }
         }
-        for(const auto &t : dst_tensors)
+        for (const auto &t : dst_tensors)
         {
-            if(detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
+            if (detail::have_different_dimensions(t->tensor_shape(), first_dst_tensor->tensor_shape(), 0))
             {
                 return false;
             }
         }
     }
     // Constraint 6
-    if(_operators.size() > 0)
+    if (_operators.size() > 0)
     {
         const auto root_dst_tensors = get_root_operator()->tensors().get_const_dst_tensors();
         ARM_COMPUTE_ERROR_ON(root_dst_tensors.empty());
         const auto first_dst_tensor_layout = root_dst_tensors[0]->data_layout();
         const auto dst_tensors             = op.tensors().get_const_dst_tensors();
-        for(const auto &t : root_dst_tensors)
+        for (const auto &t : root_dst_tensors)
         {
-            if(t->data_layout() != first_dst_tensor_layout)
+            if (t->data_layout() != first_dst_tensor_layout)
             {
                 return false;
             }
         }
-        for(const auto &t : dst_tensors)
+        for (const auto &t : dst_tensors)
         {
-            if(t->data_layout() != first_dst_tensor_layout)
+            if (t->data_layout() != first_dst_tensor_layout)
             {
                 return false;
             }
@@ -151,16 +146,17 @@
     _graph.add_operator_as_linear(op.id(), src_tensor_ids, dst_tensor_ids, is_output);
     _operators[op.id()] = op;
 }
-Operator GpuOperatorGroup::new_operator(const GpuOperatorType &operator_type, const ArgumentPack<ITensorInfo> &tensors) const
+Operator GpuOperatorGroup::new_operator(const GpuOperatorType           &operator_type,
+                                        const ArgumentPack<ITensorInfo> &tensors) const
 {
     auto new_id = static_cast<OperatorId>(_operators.size());
-    return Operator{ new_id, operator_type, tensors };
+    return Operator{new_id, operator_type, tensors};
 }
 const Operator *GpuOperatorGroup::get_root_operator() const
 {
     const auto roots = _graph.get_root_ops();
     ARM_COMPUTE_ERROR_ON(roots.size() > 1);
-    if(roots.empty())
+    if (roots.empty())
     {
         return nullptr;
     }
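
Constraints 5 and 6 in the try_add() hunks above share one shape: every destination tensor must match the root operator's first destination, on tensor shape for Constraint 5 and on data layout for Constraint 6. Here is the uniformity test in isolation, over a simplified TensorDesc (a stand-in type, not the library's ITensorInfo):

    #include <string>
    #include <vector>

    // Stand-in holding only the fields the two constraints actually read.
    struct TensorDesc
    {
        std::vector<int> shape;
        std::string      layout; // e.g. "NHWC"
    };

    // Mirrors the early-return-false structure of Constraints 5 and 6:
    // any mismatch against the reference rejects the fusion candidate.
    bool dst_tensors_uniform(const TensorDesc &reference, const std::vector<TensorDesc> &candidates)
    {
        for (const auto &t : candidates)
        {
            if (t.shape != reference.shape || t.layout != reference.layout)
            {
                return false;
            }
        }
        return true;
    }

In the real code the two checks stay separate because the shape comparison goes through detail::have_different_dimensions rather than a plain equality operator.
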
diff --git a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
index 308a9d7..0a2369d 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h
@@ -25,9 +25,11 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUOPERATORGROUP
 
 #include "arm_compute/core/ITensorInfo.h"
+
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
 #include "src/dynamic_fusion/sketch/utils/DependencyGraph.h"
+
 #include <map>
 
 namespace arm_compute
@@ -104,7 +106,7 @@
     const Operator *get_root_operator() const;
 
 private:
-    DependencyGraph _graph{};
+    DependencyGraph                _graph{};
     std::map<OperatorId, Operator> _operators{};
 };
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
index c2bd012..36cad79 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContext.cpp
@@ -23,7 +23,9 @@
  */
 
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+
 #include "arm_compute/core/CL/CLCompileContext.h"
+
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
 
 namespace arm_compute
@@ -33,7 +35,7 @@
 namespace dynamic_fusion
 {
 GpuWorkloadContext::GpuWorkloadContext(CLCompileContext *cl_compile_ctx)
-    : _impl{ std::make_unique<Impl>(GpuLanguage::OpenCL, cl_compile_ctx) }
+    : _impl{std::make_unique<Impl>(GpuLanguage::OpenCL, cl_compile_ctx)}
 {
 }
 
@@ -74,7 +76,11 @@
 }
 
 GpuWorkloadContext::Impl::Impl(GpuLanguage gpu_language, CLCompileContext *cl_compile_ctx)
-    : _gpu_language(gpu_language), _cl_compile_ctx(cl_compile_ctx), _next_tensor_id(1), _mem_map(), _managed_tensor_info()
+    : _gpu_language(gpu_language),
+      _cl_compile_ctx(cl_compile_ctx),
+      _next_tensor_id(1),
+      _mem_map(),
+      _managed_tensor_info()
 {
 }
 
@@ -100,7 +106,7 @@
     const auto tensor_id = next_tensor_id();
 
     tensor_info.set_id(tensor_id);
-    _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::User };
+    _mem_map[tensor_id] = MemoryDescriptor{MemoryType::User};
     // Save a *copy* of the user tensor info in workload context for future reference
     // Note that this means if the user modifies the @p tensor_info, the change will not be reflected in the context
     _managed_tensor_info.emplace(tensor_info.id(), std::make_unique<TensorInfo>(tensor_info));
@@ -111,7 +117,7 @@
     auto       tensor_info = std::make_unique<TensorInfo>();
     const auto tensor_id   = -next_tensor_id();
     tensor_info->set_id(tensor_id);
-    _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Virtual };
+    _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Virtual};
     auto inserted       = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
     return inserted.first->second.get();
 }
@@ -121,7 +127,7 @@
     auto       tensor_info = std::make_unique<TensorInfo>(itensor_info);
     const auto tensor_id   = next_tensor_id();
     tensor_info->set_id(tensor_id);
-    _mem_map[tensor_id] = MemoryDescriptor{ MemoryType::Auxiliary, AuxMemoryInfo{ tensor_info->total_size() } };
+    _mem_map[tensor_id] = MemoryDescriptor{MemoryType::Auxiliary, AuxMemoryInfo{tensor_info->total_size()}};
     auto inserted       = _managed_tensor_info.emplace(tensor_info->id(), std::move(tensor_info));
     return inserted.first->second.get();
 }
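
The three Impl hunks above follow a single id convention: user tensors get positive ids with MemoryType::User, virtual (intermediate) tensors get negated ids with MemoryType::Virtual, and auxiliary tensors additionally record their allocation size. A compact sketch of that bookkeeping with simplified types — ContextSketch and its members are illustrative, not the library's API:

    #include <cstddef>
    #include <map>

    enum class MemoryType
    {
        User,
        Virtual,
        Auxiliary
    };

    struct MemoryDescriptor
    {
        MemoryType  type;
        std::size_t aux_size{0}; // only meaningful for Auxiliary
    };

    class ContextSketch
    {
    public:
        int register_user_tensor()
        {
            const int id = _next_tensor_id++;
            _mem_map[id] = MemoryDescriptor{MemoryType::User};
            return id;
        }
        int create_virtual_tensor()
        {
            const int id = -(_next_tensor_id++); // negative ids mark virtual tensors
            _mem_map[id] = MemoryDescriptor{MemoryType::Virtual};
            return id;
        }
        int create_aux_tensor(std::size_t total_size)
        {
            const int id = _next_tensor_id++;
            _mem_map[id] = MemoryDescriptor{MemoryType::Auxiliary, total_size};
            return id;
        }

    private:
        int                             _next_tensor_id{1};
        std::map<int, MemoryDescriptor> _mem_map{};
    };
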
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
index c169476..7d96990 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h
@@ -27,8 +27,8 @@
 
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/ITensorInfo.h"
-#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
 
 namespace arm_compute
 {
@@ -93,8 +93,8 @@
     GpuLanguage       _gpu_language;
     CLCompileContext *_cl_compile_ctx;
 
-    ITensorInfo::Id     _next_tensor_id;
-    MemoryDescriptorMap _mem_map;
+    ITensorInfo::Id                                        _next_tensor_id;
+    MemoryDescriptorMap                                    _mem_map;
     std::map<ITensorInfo::Id, std::unique_ptr<TensorInfo>> _managed_tensor_info;
 };
 
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
index d3a20c0..973f7c7 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 
 namespace arm_compute
@@ -30,8 +31,7 @@
 {
 namespace dynamic_fusion
 {
-GpuWorkloadSketch::GpuWorkloadSketch(Context *context)
-    : _impl{ std::make_unique<Implementation>(context) }
+GpuWorkloadSketch::GpuWorkloadSketch(Context *context) : _impl{std::make_unique<Implementation>(context)}
 {
 }
 GpuWorkloadSketch::~GpuWorkloadSketch()
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
index d303389..fea4fe9 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h
@@ -24,8 +24,9 @@
 #ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_GPUWORKLOADSKETCHIMPL
 
-#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
+#include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
 #include "src/dynamic_fusion/sketch/gpu/GpuComponentServices.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGraph.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuOperatorGroup.h"
@@ -45,12 +46,8 @@
      *
      * @param[in] context global workload creation context
      */
-    explicit Implementation(
-        Context *context)
-        : _context{ context },
-          _comp_services{},
-          _component_graph{ _context, &_comp_services },
-          _operator_group{}
+    explicit Implementation(Context *context)
+        : _context{context}, _comp_services{}, _component_graph{_context, &_comp_services}, _operator_group{}
     {
     }
     /** Prevent instances of this class from being copy constructed */
diff --git a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
index 578366d..43bcc47 100644
--- a/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
+++ b/src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/dynamic_fusion/sketch/MemoryDescriptor.h"
+
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadContextImpl.h"
 
@@ -45,7 +46,7 @@
  */
 GpuKernelArgumentList extract_kernel_args_for_one_tensor(GpuKernelArgumentList &flat_kernel_args)
 {
-    if(flat_kernel_args.empty())
+    if (flat_kernel_args.empty())
     {
         return {};
     }
@@ -56,10 +57,10 @@
     flat_kernel_args.pop_front();
     const auto tensor_id = karg_head.id();
 
-    while(!flat_kernel_args.empty())
+    while (!flat_kernel_args.empty())
     {
         const GpuKernelArgumentBinding &karg = flat_kernel_args.front();
-        if(karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments
+        if (karg.id() != tensor_id) // Encounter the next tensor, return the current tensor's kernel arguments
         {
             return tensor_kargs;
         }
@@ -68,7 +69,7 @@
     }
     return tensor_kargs;
 }
-}
+} // namespace
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
 /** Uniquely identifies a @ref GpuUnitWorkload within a @ref GpuWorkloadSourceCode */
 using UnitWorkloadId = int32_t;
@@ -92,9 +93,7 @@
     GpuWorkloadArgument(const ITensorInfo           &tensor_info,
                         const MemoryDescriptor      &mem_desc,
                         const GpuKernelArgumentInfo &kernel_arg_info)
-        : _tensor_info{ tensor_info },
-          _mem_desc{ mem_desc },
-          _kernel_arg_info{ kernel_arg_info }
+        : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_arg_info{kernel_arg_info}
     {
     }
 #else  // ACL_INTERNAL_TEST_CKW_IN_DF
@@ -107,9 +106,7 @@
     GpuWorkloadArgument(const ITensorInfo           &tensor_info,
                         const MemoryDescriptor      &mem_desc,
                         const GpuKernelArgumentList &kernel_args)
-        : _tensor_info{ tensor_info },
-          _mem_desc{ mem_desc },
-          _kernel_args{ kernel_args }
+        : _tensor_info{tensor_info}, _mem_desc{mem_desc}, _kernel_args{kernel_args}
     {
     }
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
@@ -175,9 +172,9 @@
     TensorInfo       _tensor_info{};
     MemoryDescriptor _mem_desc{};
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-    GpuKernelArgumentInfo _kernel_arg_info {};
+    GpuKernelArgumentInfo _kernel_arg_info{};
 #else  // ACL_INTERNAL_TEST_CKW_IN_DF
-    GpuKernelArgumentList     _kernel_args {};
+    GpuKernelArgumentList _kernel_args{};
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
 };
 
@@ -190,7 +187,7 @@
         Prepare, /**< Only run once at the beginning. */
         Run,     /**< Run every time after the first time. */
     };
-    Stage stage{ Stage::Run };
+    Stage stage{Stage::Run};
 };
 
 inline bool operator==(const UnitWorkloadStage &stage0, const UnitWorkloadStage &stage1)
@@ -212,7 +209,7 @@
      * @param[in] stage       Stage of the unit workload
      */
     GpuUnitWorkload(UnitWorkloadId id, const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage)
-        : _id{ id }, _kernel_code{ kernel_code }, _stage{ stage }
+        : _id{id}, _kernel_code{kernel_code}, _stage{stage}
     {
     }
     /** Get the id of the unit workload */
@@ -253,7 +250,10 @@
      *
      * @return UnitWorkloadId  Allocated unit workload id
      */
-    UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code, const UnitWorkloadStage &stage, const MemoryDescriptorMap &mem_map, const GpuWorkloadContext *context)
+    UnitWorkloadId add_unit_workload(const GpuKernelSourceCode &kernel_code,
+                                     const UnitWorkloadStage   &stage,
+                                     const MemoryDescriptorMap &mem_map,
+                                     const GpuWorkloadContext  *context)
     {
         // Use the size of the kernel codes as Id
         const auto uwk_id    = static_cast<UnitWorkloadId>(_unit_workloads.size());
@@ -262,12 +262,13 @@
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
         ARM_COMPUTE_UNUSED(context);
         // Assemble kernel argument with memory descriptor to form workload argument
-        for(const auto &id_arg : kernel_code.arguments())
+        for (const auto &id_arg : kernel_code.arguments())
         {
-            const auto arg_id           = id_arg.first;
-            const auto arg              = id_arg.second;
-            _workload_arguments[arg_id] = GpuWorkloadArgument{ *arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info() };
-            if(_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end())
+            const auto arg_id = id_arg.first;
+            const auto arg    = id_arg.second;
+            _workload_arguments[arg_id] =
+                GpuWorkloadArgument{*arg.tensor_info(), mem_map.at(arg_id), *arg.kernel_argument_info()};
+            if (_tensor_uwork_map.find(arg_id) == _tensor_uwork_map.end())
             {
                 _tensor_uwork_map[arg_id] = std::set<UnitWorkloadId>();
             }
@@ -276,18 +277,19 @@
 #else  // ACL_INTERNAL_TEST_CKW_IN_DF
         GpuKernelArgumentList flat_kernel_args = kernel_code.arguments();
         GpuKernelArgumentList tensor_kargs{};
-        while(true)
+        while (true)
         {
             tensor_kargs = extract_kernel_args_for_one_tensor(flat_kernel_args);
-            if(tensor_kargs.empty())
+            if (tensor_kargs.empty())
             {
                 break;
             }
             else
             {
                 const auto tensor_id           = tensor_kargs.at(0).id();
-                _workload_arguments[tensor_id] = GpuWorkloadArgument{ *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs };
-                if(_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end())
+                _workload_arguments[tensor_id] = GpuWorkloadArgument{
+                    *context->implementation().get_tensor_info(tensor_id), mem_map.at(tensor_id), tensor_kargs};
+                if (_tensor_uwork_map.find(tensor_id) == _tensor_uwork_map.end())
                 {
                     _tensor_uwork_map[tensor_id] = std::set<UnitWorkloadId>();
                 }
@@ -308,7 +310,7 @@
     {
         std::vector<UnitWorkloadId> ids{};
 
-        for(const auto &uwk : _unit_workloads)
+        for (const auto &uwk : _unit_workloads)
         {
             ids.push_back(uwk.id());
         }
@@ -323,7 +325,7 @@
     std::vector<ITensorInfo::Id> tensors() const
     {
         std::vector<ITensorInfo::Id> ids{};
-        for(const auto &id_tensor : _workload_arguments)
+        for (const auto &id_tensor : _workload_arguments)
         {
             ids.push_back(id_tensor.first);
         }
@@ -337,7 +339,7 @@
     }
 
 private:
-    std::vector<GpuUnitWorkload> _unit_workloads{};
+    std::vector<GpuUnitWorkload>                        _unit_workloads{};
     std::map<ITensorInfo::Id, GpuWorkloadArgument>      _workload_arguments{};
     std::map<ITensorInfo::Id, std::set<UnitWorkloadId>> _tensor_uwork_map{};
 };
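
extract_kernel_args_for_one_tensor() above relies on the flat argument list being grouped by tensor id: it pops the head, then keeps popping while the id matches, returning one tensor's arguments per call. The same consume-a-run pattern in isolation, with (id, name) pairs standing in for GpuKernelArgumentBinding (illustrative types, not the library's):

    #include <list>
    #include <string>
    #include <utility>

    using Arg     = std::pair<int, std::string>; // (tensor id, argument name)
    using ArgList = std::list<Arg>;

    // Consume and return the leading run of arguments sharing one tensor id.
    ArgList extract_args_for_one_tensor(ArgList &flat_args)
    {
        ArgList group{};
        if (flat_args.empty())
        {
            return group;
        }
        const int tensor_id = flat_args.front().first;
        while (!flat_args.empty() && flat_args.front().first == tensor_id)
        {
            group.push_back(flat_args.front());
            flat_args.pop_front();
        }
        return group;
    }

Repeated calls drain the list one tensor at a time, which is how add_unit_workload() above loops with "while (true)" until an empty group comes back.
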
diff --git a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
index 1d8b231..ad47467 100644
--- a/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
+++ b/src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelSourceCode.h"
 
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
index 4b4c22f..c4ab110 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+
 #include "ckw/Error.h"
 
 namespace arm_compute
@@ -36,12 +37,12 @@
 {
 }
 
-GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor)
-    : _tensor(&tensor)
+GpuCkwComponentArgument::GpuCkwComponentArgument(ckw::TensorOperand &tensor) : _tensor(&tensor)
 {
 }
 
-GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand &tile, const ckw::TensorTileSampler &tile_sampler)
+GpuCkwComponentArgument &GpuCkwComponentArgument::init_virtual_tensor(ckw::TileOperand             &tile,
+                                                                      const ckw::TensorTileSampler &tile_sampler)
 {
     CKW_ASSERT(_tile == nullptr);
 
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
index 80f9138..863989a 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h
@@ -110,9 +110,9 @@
     const ckw::TensorTileSampler &tile_sampler() const;
 
 private:
-    ckw::TensorOperand *_tensor{ nullptr };
-    ckw::TileOperand   *_tile{ nullptr };
-    ckw::TensorTileSampler  _tile_sampler{};
+    ckw::TensorOperand    *_tensor{nullptr};
+    ckw::TileOperand      *_tile{nullptr};
+    ckw::TensorTileSampler _tile_sampler{};
 };
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
index a24a172..c927f32 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.cpp
@@ -23,17 +23,16 @@
  */
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h"
 
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
-
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Window.h"
-#include "src/common/utils/Log.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
 
+#include "src/common/utils/Log.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 
 using namespace ckw;
 namespace arm_compute
@@ -43,11 +42,11 @@
 namespace dynamic_fusion
 {
 GpuCkwDriver::GpuCkwDriver(const GpuKernelComponentGroup &components)
-    : _components{ components }, _kernel{ GpuTargetLanguage::OpenCL }, _code{}
+    : _components{components}, _kernel{GpuTargetLanguage::OpenCL}, _code{}
 {
     // Generate kernel name
     std::string name = "";
-    for(auto &comp : _components)
+    for (auto &comp : _components)
     {
         auto ckw_driver = comp->ckw_component_driver();
         ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
@@ -60,7 +59,7 @@
     GpuCkwScopedKernelWriter writer(&root_writer);
     GpuCkwVariableTable      vtable{};
 
-    for(auto &comp : _components)
+    for (auto &comp : _components)
     {
         auto ckw_driver = comp->ckw_component_driver();
         ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
@@ -82,7 +81,7 @@
 std::string GpuCkwDriver::get_config_id()
 {
     std::string id = "";
-    for(auto &comp : _components)
+    for (auto &comp : _components)
     {
         auto ckw_driver = comp->ckw_component_driver();
         ARM_COMPUTE_ERROR_ON(ckw_driver == nullptr);
@@ -101,9 +100,9 @@
 GpuKernelArgumentList GpuCkwDriver::get_kernel_arguments()
 {
     GpuKernelArgumentList args{};
-    for(const auto &arg : _kernel.arguments())
+    for (const auto &arg : _kernel.arguments())
     {
-        switch(arg.type())
+        switch (arg.type())
         {
             case KernelArgument::Type::TensorStorage:
             {
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
index 19db575..2ca5fb4 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwDriver.h
@@ -24,12 +24,12 @@
 #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER
 #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWDRIVER
 
+#include "ckw/Kernel.h"
+
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "src/dynamic_fusion/sketch/gpu/IGpuKernelWriter.h"
 
-#include "ckw/Kernel.h"
-
 #include <map>
 #include <string>
 
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp
index ca4f121..5f8ce91 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.cpp
@@ -23,10 +23,12 @@
  */
 
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+
 #include "ckw/Error.h"
 #include "ckw/TileInfo.h"
 
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
+
 namespace arm_compute
 {
 namespace experimental
@@ -34,21 +36,21 @@
 namespace dynamic_fusion
 {
 
-GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel)
-    : KernelWriter(kernel)
+GpuCkwKernelWriter::GpuCkwKernelWriter(ckw::Kernel &kernel) : KernelWriter(kernel)
 {
 }
 
 void GpuCkwKernelWriter::op_load_once(GpuCkwComponentArgument *tensor_or_tile, const ckw::TensorTileSampler &sampler)
 {
-    if(!tensor_or_tile->has_tile())
+    if (!tensor_or_tile->has_tile())
     {
         CKW_ASSERT(tensor_or_tile->has_tensor());
 
         auto &tensor = tensor_or_tile->tensor();
 
         const auto tile_name = tensor.name() + "_tile";
-        auto      &tile      = declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width()));
+        auto      &tile =
+            declare_tile(tile_name.c_str(), ckw::TileInfo(tensor.data_type(), sampler.height(), sampler.width()));
 
         op_load(tile, tensor, sampler);
 
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
index 043fda9..cbadbd9 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.cpp
@@ -23,6 +23,7 @@
  */
 
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
+
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 
 namespace arm_compute
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
index 4d11b5e..81049bf 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h
@@ -63,7 +63,7 @@
 
 private:
     GpuCkwKernelWriter *_writer;
-    int32_t          _parent_id_space;
+    int32_t             _parent_id_space;
 };
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
index 37c27cd..88a0cf7 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.cpp
@@ -23,11 +23,12 @@
  */
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
 
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
 #include <sstream>
 
 namespace arm_compute
@@ -36,19 +37,22 @@
 {
 namespace dynamic_fusion
 {
-GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage,
-                                                               const std::string &alias)
+GpuCkwComponentArgument *GpuCkwVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group,
+                                                               GpuCkwScopedKernelWriter      &writer,
+                                                               const ITensorInfo             *tensor,
+                                                               TensorStorageType              storage,
+                                                               const std::string             &alias)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
 
     // Do not re-declare if the variable associated with the tensor has already been declared
     auto it = _vars.find(tensor->id());
 
-    if(it != _vars.end())
+    if (it != _vars.end())
     {
         return &it->second;
     }
-    if(comp_group.is_intermediate_tensor(tensor))
+    if (comp_group.is_intermediate_tensor(tensor))
     {
         // Create a virtual tensor variable
         GpuCkwComponentArgument var;
@@ -61,7 +65,7 @@
         std::stringstream ss;
         ss << alias << "_t" << abs(tensor->id());
         const auto              uniq_name = ss.str();
-        GpuCkwComponentArgument var{ writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage)) };
+        GpuCkwComponentArgument var{writer->declare_tensor_argument(uniq_name, to_ckw(*tensor), to_ckw(storage))};
         auto                  &&inserted = _vars.emplace(tensor->id(), var);
         return &(inserted.first->second);
     }
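
declare_variable() in the hunk above is a memoised factory: it returns the cached entry for a tensor id when one exists, and only otherwise declares a new variable and stores it. A stripped-down sketch of that lookup-or-insert pattern (plain types and hypothetical names, not the library's):

    #include <cstdlib>
    #include <map>
    #include <string>

    struct Variable
    {
        std::string name;
    };

    class VariableTableSketch
    {
    public:
        // Return the cached variable for this id, declaring it on first use.
        Variable *declare_variable(int tensor_id, const std::string &alias)
        {
            auto it = _vars.find(tensor_id);
            if (it != _vars.end())
            {
                return &it->second;
            }
            // abs(): virtual tensors carry negative ids, but the name should not.
            Variable var{alias + "_t" + std::to_string(std::abs(tensor_id))};
            auto   &&inserted = _vars.emplace(tensor_id, var);
            return &inserted.first->second;
        }

    private:
        std::map<int, Variable> _vars{};
    };
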
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
index 0649dcb..2b11891 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h
@@ -25,6 +25,7 @@
 #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_GPUCKWVARIABLETABLE
 
 #include "arm_compute/core/ITensorInfo.h"
+
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
 
 #include <map>
@@ -58,8 +59,11 @@
      *
      * @return GpuCkwComponentArgument*
      */
-    GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group, GpuCkwScopedKernelWriter &writer, const ITensorInfo *tensor, TensorStorageType storage,
-                                              const std::string &alias = "unnamed");
+    GpuCkwComponentArgument *declare_variable(const GpuKernelComponentGroup &comp_group,
+                                              GpuCkwScopedKernelWriter      &writer,
+                                              const ITensorInfo             *tensor,
+                                              TensorStorageType              storage,
+                                              const std::string             &alias = "unnamed");
 
 private:
     std::map<ITensorInfo::Id, GpuCkwComponentArgument> _vars{};
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h
index 14086f7..52e56e2 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/IGpuCkwComponentDriver.h
@@ -25,6 +25,7 @@
 #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_IGPUCKWCOMPONENTDRIVER
 
 #include "arm_compute/core/Window.h"
+
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
 #include "src/dynamic_fusion/sketch/gpu/components/Types.h"
 
@@ -73,8 +74,7 @@
      * @param[in] id      Component id
      * @param[in] tensors Tensor arguments to the components
      */
-    IGpuCkwComponentDriver(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
-        : _id{ id }, _tensors{ tensors }
+    IGpuCkwComponentDriver(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) : _id{id}, _tensors{tensors}
     {
     }
     /** Destructor */
@@ -89,7 +89,9 @@
      *
      *                            @note @p writer can only be passed via value since the new scope is created in the copy constructor
      */
-    virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const = 0;
+    virtual void write_component_code(const ComponentGroup    &comp_group,
+                                      GpuCkwVariableTable     &vtable,
+                                      GpuCkwScopedKernelWriter writer) const = 0;
     /** Get tensor arguments */
     ArgumentPack<ITensorInfo> tensors() const
     {
@@ -128,7 +130,7 @@
     }
 
 private:
-    ComponentId               _id{ -1 };
+    ComponentId               _id{-1};
     ArgumentPack<ITensorInfo> _tensors{};
 };
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
index c07fac0..c3b1b3c 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.cpp
@@ -24,16 +24,18 @@
 #include "GpuCkwActivation.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
 #include "ckw/TensorTileSampler.h"
+
 #include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
 #include <string>
 
 using namespace ckw;
@@ -87,24 +89,25 @@
 GpuCkwActivation::GpuCkwActivation(ComponentId                      id,
                                    const ArgumentPack<ITensorInfo> &tensors,
                                    const Attributes                &attributes)
-    : IGpuCkwComponentDriver{ id, tensors },
-      _src{},
-      _dst{},
-      _attributes{ attributes }
+    : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
 }
 
-void GpuCkwActivation::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwActivation::write_component_code(const ComponentGroup    &comp_group,
+                                            GpuCkwVariableTable     &vtable,
+                                            GpuCkwScopedKernelWriter writer) const
 {
     const auto         root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
     const unsigned int n0          = root_window.x().step();
     const unsigned int m0          = root_window.y().step();
 
-    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
-    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    GpuCkwComponentArgument *src =
+        vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    GpuCkwComponentArgument *dst =
+        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     load_src_dst_tiles_and_prepare_sampler(writer, src, dst, m0, n0, create_sampler);
 
@@ -119,7 +122,7 @@
     const auto &constant_B       = writer->declare_tile("B_VAL", _attributes.b());
 
     // Perform the operation.
-    switch(_attributes.activation())
+    switch (_attributes.activation())
     {
         case ActivationLayerInfo::ActivationFunction::LOGISTIC:
         {
@@ -179,9 +182,10 @@
     // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
     // This is in line with the collapsing convention used by operators like Conv2d
     output_shape.collapse(2U, 1U);
-    constexpr unsigned int vector_size_byte_opencl           = 16;
-    const unsigned int     num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
-    Window                 win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+    constexpr unsigned int vector_size_byte_opencl = 16;
+    const unsigned int     num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+    Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
 
     return win;
 }
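
get_window() above picks the per-iteration step so that each work-item handles roughly 16 bytes along the channel dimension, then clamps it with adjust_vec_size(). The arithmetic in isolation; adjust_vec_size_sketch is a hedged reimplementation (halve until the width fits), not the library's exact helper from AdjustVecSize.h:

    #include <cstdio>

    // Illustrative stand-in for arm_compute's adjust_vec_size(): shrink the
    // requested vector width until it does not exceed the innermost dimension.
    unsigned int adjust_vec_size_sketch(unsigned int vec_size, unsigned int dim0)
    {
        while (vec_size > dim0)
        {
            vec_size /= 2;
        }
        return vec_size;
    }

    int main()
    {
        constexpr unsigned int vector_size_byte_opencl = 16;
        const unsigned int     element_size            = 4;  // e.g. F32
        const unsigned int     dim0                    = 11; // channel dimension

        const unsigned int n0 = adjust_vec_size_sketch(vector_size_byte_opencl / element_size, dim0);
        std::printf("n0 = %u\n", n0); // 4 elements: 16 bytes / 4 bytes per element
        return 0;
    }
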
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h
index e157e36..386e933 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwActivation.h
@@ -46,15 +46,15 @@
      * @param[in] tensors    Tensor arguments to the component
      * @param[in] attributes Component attributes
      */
-    GpuCkwActivation(ComponentId                      id,
-                            const ArgumentPack<ITensorInfo> &tensors,
-                            const Attributes                &attributes);
+    GpuCkwActivation(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwActivation);
     /** Destructor */
     ~GpuCkwActivation() override = default;
     // Inherited methods overriden:
-    virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override;
-    Window get_window() const override;
+    virtual void write_component_code(const ComponentGroup    &comp_group,
+                                      GpuCkwVariableTable     &vtable,
+                                      GpuCkwScopedKernelWriter writer) const override;
+    Window       get_window() const override;
 
 private:
     const ITensorInfo *_src;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
index 6ecf2ba..e8e5087 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.cpp
@@ -24,16 +24,18 @@
 #include "GpuCkwCast.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
 #include "ckw/TensorTileSampler.h"
+
 #include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+
 #include <string>
 
 using namespace ckw;
@@ -84,30 +86,29 @@
 }
 } // namespace
 
-GpuCkwCast::GpuCkwCast(ComponentId                      id,
-                       const ArgumentPack<ITensorInfo> &tensors,
-                       const Attributes                &attributes)
-    : IGpuCkwComponentDriver{ id, tensors },
-      _src{},
-      _dst{},
-      _attributes{ attributes }
+GpuCkwCast::GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
+    : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
 }
 
-void GpuCkwCast::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwCast::write_component_code(const ComponentGroup    &comp_group,
+                                      GpuCkwVariableTable     &vtable,
+                                      GpuCkwScopedKernelWriter writer) const
 {
     const auto         root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
     const unsigned int n0          = root_window.x().step();
     const unsigned int m0          = root_window.y().step();
 
-    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
-    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    GpuCkwComponentArgument *src =
+        vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    GpuCkwComponentArgument *dst =
+        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     // Load the source tile and prepare the sampler.
-    if(!src->has_tile())
+    if (!src->has_tile())
     {
         const auto sampler = create_sampler(writer, m0, n0);
         writer->op_load_once(src, sampler);
@@ -122,7 +123,7 @@
     const auto &sampler  = src->tile_sampler();
 
     // Prepare the output tile.
-    if(!dst->has_tile())
+    if (!dst->has_tile())
     {
         // Get Target datatype and convert it to ckw::DataType.
         ckw::DataType target_dt = dynamic_fusion::to_ckw(_attributes.data_type());
@@ -143,7 +144,7 @@
     const size_t dst_size  = data_size_from_type(_dst->data_type());
     const bool   cast_down = (src_size >= dst_size);
 
-    if(cast_down && is_data_type_quantized(_src->data_type()))
+    if (cast_down && is_data_type_quantized(_src->data_type()))
     {
         const auto &constant_x80 = writer->declare_tile("0x80", 0x80);
         writer->op_binary_expression(src_tile, src_tile, BinaryOp::BitwiseXOR, constant_x80);
@@ -151,7 +152,7 @@
 
     ckw::ConvertPolicy convert_policy = ckw::ConvertPolicy::None;
 
-    if(cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
+    if (cast_down && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
     {
         convert_policy = ckw::ConvertPolicy::Saturate;
     }
@@ -167,9 +168,10 @@
     // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
     // This is in line with the collapsing convention used by operators like Conv2d
     output_shape.collapse(2U, 1U);
-    constexpr unsigned int vector_size_byte_opencl           = 16;
-    const unsigned int     num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
-    Window                 win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+    constexpr unsigned int vector_size_byte_opencl = 16;
+    const unsigned int     num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+    Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
 
     return win;
 }
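
For context on the window computation above: the step starts from a 16-byte OpenCL vector, converted to a number of elements, then shrunk so it does not overrun the innermost dimension. The standalone sketch below assumes adjust_vec_size simply halves the vector length until it fits, which matches its use here but not necessarily the library's exact special cases.

#include <cstddef>
#include <cstdio>

// Assumed behaviour of adjust_vec_size(): halve the vector length until it
// no longer exceeds the innermost dimension (a sketch, not the ACL source).
unsigned int adjust_vec_size_sketch(unsigned int vec_size, std::size_t dim0)
{
    while (vec_size > dim0 && vec_size > 1)
    {
        vec_size /= 2;
    }
    return vec_size;
}

int main()
{
    constexpr unsigned int vector_size_byte_opencl = 16;
    const std::size_t      element_size            = 2;  // e.g. F16
    const std::size_t      dim0                    = 12; // innermost (channel) dimension

    const unsigned int n0 = adjust_vec_size_sketch(vector_size_byte_opencl / element_size, dim0);
    std::printf("n0 = %u\n", n0); // 8: eight F16 lanes fill a 16-byte vector
    return 0;
}
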
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h
index 821cec1..2389301 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwCast.h
@@ -46,15 +46,15 @@
      * @param[in] tensors    Tensor arguments to the component
      * @param[in] attributes Component attributes
      */
-    GpuCkwCast(ComponentId                      id,
-                            const ArgumentPack<ITensorInfo> &tensors,
-                            const Attributes                &attributes);
+    GpuCkwCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwCast);
     /** Destructor */
     ~GpuCkwCast() override = default;
     // Inherited methods overridden:
-    virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override;
-    Window get_window() const override;
+    virtual void write_component_code(const ComponentGroup    &comp_group,
+                                      GpuCkwVariableTable     &vtable,
+                                      GpuCkwScopedKernelWriter writer) const override;
+    Window       get_window() const override;
 
 private:
     const ITensorInfo *_src;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
index 3c90664..7833da2 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.cpp
@@ -25,21 +25,20 @@
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwDirectConv2d.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
-
+#include "arm_compute/core/Validate.h"
 #include "ckw/TensorTileSampler.h"
 #include "ckw/TileInfo.h"
 
 #include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 
 namespace arm_compute
 {
@@ -54,13 +53,7 @@
                                        const ArgumentPack<ITensorInfo> &tensors,
                                        const Attributes                &attributes,
                                        const Settings                  &settings)
-    : IGpuCkwComponentDriver{ id, tensors },
-      _src{},
-      _wei{},
-      _bia{},
-      _dst{},
-      _attributes{ attributes },
-      _settings{ settings }
+    : IGpuCkwComponentDriver{id, tensors}, _src{}, _wei{}, _bia{}, _dst{}, _attributes{attributes}, _settings{settings}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _wei = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
@@ -69,7 +62,9 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _wei, _dst); // Bias can be null
 }
 
-void GpuCkwDirectConv2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwDirectConv2d::write_component_code(const ComponentGroup    &comp_group,
+                                              GpuCkwVariableTable     &vtable,
+                                              GpuCkwScopedKernelWriter writer) const
 {
     const auto desc = _settings.direct_conv_descriptor();
     ARM_COMPUTE_ERROR_ON_MSG(desc.export_input_to_cl_image || desc.export_output_to_cl_image,
@@ -99,15 +94,18 @@
     // extra loop to compute the left-over elements.
     const bool use_cl_image_for_weights = desc.export_weights_to_cl_image && (k0 == 4) && (K % 4 == 0);
 
-    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    GpuCkwComponentArgument *src =
+        vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
     GpuCkwComponentArgument *wei = vtable.declare_variable(
-        comp_group, writer, _wei, use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei");
-    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+        comp_group, writer, _wei,
+        use_cl_image_for_weights ? TensorStorageType::ClImage2dReadOnly : TensorStorageType::ClBufferUint8Ptr, "wei");
+    GpuCkwComponentArgument *dst =
+        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
     GpuCkwComponentArgument *bia = nullptr;
 
     const bool using_bias = _bia != nullptr;
 
-    if(using_bias)
+    if (using_bias)
     {
         bia = vtable.declare_variable(comp_group, writer, _bia, TensorStorageType::ClBufferUint8Ptr, "bia");
     }
@@ -154,7 +152,8 @@
     src_sampler.address_mode_x(TensorSamplerAddressModeX::None);
     // We cannot have out-of-bounds reads when the kernel height is equal to 1. Otherwise, we need to ensure the
     // indirection buffer mi does not contain negative values representing out-of-bounds reads.
-    src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None : TensorSamplerAddressModeY::SkipMinEdgeOnly);
+    src_sampler.address_mode_y(kernel_height == 1 ? TensorSamplerAddressModeY::None
+                                                  : TensorSamplerAddressModeY::SkipMinEdgeOnly);
     src_sampler.address_mode_z(TensorSamplerAddressModeZ::None);
 
     TensorTileSampler wei_sampler;
@@ -178,7 +177,7 @@
     dst_sampler.z(tile_0);
     dst_sampler.b(tile_bout);
 
-    if(!dst->has_tile())
+    if (!dst->has_tile())
     {
         auto &tile = writer->declare_tile("dst", TileInfo(to_ckw(_dst->data_type()), m0, n0));
         dst->init_virtual_tensor(tile, dst_sampler);
@@ -189,10 +188,10 @@
 
     // We create a 2d container of size (M0, 1) to store the indices for iteration
     TileContainer it;
-    for(int m = 0; m < m0; ++m)
+    for (int m = 0; m < m0; ++m)
     {
-        std::vector<std::string> idx { std::to_string(m) };
-        it.push_back({ idx });
+        std::vector<std::string> idx{std::to_string(m)};
+        it.push_back({idx});
     }
     const auto &tile_it = writer->declare_tile("it", it, ckw::DataType::Int32);
 
@@ -289,9 +288,9 @@
     // Bias addition
     // NOTE: This operation will be removed from this kernel as the interface is standardized. The intended way of
     // performing bias addition is to fuse this convolution kernel with a following elementwise addition kernel.
-    if(using_bias)
+    if (using_bias)
     {
-        if(!bia->has_tile())
+        if (!bia->has_tile())
         {
             // Reuse the destination sampler for the bias
             writer->op_load_once(bia, dst_sampler);
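
The indirection tile `it` built by this kernel is an (M0, 1) grid of stringified row indices. A minimal sketch of that construction, assuming ckw's TileContainer amounts to a vector of string rows, which is how this file uses it:

#include <cstdio>
#include <string>
#include <vector>

// Assumed stand-in for ckw's TileContainer: rows of string literals.
using TileContainer = std::vector<std::vector<std::string>>;

int main()
{
    const int     m0 = 4; // rows processed per work-item
    TileContainer it;
    for (int m = 0; m < m0; ++m)
    {
        it.push_back({std::to_string(m)});
    }
    // Result: an (M0, 1) container {"0"}, {"1"}, {"2"}, {"3"} that the writer
    // can declare as a constant Int32 tile.
    for (const auto &row : it)
    {
        std::printf("%s\n", row[0].c_str());
    }
    return 0;
}
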
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
index c8bf999..2935ba4 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.cpp
@@ -24,22 +24,24 @@
 #include "GpuCkwElementwiseBinary.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
 #include "ckw/TensorTileSampler.h"
 #include "ckw/types/TensorSamplerTypes.h"
+
 #include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h"
 #include "src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "support/StringSupport.h"
+
 #include <algorithm>
 #include <string>
 
@@ -53,11 +55,7 @@
 GpuCkwElementwiseBinary::GpuCkwElementwiseBinary(ComponentId                      id,
                                                  const ArgumentPack<ITensorInfo> &tensors,
                                                  const Attributes                &attributes)
-    : IGpuCkwComponentDriver{ id, tensors },
-      _lhs{},
-      _rhs{},
-      _dst{},
-      _attributes{ attributes }
+    : IGpuCkwComponentDriver{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}
 {
     _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
@@ -65,15 +63,20 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(_lhs, _rhs, _dst);
 }
 
-void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwElementwiseBinary::write_component_code(const ComponentGroup    &comp_group,
+                                                   GpuCkwVariableTable     &vtable,
+                                                   GpuCkwScopedKernelWriter writer) const
 {
     const auto root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
     const auto n0          = static_cast<int32_t>(root_window.x().step());
     const auto m0          = static_cast<int32_t>(root_window.y().step());
 
-    GpuCkwComponentArgument *lhs = vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
-    GpuCkwComponentArgument *rhs = vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
-    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    GpuCkwComponentArgument *lhs =
+        vtable.declare_variable(comp_group, writer, _lhs, TensorStorageType::ClBufferUint8Ptr, "lhs");
+    GpuCkwComponentArgument *rhs =
+        vtable.declare_variable(comp_group, writer, _rhs, TensorStorageType::ClBufferUint8Ptr, "rhs");
+    GpuCkwComponentArgument *dst =
+        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     auto &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
     auto &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
@@ -86,32 +89,36 @@
     auto &const_0 = writer->declare_tile("0", 0);
 
     // Load the LHS and RHS tiles
-    if(!lhs->has_tile())
+    if (!lhs->has_tile())
     {
-        auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1), n0, m0, "lhs_", const_0);
+        auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _lhs->dimension(0), _lhs->dimension(1),
+                                                        n0, m0, "lhs_", const_0);
         sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
         sampler.z(const_0);
         sampler.b(gid_2);
         writer->op_load_once(lhs, sampler);
     }
-    if(!rhs->has_tile())
+    if (!rhs->has_tile())
     {
-        auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1), n0, m0, "rhs_", const_0);
+        auto sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _rhs->dimension(0), _rhs->dimension(1),
+                                                        n0, m0, "rhs_", const_0);
         sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
         sampler.z(const_0);
         sampler.b(gid_2);
         writer->op_load_once(rhs, sampler);
     }
 
-    auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1), n0, m0, "dst_", const_0);
+    auto dst_sampler = create_boundary_aware_2d_sampler(writer, gid_0, gid_1, _dst->dimension(0), _dst->dimension(1),
+                                                        n0, m0, "dst_", const_0);
     dst_sampler.format(TensorSamplerFormat::C_WH_1); // 3rd dimension collapsed with 2nd dimension
     dst_sampler.z(const_0);
     dst_sampler.b(gid_2);
 
     // Prepare the output tile.
-    if(!dst->has_tile())
+    if (!dst->has_tile())
     {
-        auto &tile = writer->declare_tile("dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width()));
+        auto &tile = writer->declare_tile(
+            "dst_tile", ckw::TileInfo(to_ckw(_dst->data_type()), dst_sampler.height(), dst_sampler.width()));
         dst->init_virtual_tensor(tile, dst_sampler);
     }
 
@@ -131,9 +138,10 @@
     // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) unchanged
     // This is in line with the collapsing convention used by operators like Conv2d
     output_shape.collapse(2U, 1U);
-    constexpr unsigned int vector_size_byte_opencl           = 16;
-    const unsigned int     num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
-    Window                 win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+    constexpr unsigned int vector_size_byte_opencl = 16;
+    const unsigned int     num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+    Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
 
     return win;
 }
@@ -141,11 +149,12 @@
 std::string GpuCkwElementwiseBinary::get_name(const ComponentGroup &comp_group) const
 {
     ARM_COMPUTE_UNUSED(comp_group);
-    const std::vector<std::string> build_params =
-    {
+    const std::vector<std::string> build_params = {
         "elementwise_binary",
-        "op", to_string(_attributes.operation()),
-        "dt", lower_string(string_from_data_type(_dst->data_type())),
+        "op",
+        to_string(_attributes.operation()),
+        "dt",
+        lower_string(string_from_data_type(_dst->data_type())),
     };
     return join(build_params, "_");
 }
@@ -154,13 +163,16 @@
 {
     ARM_COMPUTE_UNUSED(comp_group);
     /// NOTE: Hardcoded for now, the parameters should ideally be exported by ckw (a selection of constant tiles)
-    std::vector<std::string> build_params =
-    {
+    std::vector<std::string> build_params = {
         "elementwise_binary",
-        "op", to_string(_attributes.operation()),
-        "dt", lower_string(string_from_data_type(_dst->data_type())),
-        "dst_dim0", support::cpp11::to_string(_dst->dimension(0)),
-        "dst_dim1", support::cpp11::to_string(_dst->dimension(1)),
+        "op",
+        to_string(_attributes.operation()),
+        "dt",
+        lower_string(string_from_data_type(_dst->data_type())),
+        "dst_dim0",
+        support::cpp11::to_string(_dst->dimension(0)),
+        "dst_dim1",
+        support::cpp11::to_string(_dst->dimension(1)),
     };
     return join(build_params, "_");
 }
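
get_name() and get_tuner_id() above differ only in how many tokens they join with underscores; the tuner id additionally encodes the destination dimensions. A self-contained sketch of the naming scheme, with a local join() standing in for the library's StringUtils helper and hypothetical attribute values:

#include <cstddef>
#include <cstdio>
#include <string>
#include <vector>

// Local stand-in for the library's join(): concatenate tokens with a separator.
std::string join(const std::vector<std::string> &tokens, const std::string &sep)
{
    std::string out;
    for (std::size_t i = 0; i < tokens.size(); ++i)
    {
        if (i != 0)
        {
            out += sep;
        }
        out += tokens[i];
    }
    return out;
}

int main()
{
    // Hypothetical values standing in for the attribute/tensor queries above.
    const std::vector<std::string> build_params = {
        "elementwise_binary", "op", "add", "dt", "f32", "dst_dim0", "32", "dst_dim1", "16",
    };
    std::printf("%s\n", join(build_params, "_").c_str());
    // -> elementwise_binary_op_add_dt_f32_dst_dim0_32_dst_dim1_16
    return 0;
}

Baking the destination dimensions into the tuner id means two fusions that differ only in output shape are tuned separately, which is presumably the point of the hardcoded parameters called out in the NOTE above.
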
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
index e9c4153..1a20d4c 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwElementwiseBinary.h
@@ -46,17 +46,17 @@
      * @param[in] tensors    Tensor arguments to the component
      * @param[in] attributes Component attributes
      */
-    GpuCkwElementwiseBinary(ComponentId                      id,
-                            const ArgumentPack<ITensorInfo> &tensors,
-                            const Attributes                &attributes);
+    GpuCkwElementwiseBinary(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(GpuCkwElementwiseBinary);
     /** Destructor */
     ~GpuCkwElementwiseBinary() override = default;
     // Inherited methods overridden:
-    virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override;
-    Window      get_window() const override;
-    std::string get_name(const ComponentGroup &comp_group) const override;
-    std::string get_tuner_id(const ComponentGroup &comp_group) const override;
+    virtual void write_component_code(const ComponentGroup    &comp_group,
+                                      GpuCkwVariableTable     &vtable,
+                                      GpuCkwScopedKernelWriter writer) const override;
+    Window       get_window() const override;
+    std::string  get_name(const ComponentGroup &comp_group) const override;
+    std::string  get_tuner_id(const ComponentGroup &comp_group) const override;
 
 private:
     const ITensorInfo *_lhs;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
index 9c9a298..8ab3ec3 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.cpp
@@ -24,17 +24,18 @@
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/Validate.h"
 #include "ckw/TensorTileSampler.h"
+
 #include "src/core/helpers/WindowHelpers.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 
 using namespace ckw;
 
@@ -48,11 +49,7 @@
                            const ArgumentPack<ITensorInfo> &tensors,
                            const Attributes                &attributes,
                            const Settings                  &settings)
-    : IGpuCkwComponentDriver{ id, tensors },
-      _src{},
-      _dst{},
-      _attributes{ attributes },
-      _settings{ settings }
+    : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings}
 
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
@@ -60,14 +57,18 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
 }
 
-void GpuCkwPool2d::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwPool2d::write_component_code(const ComponentGroup    &comp_group,
+                                        GpuCkwVariableTable     &vtable,
+                                        GpuCkwScopedKernelWriter writer) const
 {
     const auto         root_window = comp_group.get_root_component()->ckw_component_driver()->get_window();
     const unsigned int n0          = root_window.x().step();
     const unsigned int m0          = root_window.y().step();
 
-    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
-    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    GpuCkwComponentArgument *src =
+        vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    GpuCkwComponentArgument *dst =
+        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     TileOperand &gid_0 = writer->declare_tile("gid_0", ckw::DataType::Int32);
     TileOperand &gid_1 = writer->declare_tile("gid_1", ckw::DataType::Int32);
@@ -90,23 +91,26 @@
     const auto    src_data_type = _src->data_type();
 
     // Check if this is global pooling path
-    const bool is_global_pooling = (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0);
+    const bool is_global_pooling =
+        (pool_size_x == src_width) && (pool_size_y == src_height) && (pad_x == 0) && (pad_y == 0);
     // Check if this is a case of FP_MIXED_PRECISION
-    const bool use_fp_mixed_precision = (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;
-    const auto acc_data_type          = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type);
+    const bool use_fp_mixed_precision =
+        (src_data_type == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;
+    const auto acc_data_type = (use_fp_mixed_precision) ? (DataType::F32) : (src_data_type);
 
     TileOperand       &const_0            = writer->declare_tile("0", 0);
     const TileOperand &const_1            = writer->declare_tile("1", 1);
     const TileOperand &const_lowest_value = writer->declare_tile("LOWEST_VALUE", std::numeric_limits<float>::lowest());
     const TileOperand &pool_size_x_tile   = writer->declare_tile("POOL_SIZE_X", pool_size_x);
     const TileOperand &pool_size_y_tile   = writer->declare_tile("POOL_SIZE_Y", pool_size_y);
-    const TileOperand &stride_x_tile      = writer->declare_tile("STRIDE_X", static_cast<int32_t>(_attributes.stride().x()));
-    const TileOperand &stride_y_tile      = writer->declare_tile("STRIDE_Y", static_cast<int32_t>(_attributes.stride().y()));
-    const TileOperand &pad_x_tile         = writer->declare_tile("PAD_X", pad_x);
-    const TileOperand &pad_y_tile         = writer->declare_tile("PAD_Y", pad_y);
-    const TileOperand &dst_height_tile    = writer->declare_tile("DST_HEIGHT", static_cast<int32_t>(_dst->dimension(height_idx)));
-    const TileOperand &src_height_tile    = writer->declare_tile("SRC_HEIGHT", src_height);
-    const TileOperand &src_width_tile     = writer->declare_tile("SRC_WIDTH", src_width);
+    const TileOperand &stride_x_tile = writer->declare_tile("STRIDE_X", static_cast<int32_t>(_attributes.stride().x()));
+    const TileOperand &stride_y_tile = writer->declare_tile("STRIDE_Y", static_cast<int32_t>(_attributes.stride().y()));
+    const TileOperand &pad_x_tile    = writer->declare_tile("PAD_X", pad_x);
+    const TileOperand &pad_y_tile    = writer->declare_tile("PAD_Y", pad_y);
+    const TileOperand &dst_height_tile =
+        writer->declare_tile("DST_HEIGHT", static_cast<int32_t>(_dst->dimension(height_idx)));
+    const TileOperand &src_height_tile = writer->declare_tile("SRC_HEIGHT", src_height);
+    const TileOperand &src_width_tile  = writer->declare_tile("SRC_WIDTH", src_width);
 
     TileOperand &idx_out_n = writer->declare_tile("idx_out_n", ckw::DataType::Int32);
     TileOperand &idx_out_h = writer->declare_tile("idx_out_h", ckw::DataType::Int32);
@@ -145,7 +149,7 @@
 
     // Prepare dst tensor and tile
     TileInfo dst_tile_info = TileInfo(to_ckw(src_data_type), m0, n0);
-    if(!dst->has_tile())
+    if (!dst->has_tile())
     {
         TileOperand &dst_tile = writer->declare_tile("dst_tile", dst_tile_info);
         dst->init_virtual_tensor(dst_tile, dst_sampler);
@@ -156,14 +160,15 @@
     const TileOperand &res_tile = writer->declare_tile("res_tile", TileInfo(to_ckw(acc_data_type), m0, n0));
 
     // Initialise result tile with appropriate value
-    if(_attributes.pool_type() == PoolingType::MAX)
+    if (_attributes.pool_type() == PoolingType::MAX)
     {
-        if(_settings.use_inf_as_limit())
+        if (_settings.use_inf_as_limit())
         {
             TileContainer            minus_inf_tile_container;
             std::vector<std::string> value = std::vector<std::string>(n0, "(-INFINITY)");
-            minus_inf_tile_container.push_back({ value });
-            const TileOperand &minus_inf = writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type));
+            minus_inf_tile_container.push_back({value});
+            const TileOperand &minus_inf =
+                writer->declare_tile("minus_inf_const", minus_inf_tile_container, to_ckw(acc_data_type));
             writer->op_assign(res_tile, minus_inf);
         }
         else
@@ -209,7 +214,7 @@
     writer->op_binary_elementwise_function(pool_y_e, BinaryFunction::Min, pool_size_y_tile, pool_y_e);
 
     const TileOperand &filter_size = writer->declare_tile("filter_size", ckw::DataType::Int32);
-    if(_attributes.exclude_padding())
+    if (_attributes.exclude_padding())
     {
         const TileOperand &y_diff = writer->declare_tile("y_diff", ckw::DataType::Int32);
         const TileOperand &x_diff = writer->declare_tile("x_diff", ckw::DataType::Int32);
@@ -227,7 +232,7 @@
     const TileOperand &x = writer->declare_tile("x", ckw::DataType::Int32);
     const TileOperand &y = writer->declare_tile("y", ckw::DataType::Int32);
 
-    if(is_global_pooling)
+    if (is_global_pooling)
     {
         writer->op_assign(x, const_0);
         writer->op_assign(y, const_0);
@@ -242,76 +247,80 @@
     }
 
     // Y dim for-loop
-    writer->op_for_loop(y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1, [&]()
-    {
-        // Reset the iterator for the inner loop
-        if(is_global_pooling)
+    writer->op_for_loop(
+        y, BinaryOp::Less, pool_y_e, y, AssignmentOp::Increment, const_1,
+        [&]()
         {
-            writer->op_assign(x, const_0);
-        }
-        else
-        {
-            writer->op_assign(x, pool_x_s);
-        }
-
-        TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32);
-        writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y);
-
-        // X dim for-loop
-        writer->op_for_loop(x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1, [&]()
-        {
-            TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32);
-            writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x);
-
-            TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0));
-
-            src_sampler.y(a_x);
-            src_sampler.z(a_y);
-
-            // Load src tile
-            if(use_fp_mixed_precision)
+            // Reset the iterator for the inner loop
+            if (is_global_pooling)
             {
-                TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info);
-                writer->op_load(src_uncasted_tile, src->tensor(), src_sampler);
-                writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None);
+                writer->op_assign(x, const_0);
             }
             else
             {
-                writer->op_load(src_tile, src->tensor(), src_sampler);
+                writer->op_assign(x, pool_x_s);
             }
 
-            // Take the square of the input, for L2 Pooling
-            if(_attributes.pool_type() == PoolingType::L2)
-            {
-                writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile);
-            }
+            TileOperand &a_y = writer->declare_tile("a_y", ckw::DataType::Int32);
+            writer->op_binary_expression(a_y, idx_in_h, BinaryOp::Add, y);
 
-            // Perfom Pooling op
-            if(_attributes.pool_type() == PoolingType::MAX)
-            {
-                writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile);
-            }
-            else
-            {
-                writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile);
-            }
+            // X dim for-loop
+            writer->op_for_loop(
+                x, BinaryOp::Less, pool_x_e, x, AssignmentOp::Increment, const_1,
+                [&]()
+                {
+                    TileOperand &a_x = writer->declare_tile("a_x", ckw::DataType::Int32);
+                    writer->op_binary_expression(a_x, idx_in_w, BinaryOp::Add, x);
+
+                    TileOperand &src_tile = writer->declare_tile("src_tile", TileInfo(to_ckw(acc_data_type), m0, n0));
+
+                    src_sampler.y(a_x);
+                    src_sampler.z(a_y);
+
+                    // Load src tile
+                    if (use_fp_mixed_precision)
+                    {
+                        TileOperand &src_uncasted_tile = writer->declare_tile("uncasted_src_tile", dst_tile_info);
+                        writer->op_load(src_uncasted_tile, src->tensor(), src_sampler);
+                        writer->op_cast_expression(src_tile, src_uncasted_tile, ckw::ConvertPolicy::None);
+                    }
+                    else
+                    {
+                        writer->op_load(src_tile, src->tensor(), src_sampler);
+                    }
+
+                    // Take the square of the input, for L2 Pooling
+                    if (_attributes.pool_type() == PoolingType::L2)
+                    {
+                        writer->op_binary_expression(src_tile, src_tile, BinaryOp::Mul, src_tile);
+                    }
+
+                    // Perform Pooling op
+                    if (_attributes.pool_type() == PoolingType::MAX)
+                    {
+                        writer->op_binary_elementwise_function(res_tile, BinaryFunction::Max, res_tile, src_tile);
+                    }
+                    else
+                    {
+                        writer->op_binary_expression(res_tile, res_tile, BinaryOp::Add, src_tile);
+                    }
+                });
         });
-    });
 
-    if((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2))
+    if ((_attributes.pool_type() == PoolingType::AVG) || (_attributes.pool_type() == PoolingType::L2))
     {
         // filter_size is automatically broadcasted in the operation
         writer->op_binary_expression(res_tile, res_tile, BinaryOp::Div, filter_size);
     }
 
     // Take square root of the result in L2 pooling
-    if(_attributes.pool_type() == PoolingType::L2)
+    if (_attributes.pool_type() == PoolingType::L2)
     {
         writer->op_unary_elementwise_function(res_tile, UnaryFunction::Sqrt, res_tile);
     }
 
     // Store the results and do casting if FP_MIXED_PRECISION
-    if(use_fp_mixed_precision)
+    if (use_fp_mixed_precision)
     {
         writer->op_cast_expression(dst_tile, res_tile, ckw::ConvertPolicy::None);
     }
@@ -326,7 +335,7 @@
     ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
 
     TensorShape        output_shape = _dst->tensor_shape();
-    const unsigned int vec_size     = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
+    const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
     // Create and configure kernel window
     auto win = calculate_max_window(output_shape, Steps(vec_size));
     win      = win.collapse_if_possible(win, Window::DimZ); // collapse window on batch size.
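
As a scalar cross-check on the generated pooling loops, the sketch below computes one average-pooled output element, including the exclude_padding rule where the divisor is the clamped window area rather than pool_x * pool_y. It is an illustrative reference under those assumptions, not the library's validation code.

#include <algorithm>
#include <cstdio>
#include <vector>

// Average-pool one output element of a src_w x src_h single-channel plane.
float avg_pool_at(const std::vector<float> &src, int src_w, int src_h,
                  int out_x, int out_y, int pool_x, int pool_y,
                  int stride_x, int stride_y, int pad_x, int pad_y,
                  bool exclude_padding)
{
    // Window origin in source coordinates (may be negative because of padding).
    const int xs = out_x * stride_x - pad_x;
    const int ys = out_y * stride_y - pad_y;
    // Clamp the window against the borders, as the kernel does with
    // pool_x_s/pool_x_e and pool_y_s/pool_y_e.
    const int x0 = std::max(xs, 0);
    const int y0 = std::max(ys, 0);
    const int x1 = std::min(xs + pool_x, src_w);
    const int y1 = std::min(ys + pool_y, src_h);

    float sum = 0.0f;
    for (int y = y0; y < y1; ++y)
    {
        for (int x = x0; x < x1; ++x)
        {
            sum += src[y * src_w + x];
        }
    }
    // exclude_padding: divide by the number of valid elements only.
    const int filter_size = exclude_padding ? (x1 - x0) * (y1 - y0) : pool_x * pool_y;
    return sum / static_cast<float>(filter_size);
}

int main()
{
    const std::vector<float> src = {1, 2, 3, 4}; // 2x2 plane
    // 2x2 pool at (0, 0) with 1-pixel padding: only src[0] lies in-bounds.
    std::printf("exclude: %.2f include: %.2f\n",
                avg_pool_at(src, 2, 2, 0, 0, 2, 2, 2, 2, 1, 1, true),
                avg_pool_at(src, 2, 2, 0, 0, 2, 2, 2, 2, 1, 1, false));
    return 0;
}
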
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h
index 2ccf255..822282a 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h
@@ -59,9 +59,11 @@
     /** Destructor */
     ~GpuCkwPool2d() override = default;
     // Inherited methods overridden:
-    virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override;
-    Window      get_window() const override;
-    std::string get_name(const ComponentGroup &comp_group) const override;
+    virtual void write_component_code(const ComponentGroup    &comp_group,
+                                      GpuCkwVariableTable     &vtable,
+                                      GpuCkwScopedKernelWriter writer) const override;
+    Window       get_window() const override;
+    std::string  get_name(const ComponentGroup &comp_group) const override;
 
 private:
     const ITensorInfo *_src;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
index d997c82..f2a7d41 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwResize.cpp
@@ -28,14 +28,13 @@
 
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
+#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h"
-#include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h"
-
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -49,20 +48,17 @@
 constexpr unsigned int opencl_vector_size_in_bytes = 16;
 } // namespace
 
-GpuCkwResize::GpuCkwResize(ComponentId                      id,
-                           const ArgumentPack<ITensorInfo> &tensors,
-                           const Attributes                &attributes)
-    : IGpuCkwComponentDriver{ id, tensors },
-      _src{},
-      _dst{},
-      _attributes{ attributes }
+GpuCkwResize::GpuCkwResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
+    : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}, _attributes{attributes}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC);
     _dst = this->tensors().get_const_tensor(TensorType::ACL_DST);
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
 }
 
-void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwResize::do_nearest_neighbor_resize(const ComponentGroup    &comp_group,
+                                              GpuCkwVariableTable     &vtable,
+                                              GpuCkwScopedKernelWriter writer) const
 {
     const size_t width_idx  = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
     const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
@@ -72,12 +68,16 @@
     const int32_t m0          = root_window.y().step();
     const int32_t partial_n0  = _dst->dimension(0) % n0;
 
-    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
-    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    GpuCkwComponentArgument *src =
+        vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    GpuCkwComponentArgument *dst =
+        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     // Constants
-    const float scale_x      = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), _attributes.align_corners());
-    const float scale_y      = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), _attributes.align_corners());
+    const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
+                                                              _attributes.align_corners());
+    const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
+                                                              _attributes.align_corners());
     const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x);
     const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y);
     const auto &tile_0       = writer->declare_tile("0", 0);
@@ -112,7 +112,7 @@
     const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
     const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
 
-    switch(_attributes.sampling_policy())
+    switch (_attributes.sampling_policy())
     {
         case SamplingPolicy::TOP_LEFT:
             // xi_f = (xo * scale_x)
@@ -138,7 +138,7 @@
             ARM_COMPUTE_ERROR("Unsupported sampling policy");
     }
 
-    if(_attributes.align_corners())
+    if (_attributes.align_corners())
     {
         writer->op_unary_elementwise_function(tile_xi_f, UnaryFunction::Round, tile_xi_f);
         writer->op_unary_elementwise_function(tile_yi_f, UnaryFunction::Round, tile_yi_f);
@@ -161,8 +161,10 @@
     auto &tile_xi0 = writer->declare_tile("xi0", ckw::DataType::Int32);
     auto &tile_yi0 = writer->declare_tile("yi0", ckw::DataType::Int32);
 
-    writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi_f_int, tile_0, tile_src_w_minus_1);
-    writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi_f_int, tile_0, tile_src_h_minus_1);
+    writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi_f_int, tile_0,
+                                            tile_src_w_minus_1);
+    writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi_f_int, tile_0,
+                                            tile_src_h_minus_1);
 
     TensorTileSampler src_sampler;
     src_sampler.x(tile_co);
@@ -199,7 +201,9 @@
     writer->op_assign(tile_dst, tile_src);
 }
 
-void GpuCkwResize::do_bilinear_resize(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwResize::do_bilinear_resize(const ComponentGroup    &comp_group,
+                                      GpuCkwVariableTable     &vtable,
+                                      GpuCkwScopedKernelWriter writer) const
 {
     const size_t width_idx  = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::WIDTH);
     const size_t height_idx = get_data_layout_dimension_index(_dst->data_layout(), DataLayoutDimension::HEIGHT);
@@ -209,12 +213,16 @@
     const int32_t m0          = root_window.y().step();
     const int32_t partial_n0  = _dst->dimension(0) % n0;
 
-    GpuCkwComponentArgument *src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
-    GpuCkwComponentArgument *dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
+    GpuCkwComponentArgument *src =
+        vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
+    GpuCkwComponentArgument *dst =
+        vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
 
     // Constants
-    const float scale_x      = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx), _attributes.align_corners());
-    const float scale_y      = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx), _attributes.align_corners());
+    const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(width_idx), _dst->dimension(width_idx),
+                                                              _attributes.align_corners());
+    const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(height_idx), _dst->dimension(height_idx),
+                                                              _attributes.align_corners());
     const auto &tile_scale_x = writer->declare_tile("scale_x", scale_x);
     const auto &tile_scale_y = writer->declare_tile("scale_y", scale_y);
     const auto &tile_0       = writer->declare_tile("0", 0);
@@ -251,7 +259,7 @@
     const auto &tile_xi_f = writer->declare_tile("xi_f", ckw::DataType::Fp32);
     const auto &tile_yi_f = writer->declare_tile("yi_f", ckw::DataType::Fp32);
 
-    switch(_attributes.sampling_policy())
+    switch (_attributes.sampling_policy())
     {
         case SamplingPolicy::TOP_LEFT:
             // xi_f = (xo * scale_x)
@@ -312,8 +320,10 @@
 
     writer->op_ternary_elementwise_function(tile_xi0, TernaryFunction::Clamp, tile_xi, tile_0, tile_src_w_minus_1);
     writer->op_ternary_elementwise_function(tile_yi0, TernaryFunction::Clamp, tile_yi, tile_0, tile_src_h_minus_1);
-    writer->op_ternary_elementwise_function(tile_xi1, TernaryFunction::Clamp, tile_xi_plus_1, tile_0, tile_src_w_minus_1);
-    writer->op_ternary_elementwise_function(tile_yi1, TernaryFunction::Clamp, tile_yi_plus_1, tile_0, tile_src_h_minus_1);
+    writer->op_ternary_elementwise_function(tile_xi1, TernaryFunction::Clamp, tile_xi_plus_1, tile_0,
+                                            tile_src_w_minus_1);
+    writer->op_ternary_elementwise_function(tile_yi1, TernaryFunction::Clamp, tile_yi_plus_1, tile_0,
+                                            tile_src_h_minus_1);
 
     TensorTileSampler in_sampler;
     in_sampler.x(tile_co);
@@ -388,7 +398,7 @@
     writer->op_binary_expression(tile_a1, tile_yi_f, BinaryOp::Sub, tile_yi_float);
     writer->op_binary_expression(tile_b1, tile_1, BinaryOp::Sub, tile_a1);
 
-    if(is_data_type_float(_src->data_type()))
+    if (is_data_type_float(_src->data_type()))
     {
         // Cast weights to source type
         const auto &tile_a_src_type  = writer->declare_tile("a_src_t", to_ckw(_src->data_type()));
@@ -461,9 +471,11 @@
     }
 }
 
-void GpuCkwResize::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwResize::write_component_code(const ComponentGroup    &comp_group,
+                                        GpuCkwVariableTable     &vtable,
+                                        GpuCkwScopedKernelWriter writer) const
 {
-    switch(_attributes.interpolation_policy())
+    switch (_attributes.interpolation_policy())
     {
         case InterpolationPolicy::NEAREST_NEIGHBOR:
             do_nearest_neighbor_resize(comp_group, vtable, writer);
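
Per output pixel, the nearest-neighbour path above reduces to: scale the output index, round when align_corners is set, then clamp into the source extent. The sketch below assumes the conventional (src - 1) / (dst - 1) ratio under align_corners; treat that formula as an assumption about calculate_resize_ratio rather than a quote of scale_utils.

#include <algorithm>
#include <cmath>
#include <cstdio>

// Assumed resize ratio: plain src/dst, or end-point-to-end-point mapping when
// align_corners is requested.
float resize_ratio(int src, int dst, bool align_corners)
{
    return align_corners ? float(src - 1) / float(dst - 1) : float(src) / float(dst);
}

// Map an output x index to a clamped source x index (TOP_LEFT sampling).
int nearest_src_index(int xo, int src_w, int dst_w, bool align_corners)
{
    const float scale = resize_ratio(src_w, dst_w, align_corners);
    float       xi_f  = xo * scale; // xi_f = (xo * scale_x)
    if (align_corners)
    {
        xi_f = std::round(xi_f); // UnaryFunction::Round in the kernel
    }
    const int xi = static_cast<int>(std::floor(xi_f));
    return std::clamp(xi, 0, src_w - 1); // TernaryFunction::Clamp against [0, src_w - 1]
}

int main()
{
    for (int xo = 0; xo < 4; ++xo)
    {
        std::printf("xo=%d -> xi=%d\n", xo, nearest_src_index(xo, 8, 4, false));
    }
    return 0;
}
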
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
index 8917391..889706b 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.cpp
@@ -24,10 +24,12 @@
 #include "GpuCkwStore.h"
 
 #include "arm_compute/core/Error.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwVariableTable.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
+
 #include <string>
 
 namespace arm_compute
@@ -37,12 +39,14 @@
 namespace dynamic_fusion
 {
 GpuCkwStore::GpuCkwStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
-    : IGpuCkwComponentDriver{ id, tensors }, _src{}, _dst{}
+    : IGpuCkwComponentDriver{id, tensors}, _src{}, _dst{}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
 }
-void GpuCkwStore::write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const
+void GpuCkwStore::write_component_code(const ComponentGroup    &comp_group,
+                                       GpuCkwVariableTable     &vtable,
+                                       GpuCkwScopedKernelWriter writer) const
 {
     auto src = vtable.declare_variable(comp_group, writer, _src, TensorStorageType::ClBufferUint8Ptr, "src");
     auto dst = vtable.declare_variable(comp_group, writer, _dst, TensorStorageType::ClBufferUint8Ptr, "dst");
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
index 8e35651..f1f0e67 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwStore.h
@@ -48,8 +48,10 @@
     /** Destructor */
     ~GpuCkwStore() override = default;
     // Inherited methods overridden:
-    virtual void write_component_code(const ComponentGroup &comp_group, GpuCkwVariableTable &vtable, GpuCkwScopedKernelWriter writer) const override;
-    std::string get_name(const ComponentGroup &comp_group) const override;
+    virtual void write_component_code(const ComponentGroup    &comp_group,
+                                      GpuCkwVariableTable     &vtable,
+                                      GpuCkwScopedKernelWriter writer) const override;
+    std::string  get_name(const ComponentGroup &comp_group) const override;
 
 private:
     const ITensorInfo *_src;
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h
index e2b8584..6ba2b2f 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/WriterHelper.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/utils/misc/Utility.h"
 #include "ckw/TensorTileSampler.h"
+
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwComponentArgument.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwKernelWriter.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/GpuCkwScopedKernelWriter.h"
@@ -44,9 +45,14 @@
 
 /** Load src and dst tiles of dimension [m0, n0] only when not loaded and prepare the sampler
  */
-inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &writer, GpuCkwComponentArgument *src, GpuCkwComponentArgument *dst, int32_t m0, int32_t n0, SamplerCreator create_sampler)
+inline void load_src_dst_tiles_and_prepare_sampler(GpuCkwScopedKernelWriter &writer,
+                                                   GpuCkwComponentArgument  *src,
+                                                   GpuCkwComponentArgument  *dst,
+                                                   int32_t                   m0,
+                                                   int32_t                   n0,
+                                                   SamplerCreator            create_sampler)
 {
-    if(!src->has_tile())
+    if (!src->has_tile())
     {
         const auto sampler = create_sampler(writer, m0, n0);
         writer->op_load_once(src, sampler);
@@ -61,7 +67,7 @@
     const auto &sampler  = src->tile_sampler();
 
     // Prepare the output tile.
-    if(!dst->has_tile())
+    if (!dst->has_tile())
     {
         auto &tile = writer->declare_tile("dst_tile", src_tile.tile_info());
         dst->init_virtual_tensor(tile, sampler);
@@ -78,7 +84,13 @@
  * @param[in]     prefix          Prefix to all the tiles declared within this function
  * @param[in]     const_0         Constant tile of value 0
  */
-inline void get_coord(GpuCkwScopedKernelWriter writer, TileOperand &coord, const TileOperand &gid, int32_t step_v, int32_t leftover_step_v, const std::string &prefix, const TileOperand &const_0)
+inline void get_coord(GpuCkwScopedKernelWriter writer,
+                      TileOperand             &coord,
+                      const TileOperand       &gid,
+                      int32_t                  step_v,
+                      int32_t                  leftover_step_v,
+                      const std::string       &prefix,
+                      const TileOperand       &const_0)
 {
     auto &step          = writer->declare_tile(prefix + "step", step_v);
     auto &leftover_step = writer->declare_tile(prefix + "leftover_step", leftover_step_v);
@@ -122,8 +134,15 @@
  *
  * @return TensorTileSampler
  */
-inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer, TileOperand &gid_0, TileOperand &gid_1, int32_t dim0_v, int32_t dim1_v, int32_t n0_v, int32_t m0_v,
-                                                          const std::string prefix, TileOperand &const_0)
+inline TensorTileSampler create_boundary_aware_2d_sampler(GpuCkwScopedKernelWriter writer,
+                                                          TileOperand             &gid_0,
+                                                          TileOperand             &gid_1,
+                                                          int32_t                  dim0_v,
+                                                          int32_t                  dim1_v,
+                                                          int32_t                  n0_v,
+                                                          int32_t                  m0_v,
+                                                          const std::string        prefix,
+                                                          TileOperand             &const_0)
 {
     // Clamp tile size [n0, m0] against dimension [dim0, dim1]
     // This is needed to:
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
index 34b1283..5da317b 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/Common.h
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/Types.h"
 #include "ckw/TensorInfo.h"
+
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 
 namespace arm_compute
@@ -38,7 +39,7 @@
 {
 inline ckw::DataType to_ckw(DataType dt)
 {
-    switch(dt)
+    switch (dt)
     {
         case DataType::F32:
             return ckw::DataType::Fp32;
@@ -65,21 +66,16 @@
 
 inline ckw::TensorShape to_ckw(const TensorShape &shape)
 {
-    ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size<ckw::TensorShape> {});
-    ARM_COMPUTE_ERROR_ON(std::tuple_size<ckw::TensorShape> {} != 5);
+    ARM_COMPUTE_ERROR_ON(shape.num_max_dimensions < std::tuple_size<ckw::TensorShape>{});
+    ARM_COMPUTE_ERROR_ON(std::tuple_size<ckw::TensorShape>{} != 5);
     /// NOTE: Overflow danger. Use size_t?
-    return ckw::TensorShape
-    {
-        static_cast<int32_t>(shape[0]),
-        static_cast<int32_t>(shape[1]),
-        static_cast<int32_t>(shape[2]),
-        static_cast<int32_t>(shape[3]),
-        static_cast<int32_t>(shape[4])
-    };
+    return ckw::TensorShape{static_cast<int32_t>(shape[0]), static_cast<int32_t>(shape[1]),
+                            static_cast<int32_t>(shape[2]), static_cast<int32_t>(shape[3]),
+                            static_cast<int32_t>(shape[4])};
 }
 inline ckw::TensorDataLayout to_ckw(DataLayout dl)
 {
-    switch(dl)
+    switch (dl)
     {
         case DataLayout::NHWC:
             return ckw::TensorDataLayout::Nhwc;
@@ -91,18 +87,13 @@
 }
 inline ckw::TensorInfo to_ckw(const ITensorInfo &tensor_info)
 {
-    return ckw::TensorInfo
-    {
-        to_ckw(tensor_info.data_type()),
-        to_ckw(tensor_info.tensor_shape()),
-        to_ckw(tensor_info.data_layout()),
-        tensor_info.id()
-    };
+    return ckw::TensorInfo{to_ckw(tensor_info.data_type()), to_ckw(tensor_info.tensor_shape()),
+                           to_ckw(tensor_info.data_layout()), tensor_info.id()};
 }
 
 inline TensorComponentType from_ckw(const ckw::TensorComponentType &component)
 {
-    switch(component)
+    switch (component)
     {
         case ckw::TensorComponentType::OffsetFirstElement:
             return TensorComponentType::OffsetFirstElement;
@@ -142,7 +133,7 @@
 
 inline ckw::TensorStorageType to_ckw(const TensorStorageType &storage)
 {
-    switch(storage)
+    switch (storage)
     {
         case TensorStorageType::ClBufferUint8Ptr:
             return ckw::TensorStorageType::BufferUint8Ptr;
@@ -159,7 +150,7 @@
 }
 inline TensorStorageType from_ckw(const ckw::TensorStorageType &storage)
 {
-    switch(storage)
+    switch (storage)
     {
         case ckw::TensorStorageType::BufferUint8Ptr:
             return TensorStorageType::ClBufferUint8Ptr;
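
All of these converters follow one pattern: an exhaustive switch per enumerator, with a hard error on anything unmapped so that a newly added enumerator fails loudly instead of converting silently. A toy, self-contained version of the pattern (the enums here are stand-ins, and the throw replaces ARM_COMPUTE_ERROR):

#include <cstdio>
#include <stdexcept>

enum class AclDataType { F32, F16 }; // toy stand-ins for the real enums
enum class CkwDataType { Fp32, Fp16 };

CkwDataType to_ckw(AclDataType dt)
{
    switch (dt)
    {
        case AclDataType::F32:
            return CkwDataType::Fp32;
        case AclDataType::F16:
            return CkwDataType::Fp16;
        default:
            throw std::invalid_argument("Unmapped data type");
    }
}

AclDataType from_ckw(CkwDataType dt)
{
    switch (dt)
    {
        case CkwDataType::Fp32:
            return AclDataType::F32;
        case CkwDataType::Fp16:
            return AclDataType::F16;
        default:
            throw std::invalid_argument("Unmapped data type");
    }
}

int main()
{
    // Round-trip check: converting there and back is the identity.
    std::printf("%s\n", from_ckw(to_ckw(AclDataType::F16)) == AclDataType::F16 ? "ok" : "mismatch");
    return 0;
}
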
diff --git a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
index 9cb022f..0cba258 100644
--- a/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/ckw_driver/components/utils/type_converter/ElementwiseBinary.h
@@ -25,6 +25,7 @@
 #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_CKW_DRIVER_COMPONENTS_UTILS_TYPE_CONVERTER_ELEMENTWISEBINARY
 
 #include "ckw/types/Operators.h"
+
 #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
 
 namespace arm_compute
@@ -35,7 +36,7 @@
 {
 inline ckw::BinaryOp to_ckw(const ElementwiseBinaryCommonAttributes &attributes)
 {
-    switch(attributes.operation())
+    switch (attributes.operation())
     {
         case ElementwiseBinaryCommonAttributes::ElementwiseOp::Add:
             return ckw::BinaryOp::Add;
diff --git a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
index f7f0029..ee109a7 100644
--- a/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
+++ b/src/dynamic_fusion/sketch/gpu/components/GpuKernelComponentFactory.h
@@ -24,8 +24,9 @@
 #ifndef SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_GPUKERNELCOMPONENTFACTORY
 
-#include "Types.h"
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
+#include "Types.h"
 #include <memory>
 
 namespace arm_compute
@@ -49,13 +50,13 @@
      * @return std::unique_ptr<IGpuKernelComponent>
      */
     template <typename T, typename... Args>
-    std::unique_ptr<IGpuKernelComponent> create(Args &&... args)
+    std::unique_ptr<IGpuKernelComponent> create(Args &&...args)
     {
         return std::make_unique<T>(_count++, std::forward<Args>(args)...);
     }
 
 private:
-    ComponentId _count{ 0 };
+    ComponentId _count{0};
 };
 
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
index af766a7..4b8eea2 100644
--- a/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
+++ b/src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h
@@ -24,11 +24,11 @@
 #ifndef ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
 #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_IGPUKERNELCOMPONENT
 
-#include "Types.h"
-
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
 
+#include "Types.h"
+
 namespace arm_compute
 {
 namespace experimental
@@ -76,13 +76,8 @@
      * @param[in] properties Kernel component properties
      * @param[in] tensors    Tensor arguments to the components
      */
-    IGpuKernelComponent(
-        ComponentId                      id,
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors)
-        : _id{ id },
-          _properties{ properties },
-          _tensors{ tensors }
+    IGpuKernelComponent(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
+        : _id{id}, _properties{properties}, _tensors{tensors}
     {
     }
     /** Destructor */
@@ -117,7 +112,7 @@
     virtual GpuComponentType type() const = 0;
 
 private:
-    ComponentId               _id{ -1 };
+    ComponentId               _id{-1};
     Properties                _properties{};
     ArgumentPack<ITensorInfo> _tensors{};
 };
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
index c41257d..fdf528a 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.cpp
@@ -68,17 +68,11 @@
                                              const IGpuKernelComponent::Properties &properties,
                                              const ArgumentPack<ITensorInfo>       &tensors,
                                              const Attributes                      &attributes)
-    : IGpuKernelComponent{ id, properties, tensors },
+    : IGpuKernelComponent{id, properties, tensors},
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer
-{
-    std::make_unique<ClTemplateActivation>(id, tensors, attributes)
-}
+      _component_writer{std::make_unique<ClTemplateActivation>(id, tensors, attributes)}
 #else  //ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer
-{
-    std::make_unique<GpuCkwActivation>(id, tensors, attributes)
-}
+      _component_writer{std::make_unique<GpuCkwActivation>(id, tensors, attributes)}
 #endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
index 9b090af..02c8543 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h
@@ -25,9 +25,8 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTACTIVATION
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
-#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 
-#include "arm_compute/function_info/ActivationLayerInfo.h"
+#include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 
 namespace arm_compute
 {
@@ -79,20 +78,17 @@
      * |F16            |F16            |
      * |F32            |F32            |
      */
-    static Status validate(
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors,
-        const Attributes                &attributes);
+    static Status
+    validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
 
     /** Constructor
      *
      * Similar to @ref ClComponentActivation::validate()
      */
-    ClComponentActivation(
-        ComponentId                      id,
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors,
-        const Attributes                &attributes);
+    ClComponentActivation(ComponentId                      id,
+                          const Properties                &properties,
+                          const ArgumentPack<ITensorInfo> &tensors,
+                          const Attributes                &attributes);
 
     /** Destructor */
     ~ClComponentActivation() override;
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
index 635869f..b163679 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.cpp
@@ -24,6 +24,7 @@
 #include "ClComponentCast.h"
 
 #include "arm_compute/core/Error.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
@@ -38,11 +39,10 @@
 {
 namespace dynamic_fusion
 {
-Status ClComponentCast::validate(
-    const Properties                &properties,
-    const ArgumentPack<ITensorInfo> &tensors,
-    const Attributes                &attributes,
-    const Settings                  &settings)
+Status ClComponentCast::validate(const Properties                &properties,
+                                 const ArgumentPack<ITensorInfo> &tensors,
+                                 const Attributes                &attributes,
+                                 const Settings                  &settings)
 {
     ARM_COMPUTE_UNUSED(properties, attributes, settings);
 
@@ -53,13 +53,15 @@
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == attributes.data_type(), "input and target data types should be different");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == attributes.data_type(),
+                                    "input and target data types should be different");
 
     // Validate in case of configured dst
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != attributes.data_type(), "dst and target data types should be same");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != attributes.data_type(),
+                                        "dst and target data types should be same");
     }
 
     return Status{};
@@ -69,17 +71,11 @@
                                  const ArgumentPack<ITensorInfo> &tensors,
                                  const Attributes                &attributes,
                                  const Settings                  &settings)
-    : IGpuKernelComponent{ id, properties, tensors },
+    : IGpuKernelComponent{id, properties, tensors},
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer
-{
-    std::make_unique<ClTemplateCast>(id, tensors, attributes)
-}
+      _component_writer{std::make_unique<ClTemplateCast>(id, tensors, attributes)}
 #else  //ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer
-{
-    std::make_unique<GpuCkwCast>(id, tensors, attributes)
-}
+      _component_writer{std::make_unique<GpuCkwCast>(id, tensors, attributes)}
 #endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
     ARM_COMPUTE_UNUSED(attributes, settings);
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
index 37b8cbb..ed77b12 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTCAST
 
 #include "arm_compute/dynamic_fusion/sketch/attributes/CastAttributes.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 
 namespace arm_compute
@@ -93,11 +94,10 @@
      * |F16            | U8, S8, U16, S16, U32, S32, F32       |
      * |F32            | U8, S8, U16, S16, U32, S32, F16       |
      */
-    static Status validate(
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors,
-        const Attributes                &attributes,
-        const Settings                  &settings);
+    static Status validate(const Properties                &properties,
+                           const ArgumentPack<ITensorInfo> &tensors,
+                           const Attributes                &attributes,
+                           const Settings                  &settings);
 
     /** Constructor
      *
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
index 5626093..d95e0be 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h"
 
@@ -103,11 +104,10 @@
     return _m0;
 }
 
-Status ClComponentDepthwiseConv2d::validate(
-    const Properties                &properties,
-    const ArgumentPack<ITensorInfo> &tensors,
-    const Attributes                &attributes,
-    const Settings                  &settings)
+Status ClComponentDepthwiseConv2d::validate(const Properties                &properties,
+                                            const ArgumentPack<ITensorInfo> &tensors,
+                                            const Attributes                &attributes,
+                                            const Settings                  &settings)
 {
     ARM_COMPUTE_UNUSED(properties, settings);
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
@@ -121,7 +121,7 @@
     // Matching data type
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
-    if(bia != nullptr)
+    if (bia != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia);
     }
@@ -129,7 +129,7 @@
     // Matching data layout
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
-    if(bia != nullptr)
+    if (bia != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia);
     }
@@ -138,7 +138,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
-    if(bia != nullptr)
+    if (bia != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0);
     }
@@ -148,16 +148,17 @@
     const DataLayout data_layout = src->data_layout();
     const size_t     channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * attributes.depth_multiplier()));
+    ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) !=
+                                (src->dimension(channel_idx) * attributes.depth_multiplier()));
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 3, "Weights can be at most 3 dimensional");
 
     // dst shape is correct
-    const PadStrideInfo pad_stride_info = PadStrideInfo(attributes.stride().x(), attributes.stride().y(),
-                                                        attributes.pad().left, attributes.pad().right,
-                                                        attributes.pad().top, attributes.pad().bottom,
-                                                        attributes.dimension_rounding_type());
-    const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() };
-    const TensorShape     output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info);
+    const PadStrideInfo pad_stride_info =
+        PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right,
+                      attributes.pad().top, attributes.pad().bottom, attributes.dimension_rounding_type());
+    const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(),
+                                    attributes.dilation()};
+    const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), output_shape);
 
@@ -168,19 +169,22 @@
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_stride_info.stride().first > 1 && settings.m0() != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(conv_info.dilation.x() > 1 && settings.m0() != 1);
 
-    if(conv_info.depth_multiplier > 1 && settings.n0() > 1)
+    if (conv_info.depth_multiplier > 1 && settings.n0() > 1)
     {
         ARM_COMPUTE_RETURN_ERROR_ON((conv_info.depth_multiplier % settings.n0()) != 0);
     }
 
     // Check export weights to cl image
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) && (export_to_cl_image(wei) == false), "Weights cannot be exported to cl_image!");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((settings.export_weights_to_cl_image() == true) &&
+                                        (export_to_cl_image(wei) == false),
+                                    "Weights cannot be exported to cl_image!");
     ARM_COMPUTE_RETURN_ERROR_ON((settings.export_weights_to_cl_image() == true) && ((settings.n0() % 4) != 0));
 
-    ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) != (src->dimension(channel_idx) * conv_info.depth_multiplier));
+    ARM_COMPUTE_RETURN_ERROR_ON(wei->dimension(channel_idx) !=
+                                (src->dimension(channel_idx) * conv_info.depth_multiplier));
 
     // bia shape is correct
-    if(bia != nullptr)
+    if (bia != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != output_shape[channel_idx],
                                         "Biases size and number of dst feature maps should match");
@@ -198,14 +202,13 @@
     return Status{};
 }
 
-ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d(
-    ComponentId                      id,
-    const Properties                &properties,
-    const ArgumentPack<ITensorInfo> &tensors,
-    const Attributes                &attributes,
-    const Settings                  &settings)
-    : IGpuKernelComponent{ id, properties, tensors },
-      _component_writer{ std::make_unique<ClTemplateDepthwiseConv2d>(id, tensors, attributes, settings) }
+ClComponentDepthwiseConv2d::ClComponentDepthwiseConv2d(ComponentId                      id,
+                                                       const Properties                &properties,
+                                                       const ArgumentPack<ITensorInfo> &tensors,
+                                                       const Attributes                &attributes,
+                                                       const Settings                  &settings)
+    : IGpuKernelComponent{id, properties, tensors},
+      _component_writer{std::make_unique<ClTemplateDepthwiseConv2d>(id, tensors, attributes, settings)}
 {
 }
 ClComponentDepthwiseConv2d::~ClComponentDepthwiseConv2d()
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
index 0e2b5f1..b3e1bd2 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h
@@ -25,7 +25,9 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTDEPTHWISECONV2D
 
 #include "arm_compute/core/Error.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -77,12 +79,12 @@
     unsigned int m0() const;
 
 private:
-    bool         _export_input_to_cl_image{ false };   /**< Export input to cl_image */
-    bool         _export_weights_to_cl_image{ false }; /**< Export the weights to cl_image */
-    bool         _fast_relaxed_math{ true };           /**< Enable/disable -cl-fast-relaxed-math flag */
-    bool         _is_fma_available{ false };           /**< Is fma instruction available */
-    unsigned int _n0{ 0 };                             /**< Number of columns processed by each thread */
-    unsigned int _m0{ 0 };                             /**< Number of rows processed by each thread */
+    bool         _export_input_to_cl_image{false};   /**< Export input to cl_image */
+    bool         _export_weights_to_cl_image{false}; /**< Export the weights to cl_image */
+    bool         _fast_relaxed_math{true};           /**< Enable/disable -cl-fast-relaxed-math flag */
+    bool         _is_fma_available{false};           /**< Is fma instruction available */
+    unsigned int _n0{0};                             /**< Number of columns processed by each thread */
+    unsigned int _m0{0};                             /**< Number of rows processed by each thread */
 };
 
 /** Forward declaration */
@@ -127,22 +129,20 @@
      * |F16            |F16            |F16            |F16            |
      * |F32            |F32            |F32            |F32            |
      */
-    static Status validate(
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors,
-        const Attributes                &attributes,
-        const Settings                  &settings);
+    static Status validate(const Properties                &properties,
+                           const ArgumentPack<ITensorInfo> &tensors,
+                           const Attributes                &attributes,
+                           const Settings                  &settings);
 
     /** Constructor
      *
      * Similar to @ref ClComponentDepthwiseConv2d::validate()
      */
-    ClComponentDepthwiseConv2d(
-        ComponentId                      id,
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors,
-        const Attributes                &attributes,
-        const Settings                  &settings);
+    ClComponentDepthwiseConv2d(ComponentId                      id,
+                               const Properties                &properties,
+                               const ArgumentPack<ITensorInfo> &tensors,
+                               const Attributes                &attributes,
+                               const Settings                  &settings);
 
     /** Destructor */
     ~ClComponentDepthwiseConv2d() override;
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
index a713c82..98f3d6a 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.cpp
@@ -23,8 +23,8 @@
  */
 #include "ClComponentDirectConv2d.h"
 
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
 
 #include "src/core/CL/CLValidate.h"
@@ -57,7 +57,8 @@
     return _fast_relaxed_math;
 }
 
-ClComponentDirectConv2dSettings &ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc)
+ClComponentDirectConv2dSettings &
+ClComponentDirectConv2dSettings::direct_conv_descriptor(const DirectConvComputeKernelInfo &desc)
 {
     _desc = desc;
     return *this;
@@ -68,11 +69,10 @@
     return _desc;
 }
 
-Status ClComponentDirectConv2d::validate(
-    const Properties                &properties,
-    const ArgumentPack<ITensorInfo> &tensors,
-    const Attributes                &attributes,
-    const Settings                  &settings)
+Status ClComponentDirectConv2d::validate(const Properties                &properties,
+                                         const ArgumentPack<ITensorInfo> &tensors,
+                                         const Attributes                &attributes,
+                                         const Settings                  &settings)
 {
     ARM_COMPUTE_UNUSED(properties);
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
@@ -86,7 +86,7 @@
     // Matching data type
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, wei);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
-    if(bia != nullptr)
+    if (bia != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bia);
     }
@@ -94,7 +94,7 @@
     // Matching data layout
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, wei);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
-    if(bia != nullptr)
+    if (bia != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, bia);
     }
@@ -103,7 +103,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(wei->tensor_shape().total_size() == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
-    if(bia != nullptr)
+    if (bia != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(bia->tensor_shape().total_size() == 0);
     }
@@ -112,22 +112,23 @@
     // wei shape is correct
     const DataLayout data_layout = src->data_layout();
     const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->dimension(channel_idx) != src->dimension(channel_idx),
+                                    "Weights feature map dimension should match the respective src's one");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(wei->num_dimensions() > 4, "Weights can be at most 4 dimensional");
 
     // dst shape is correct
-    PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left, attributes.pad().right, attributes.pad().top,
-                                    attributes.pad().bottom, DimensionRoundingType{});
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
-                                                       misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride));
+    PadStrideInfo legacy_pad_stride(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+                                    attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+                                    DimensionRoundingType{});
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+        dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, legacy_pad_stride));
 
     // bia shape is correct
-    if(bia != nullptr)
+    if (bia != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->dimension(0) != wei->dimension(3),
                                         "Biases size and number of dst feature maps should match");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1,
-                                        "Biases should be one dimensional");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bia->num_dimensions() > 1, "Biases should be one dimensional");
     }
 
     // 2. Check support level
@@ -137,24 +138,25 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
 
     const auto desc = settings.direct_conv_descriptor();
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16,
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 &&
+                                        desc.n0 != 16,
                                     "N0 can only be: 1, 2, 3, 4, 8, and 16");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 &&
+                                        desc.k0 != 16,
                                     "K0 can only be: 1, 2, 3, 4, 8, and 16");
     return Status{};
 }
 
-ClComponentDirectConv2d::ClComponentDirectConv2d(
-    ComponentId                      id,
-    const Properties                &properties,
-    const ArgumentPack<ITensorInfo> &tensors,
-    const Attributes                &attributes,
-    const Settings                  &settings)
-    : IGpuKernelComponent{ id, properties, tensors },
+ClComponentDirectConv2d::ClComponentDirectConv2d(ComponentId                      id,
+                                                 const Properties                &properties,
+                                                 const ArgumentPack<ITensorInfo> &tensors,
+                                                 const Attributes                &attributes,
+                                                 const Settings                  &settings)
+    : IGpuKernelComponent{id, properties, tensors},
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer{ std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings) }
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer{ std::make_unique<GpuCkwDirectConv2d>(id, tensors, attributes, settings) }
+      _component_writer{std::make_unique<ClTemplateDirectConv2d>(id, tensors, attributes, settings)}
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer{std::make_unique<GpuCkwDirectConv2d>(id, tensors, attributes, settings)}
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
@@ -165,7 +167,7 @@
 
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuTemplateComponentWriter *ClComponentDirectConv2d::template_writer() const
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuCkwComponentDriver *ClComponentDirectConv2d::ckw_component_driver() const
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
 {
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
index 24acb1b..d6d9705 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h
@@ -26,7 +26,9 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -61,7 +63,7 @@
     DirectConvComputeKernelInfo direct_conv_descriptor() const;
 
 private:
-    bool                        _fast_relaxed_math{ true };
+    bool                        _fast_relaxed_math{true};
     DirectConvComputeKernelInfo _desc{}; // Direct convolution descriptor
 };
 
@@ -111,22 +113,20 @@
      * |F16            |F16            |F16            |F16            |
      * |F32            |F32            |F32            |F32            |
      */
-    static Status validate(
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors,
-        const Attributes                &attributes,
-        const Settings                  &settings);
+    static Status validate(const Properties                &properties,
+                           const ArgumentPack<ITensorInfo> &tensors,
+                           const Attributes                &attributes,
+                           const Settings                  &settings);
 
     /** Constructor
      *
      * Similar to @ref ClComponentDirectConv2d::validate()
      */
-    ClComponentDirectConv2d(
-        ComponentId                      id,
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors,
-        const Attributes                &attributes,
-        const Settings                  &settings);
+    ClComponentDirectConv2d(ComponentId                      id,
+                            const Properties                &properties,
+                            const ArgumentPack<ITensorInfo> &tensors,
+                            const Attributes                &attributes,
+                            const Settings                  &settings);
 
     /** Destructor */
     ~ClComponentDirectConv2d() override;
@@ -142,7 +142,7 @@
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuTemplateComponentWriter *template_writer() const override;
 #else  // ACL_INTERNAL_TEST_CKW_IN_DF
-    const IGpuCkwComponentDriver *ckw_component_driver() const override;
+    const IGpuCkwComponentDriver       *ckw_component_driver() const override;
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
     /** Get component type */
     GpuComponentType type() const override
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
index 88d7291..5b13642 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.cpp
@@ -24,6 +24,7 @@
 #include "ClComponentElementwiseBinary.h"
 
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h"
@@ -39,56 +40,55 @@
 {
 namespace
 {
-std::set<ElementwiseBinaryCommonAttributes::ElementwiseOp> supported_ops
-{
-    ElementwiseBinaryCommonAttributes::ElementwiseOp::Add,
-    ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub,
-    ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul
-};
+std::set<ElementwiseBinaryCommonAttributes::ElementwiseOp> supported_ops{
+    ElementwiseBinaryCommonAttributes::ElementwiseOp::Add, ElementwiseBinaryCommonAttributes::ElementwiseOp::Sub,
+    ElementwiseBinaryCommonAttributes::ElementwiseOp::Mul};
 }
 
-Status ClComponentElementwiseBinary::validate(const ArgumentPack<ITensorInfo> &tensors, const ElementwiseBinaryCommonAttributes &attributes)
+Status ClComponentElementwiseBinary::validate(const ArgumentPack<ITensorInfo>         &tensors,
+                                              const ElementwiseBinaryCommonAttributes &attributes)
 {
     const auto lhs = tensors.get_const_tensor(TensorType::ACL_SRC_0);
     const auto rhs = tensors.get_const_tensor(TensorType::ACL_SRC_1);
     const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
 
     // Check operator type
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(), "Provided Elementwise operation not supported.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(supported_ops.find(attributes.operation()) == supported_ops.end(),
+                                    "Provided Elementwise operation not supported.");
 
     // Check validity
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
 
     //Check data type for different elementwise operators
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32, DataType::S16, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16, DataType::S32,
+                                                         DataType::S16, DataType::U8);
 
     // dst shape is correct
     const TensorShape out_shape = TensorShape::broadcast_shape(lhs->tensor_shape(), rhs->tensor_shape());
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+                                    "Wrong shape for dst.");
 
     const auto &lhs_shape = lhs->tensor_shape();
     const auto &rhs_shape = rhs->tensor_shape();
     const auto &dst_shape = dst->tensor_shape();
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(lhs_shape, dst_shape, 0) && detail::have_different_dimensions(rhs_shape, dst_shape, 0),
-        "Only LHS or RHS can be broadcasting, not both.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 0) &&
+                                        detail::have_different_dimensions(rhs_shape, dst_shape, 0),
+                                    "Only LHS or RHS can be broadcasting, not both.");
 
     // Dimension Y and Z are collapsed together in the current kernel implementation,
     // hence they cannot be independently broadcast or non-broadcast.
     // See: ClTemplateElementwiseBinary::get_window
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        (lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) != (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]),
-        "Dimension Y and Z must both be either broadcast or non-broadcast.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_shape[1] != dst_shape[1] || rhs_shape[1] != dst_shape[1]) !=
+                                        (lhs_shape[2] != dst_shape[2] || rhs_shape[2] != dst_shape[2]),
+                                    "Dimension Y and Z must both be either broadcast or non-broadcast.");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(lhs_shape, dst_shape, 3),
-        "LHS broadcast in dimension 3 or higher is not supported.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(lhs_shape, dst_shape, 3),
+                                    "LHS broadcast in dimension 3 or higher is not supported.");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
-        detail::have_different_dimensions(rhs_shape, dst_shape, 3),
-        "RHS broadcast in dimension 3 or higher is not supported.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(rhs_shape, dst_shape, 3),
+                                    "RHS broadcast in dimension 3 or higher is not supported.");
 
     // Matching data type
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
@@ -112,22 +112,15 @@
 ClComponentElementwiseBinary::~ClComponentElementwiseBinary()
 {
 }
-ClComponentElementwiseBinary::ClComponentElementwiseBinary(
-    ComponentId                      id,
-    const Properties                &properties,
-    const ArgumentPack<ITensorInfo> &tensors,
-    const Attributes                &attributes)
-    : IGpuKernelComponent{ id, properties, tensors },
+ClComponentElementwiseBinary::ClComponentElementwiseBinary(ComponentId                      id,
+                                                           const Properties                &properties,
+                                                           const ArgumentPack<ITensorInfo> &tensors,
+                                                           const Attributes                &attributes)
+    : IGpuKernelComponent{id, properties, tensors},
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer
-{
-    std::make_unique<ClTemplateElementwiseBinary>(id, tensors, attributes)
-}
+      _component_writer{std::make_unique<ClTemplateElementwiseBinary>(id, tensors, attributes)}
 #else  //ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer
-{
-    std::make_unique<GpuCkwElementwiseBinary>(id, tensors, attributes)
-}
+      _component_writer{std::make_unique<GpuCkwElementwiseBinary>(id, tensors, attributes)}
 #endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
index f717590..7589b97 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h
@@ -82,17 +82,17 @@
      * |S16            |S16            |S16            |
      * |U8             |U8             |U8             |
      */
-    static Status validate(const ArgumentPack<ITensorInfo> &tensors, const ElementwiseBinaryCommonAttributes &attributes);
+    static Status validate(const ArgumentPack<ITensorInfo>         &tensors,
+                           const ElementwiseBinaryCommonAttributes &attributes);
 
     /** Constructor
      *
      * Similar to @ref ClComponentElementwiseBinary::validate()
      */
-    ClComponentElementwiseBinary(
-        ComponentId                      id,
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors,
-        const Attributes                &attributes);
+    ClComponentElementwiseBinary(ComponentId                      id,
+                                 const Properties                &properties,
+                                 const ArgumentPack<ITensorInfo> &tensors,
+                                 const Attributes                &attributes);
 
     /** Destructor */
     ~ClComponentElementwiseBinary() override;
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp
index 279c77e..27c13bd 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.cpp
@@ -25,9 +25,10 @@
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h"
 
@@ -37,10 +38,9 @@
 {
 namespace dynamic_fusion
 {
-Status ClComponentLogits1DMaxShiftExpSum::validate(
-    const Properties                &properties,
-    const ArgumentPack<ITensorInfo> &tensors,
-    const Attributes                &attributes)
+Status ClComponentLogits1DMaxShiftExpSum::validate(const Properties                &properties,
+                                                   const ArgumentPack<ITensorInfo> &tensors,
+                                                   const Attributes                &attributes)
 {
     ARM_COMPUTE_UNUSED(properties, attributes);
 
@@ -75,8 +75,8 @@
                                                                      const Properties                &properties,
                                                                      const ArgumentPack<ITensorInfo> &tensors,
                                                                      const Attributes                &attributes)
-    : IGpuKernelComponent{ id, properties, tensors },
-      _component_writer{ std::make_unique<ClTemplateLogits1DMaxShiftExpSum>(id, tensors, attributes) }
+    : IGpuKernelComponent{id, properties, tensors},
+      _component_writer{std::make_unique<ClTemplateLogits1DMaxShiftExpSum>(id, tensors, attributes)}
 {
 }
 
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h
index b5db458..91ab5de 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DMAXSHIFTEXPSUM
 
 #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 
 namespace arm_compute
@@ -89,10 +90,8 @@
      * |F16        | F16       | F16       |
      * |F32        | F32       | F32       |
      */
-    static Status validate(
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors,
-        const Attributes                &attributes);
+    static Status
+    validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
 
     /** Constructor
      *
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp
index 7864d56..fb25443 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.cpp
@@ -25,9 +25,10 @@
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h"
 
@@ -37,10 +38,9 @@
 {
 namespace dynamic_fusion
 {
-Status ClComponentLogits1DNorm::validate(
-    const Properties                &properties,
-    const ArgumentPack<ITensorInfo> &tensors,
-    const Attributes                &attributes)
+Status ClComponentLogits1DNorm::validate(const Properties                &properties,
+                                         const ArgumentPack<ITensorInfo> &tensors,
+                                         const Attributes                &attributes)
 {
     ARM_COMPUTE_UNUSED(properties, attributes);
 
@@ -77,8 +77,8 @@
                                                  const Properties                &properties,
                                                  const ArgumentPack<ITensorInfo> &tensors,
                                                  const Attributes                &attributes)
-    : IGpuKernelComponent{ id, properties, tensors },
-      _component_writer{ std::make_unique<ClTemplateLogits1DNorm>(id, tensors, attributes) }
+    : IGpuKernelComponent{id, properties, tensors},
+      _component_writer{std::make_unique<ClTemplateLogits1DNorm>(id, tensors, attributes)}
 {
 }
 
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h
index 5bd350b..74c0273 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTLOGITS1DNORM
 
 #include "arm_compute/dynamic_fusion/sketch/attributes/SoftmaxAttributes.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 
 namespace arm_compute
@@ -86,10 +87,8 @@
      * |F16        | F16       | F16       |
      * |F32        | F32       | F32       |
      */
-    static Status validate(
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors,
-        const Attributes                &attributes);
+    static Status
+    validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
 
     /** Constructor
      *
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
index d415769..409b191 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.cpp
@@ -24,13 +24,15 @@
 #include "ClComponentPool2d.h"
 
 #include "arm_compute/core/Error.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/dynamic_fusion/sketch/gpu/ckw_driver/components/GpuCkwPool2d.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h"
 #include "src/dynamic_fusion/utils/Utils.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -39,23 +41,24 @@
 {
 namespace dynamic_fusion
 {
-Status ClComponentPool2d::validate(
-    const Properties                &properties,
-    const ArgumentPack<ITensorInfo> &tensors,
-    const Attributes                &attributes,
-    const Settings                  &settings)
+Status ClComponentPool2d::validate(const Properties                &properties,
+                                   const ArgumentPack<ITensorInfo> &tensors,
+                                   const Attributes                &attributes,
+                                   const Settings                  &settings)
 {
     ARM_COMPUTE_UNUSED(properties);
     const auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
     const auto dst = tensors.get_const_tensor(TensorType::ACL_DST_0);
 
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_ERROR_ON_MSG((attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX), "Unsupported Pooling type");
+    ARM_COMPUTE_ERROR_ON_MSG((attributes.pool_type() != PoolingType::AVG && attributes.pool_type() != PoolingType::MAX),
+                             "Unsupported Pooling type");
 
     // 1. Check validity
     // Check if pooling is valid
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())),
-                                    "Pooling region that is entirely outside input tensor is unsupported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        is_pool_region_entirely_outside_input(convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())),
+        "Pooling region that is entirely outside input tensor is unsupported");
 
     // Matching data type
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
@@ -70,8 +73,9 @@
     // Device requirements are met
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
-                                                       misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+        dst->tensor_shape(), misc::shape_calculator::compute_pool_shape(
+                                 *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision())));
 
     // 2. Check support level
     // Data type
@@ -83,23 +87,16 @@
     return Status{};
 }
 
-ClComponentPool2d::ClComponentPool2d(
-    ComponentId                      id,
-    const Properties                &properties,
-    const ArgumentPack<ITensorInfo> &tensors,
-    const Attributes                &attributes,
-    const Settings                  &settings)
-    : IGpuKernelComponent{ id, properties, tensors },
+ClComponentPool2d::ClComponentPool2d(ComponentId                      id,
+                                     const Properties                &properties,
+                                     const ArgumentPack<ITensorInfo> &tensors,
+                                     const Attributes                &attributes,
+                                     const Settings                  &settings)
+    : IGpuKernelComponent{id, properties, tensors},
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer
-{
-    std::make_unique<ClTemplatePool2d>(id, tensors, attributes, settings)
-}
+      _component_writer{std::make_unique<ClTemplatePool2d>(id, tensors, attributes, settings)}
 #else  //ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer
-{
-    std::make_unique<GpuCkwPool2d>(id, tensors, attributes, settings)
-}
+      _component_writer{std::make_unique<GpuCkwPool2d>(id, tensors, attributes, settings)}
 #endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
index 6814bf9..98fed65 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h
@@ -25,6 +25,7 @@
 #define ACL_SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTPOOL2D_H
 
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 
 namespace arm_compute
@@ -82,11 +83,10 @@
      * |F16            |F16            |
      * |F32            |F32            |
      */
-    static Status validate(
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors,
-        const Attributes                &attributes,
-        const Settings                  &settings);
+    static Status validate(const Properties                &properties,
+                           const ArgumentPack<ITensorInfo> &tensors,
+                           const Attributes                &attributes,
+                           const Settings                  &settings);
 
     /** Constructor
      *
@@ -96,12 +96,11 @@
      * @param[in]     attributes Component attributes
      * @param[in]     settings   Component settings
      */
-    ClComponentPool2d(
-        ComponentId                      id,
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors,
-        const Attributes                &attributes,
-        const Settings                  &settings);
+    ClComponentPool2d(ComponentId                      id,
+                      const Properties                &properties,
+                      const ArgumentPack<ITensorInfo> &tensors,
+                      const Attributes                &attributes,
+                      const Settings                  &settings);
 
     /** Destructor */
     ~ClComponentPool2d() override;
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
index 66e2ee6..0ece9de 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.cpp
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "ClComponentReshape.h"
+
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h"
 
@@ -49,12 +51,10 @@
     return Status{};
 }
 
-ClComponentReshape::ClComponentReshape(
-    ComponentId                      id,
-    const Properties                &properties,
-    const ArgumentPack<ITensorInfo> &tensors)
-    : IGpuKernelComponent{ id, properties, tensors },
-      _component_writer{ std::make_unique<ClTemplateReshape>(id, tensors) }
+ClComponentReshape::ClComponentReshape(ComponentId                      id,
+                                       const Properties                &properties,
+                                       const ArgumentPack<ITensorInfo> &tensors)
+    : IGpuKernelComponent{id, properties, tensors}, _component_writer{std::make_unique<ClTemplateReshape>(id, tensors)}
 {
 }
 ClComponentReshape::~ClComponentReshape()
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
index f8d165b..78163d6 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h
@@ -73,10 +73,7 @@
      * @param[in] properties Component properties @ref Properties
      * @param[in] tensors    Tensor arguments to the component
      */
-    ClComponentReshape(
-        ComponentId                      id,
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors);
+    ClComponentReshape(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
 
     /** Destructor */
     ~ClComponentReshape() override;
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
index 6df1d9b..b05eb04 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.cpp
@@ -66,7 +66,9 @@
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
 
     // Align corners and sampling policy conformance
-    ARM_COMPUTE_RETURN_ERROR_ON(attributes.align_corners() && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(attributes.sampling_policy()));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        attributes.align_corners() &&
+        !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(attributes.sampling_policy()));
 
     // All tensor infos are initialized
     ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() == 0);
@@ -79,11 +81,11 @@
                                      const IGpuKernelComponent::Properties &properties,
                                      const ArgumentPack<ITensorInfo>       &tensors,
                                      const ClComponentResize::Attributes   &attributes)
-    : IGpuKernelComponent{ id, properties, tensors },
+    : IGpuKernelComponent{id, properties, tensors},
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer{ std::make_unique<ClTemplateResize>(id, tensors, attributes) }
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer{ std::make_unique<GpuCkwResize>(id, tensors, attributes) }
+      _component_writer{std::make_unique<ClTemplateResize>(id, tensors, attributes)}
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
+      _component_writer{std::make_unique<GpuCkwResize>(id, tensors, attributes)}
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
@@ -94,7 +96,7 @@
 
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuTemplateComponentWriter *ClComponentResize::template_writer() const
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
 const IGpuCkwComponentDriver *ClComponentResize::ckw_component_driver() const
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
 {
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
index 474524f..29276c3 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h
@@ -26,6 +26,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTRESIZE
 
 #include "arm_compute/dynamic_fusion/sketch/attributes/ResizeAttributes.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 
 namespace arm_compute
@@ -43,7 +44,7 @@
 /** Forward declaration */
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
 class ClTemplateResize;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
 class GpuCkwResize;
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
@@ -82,10 +83,8 @@
      * |U8             |U8             |
      * |S16            |S16            |
      */
-    static Status validate(
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors,
-        const Attributes                &attributes);
+    static Status
+    validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
 
     /** Constructor
      *
@@ -114,7 +113,7 @@
     /** Get writer for the component */
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuTemplateComponentWriter *template_writer() const override;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
     const IGpuCkwComponentDriver *ckw_component_driver() const override;
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
 
@@ -127,7 +126,7 @@
 private:
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<ClTemplateResize> _component_writer;
-#else // ACL_INTERNAL_TEST_CKW_IN_DF
+#else  // ACL_INTERNAL_TEST_CKW_IN_DF
     std::unique_ptr<GpuCkwResize> _component_writer;
 #endif // ACL_INTERNAL_TEST_CKW_IN_DF
 };
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
index 12b81c3..dcbecaf 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.cpp
@@ -38,25 +38,19 @@
 {
 namespace dynamic_fusion
 {
-Status ClComponentStore::validate(
-    const Properties                &properties,
-    const ArgumentPack<ITensorInfo> &tensors)
+Status ClComponentStore::validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
 {
     ARM_COMPUTE_UNUSED(properties, tensors);
     return Status{};
 }
-ClComponentStore::ClComponentStore(ComponentId id, const Properties &properties, const ArgumentPack<ITensorInfo> &tensors)
-    : IGpuKernelComponent{ id, properties, tensors },
+ClComponentStore::ClComponentStore(ComponentId                      id,
+                                   const Properties                &properties,
+                                   const ArgumentPack<ITensorInfo> &tensors)
+    : IGpuKernelComponent{id, properties, tensors},
 #ifndef ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer
-{
-    std::make_unique<ClTemplateStore>(id, tensors)
-}
+      _component_writer{std::make_unique<ClTemplateStore>(id, tensors)}
 #else  //ACL_INTERNAL_TEST_CKW_IN_DF
-      _component_writer
-{
-    std::make_unique<GpuCkwStore>(id, tensors)
-}
+      _component_writer{std::make_unique<GpuCkwStore>(id, tensors)}
 #endif //ACL_INTERNAL_TEST_CKW_IN_DF
 {
 }
diff --git a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
index 853ee39..948785c 100644
--- a/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
+++ b/src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_COMPONENTS_CL_CLCOMPONENTSTORE
 
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
+
 #include <memory>
 
 namespace arm_compute
@@ -70,9 +71,7 @@
      * |:--------------|:--------------|
      * |All            |All            |
      */
-    static Status validate(
-        const Properties                &properties,
-        const ArgumentPack<ITensorInfo> &tensors);
+    static Status validate(const Properties &properties, const ArgumentPack<ITensorInfo> &tensors);
     /** Constructor
      *
      * Similar to @ref ClComponentStore::validate()
diff --git a/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h
index bc7133f..4c3e84e 100644
--- a/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/components/utils/type_printer/ElementwiseBinary.h
@@ -46,18 +46,16 @@
  */
 inline ::std::ostream &operator<<(::std::ostream &os, const ClComponentElementwiseBinary::Attributes::ElementwiseOp &op)
 {
-    const std::map<ClComponentElementwiseBinary::Attributes::ElementwiseOp, std::string> op_name =
-    {
-        { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add, "add" },
-        { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Div, "div" },
-        { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Max, "max" },
-        { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Min, "min" },
-        { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul, "mul" },
-        { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Power, "power" },
-        { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Prelu, "prelu" },
-        { ClComponentElementwiseBinary::Attributes::ElementwiseOp::SquaredDiff, "squareddiff" },
-        { ClComponentElementwiseBinary::Attributes::ElementwiseOp::Sub, "sub" }
-    };
+    const std::map<ClComponentElementwiseBinary::Attributes::ElementwiseOp, std::string> op_name = {
+        {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Add, "add"},
+        {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Div, "div"},
+        {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Max, "max"},
+        {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Min, "min"},
+        {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Mul, "mul"},
+        {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Power, "power"},
+        {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Prelu, "prelu"},
+        {ClComponentElementwiseBinary::Attributes::ElementwiseOp::SquaredDiff, "squareddiff"},
+        {ClComponentElementwiseBinary::Attributes::ElementwiseOp::Sub, "sub"}};
     os << op_name.at(op);
     return os;
 }
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
index e7ee1c1..2cec67d 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuAdd.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuAdd.h"
+
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
 
 #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
@@ -32,12 +33,11 @@
 {
 namespace dynamic_fusion
 {
-Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch,
-                           const ITensorInfo       *lhs,
-                           const ITensorInfo       *rhs)
+Status GpuAdd::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
+                                                         DataType::S16, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
 
     // Set the elementwise operation to Add then call the elementwise common validate_op
@@ -46,12 +46,11 @@
     return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
 }
 
-Status GpuAdd::is_supported_op(const GpuWorkloadContext &context,
-                               const ITensorInfo        *lhs,
-                               const ITensorInfo        *rhs)
+Status GpuAdd::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
+                                                         DataType::S16, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
 
     // Set the elementwise operation to Add then call the elementwise common is_supported_op
@@ -60,9 +59,7 @@
     return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
 }
 
-ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch,
-                               ITensorInfo       *lhs,
-                               ITensorInfo       *rhs)
+ITensorInfo *GpuAdd::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
 {
     // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op()
     // Set the elementwise operation to Add then call the elementwise common create_op
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
index 33c2d43..6f35e66 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuCast.cpp
@@ -23,12 +23,11 @@
  */
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuCast.h"
 
+#include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentCast.h"
-
-#include "src/common/utils/Log.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 
 namespace arm_compute
 {
@@ -49,7 +48,7 @@
     TensorInfo         dst_info_to_validate;
     const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
 
-    if(dst != nullptr)
+    if (dst != nullptr)
     {
         dst_info_to_validate_ptr = dst;
     }
@@ -58,25 +57,22 @@
 
     // Check support level
     // Data Type
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src,
-                                                         1,
-                                                         DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
-                                                         DataType::U16, DataType::U32, DataType::S32, DataType::F16,
-                                                         DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr,
-                                                         1,
-                                                         DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16,
-                                                         DataType::U16, DataType::U32, DataType::S32, DataType::F16,
-                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+        src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL,
+        DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst_info_to_validate_ptr, 1, DataType::U8, DataType::S8,
+                                                         DataType::QASYMM8, DataType::S16, DataType::U16, DataType::U32,
+                                                         DataType::S32, DataType::F16, DataType::F32);
 
-    if(context.gpu_language() == GpuLanguage::OpenCL)
+    if (context.gpu_language() == GpuLanguage::OpenCL)
     {
         const auto cl_compile_ctx = context.cl_compile_context();
         ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
         // Validate Cast Component
         {
-            const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
-            auto       settings   = ClComponentCast::Settings();
+            const auto properties =
+                IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+            auto settings = ClComponentCast::Settings();
 
             ArgumentPack<ITensorInfo> arguments;
             arguments.add_const_tensor(ACL_SRC_0, src);
@@ -94,16 +90,13 @@
 constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
 } // namespace
 
-Status GpuCast::is_supported_op(const GpuWorkloadContext &context,
-                                const ITensorInfo        *src,
-                                const CastAttributes     &attributes)
+Status
+GpuCast::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const CastAttributes &attributes)
 {
     return is_supported_op_helper(context, src, nullptr, attributes);
 }
 
-Status GpuCast::validate_op(const GpuWorkloadSketch &sketch,
-                            const ITensorInfo       *src,
-                            const CastAttributes    &attributes)
+Status GpuCast::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const CastAttributes &attributes)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
@@ -127,9 +120,7 @@
     return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
 }
 
-ITensorInfo *GpuCast::create_op(GpuWorkloadSketch    &sketch,
-                                ITensorInfo          *src,
-                                const CastAttributes &attributes)
+ITensorInfo *GpuCast::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const CastAttributes &attributes)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_LOG_PARAMS(src, attributes);
@@ -145,14 +136,15 @@
     GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
     const auto              *sketch_ctx = sketch.implementation().context();
 
-    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+    if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
     {
         ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr);
 
         // Add Depthwise Conv2d Component
         {
-            const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
-            auto       settings   = ClComponentCast::Settings();
+            const auto properties =
+                IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+            auto settings = ClComponentCast::Settings();
 
             ArgumentPack<ITensorInfo> arguments;
             arguments.add_const_tensor(ACL_SRC_0, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
index 89b533c..697b7d4 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuClamp.cpp
@@ -25,14 +25,13 @@
 
 #include "arm_compute/core/experimental/Types.h"
 
+#include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace experimental
@@ -48,12 +47,13 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.max_val() < attributes.min_val(), "Maximum clamp value cannot be lower than minimum value");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.max_val() < attributes.min_val(),
+                                    "Maximum clamp value cannot be lower than minimum value");
 
     TensorInfo         dst_info_to_validate;
     const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
 
-    if(dst != nullptr)
+    if (dst != nullptr)
     {
         dst_info_to_validate_ptr = dst;
     }
@@ -61,16 +61,15 @@
     auto_init_if_empty(dst_info_to_validate, *src->clone());
 
     // CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped
-    const ClComponentActivation::Attributes act_info
-    {
-        ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, attributes.max_val(), attributes.min_val()
-    };
+    const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                     attributes.max_val(), attributes.min_val()};
 
     // Check components
-    if(context.gpu_language() == GpuLanguage::OpenCL)
+    if (context.gpu_language() == GpuLanguage::OpenCL)
     {
         // Validate Activation Component
-        const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+        const auto properties =
+            IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
         ArgumentPack<ITensorInfo> arguments;
         arguments.add_const_tensor(ACL_SRC, src);
@@ -87,16 +86,13 @@
 constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
 } // namespace
 
-Status GpuClamp::is_supported_op(const GpuWorkloadContext &context,
-                                 const ITensorInfo        *src,
-                                 const ClampAttributes    &attributes)
+Status
+GpuClamp::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ClampAttributes &attributes)
 {
     return is_supported_op_helper(context, src, nullptr, attributes);
 }
 
-Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch,
-                             const ITensorInfo       *src,
-                             const ClampAttributes   &attributes)
+Status GpuClamp::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ClampAttributes &attributes)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
 
@@ -121,9 +117,7 @@
     return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
 }
 
-ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch     &sketch,
-                                 ITensorInfo           *src,
-                                 const ClampAttributes &attributes)
+ITensorInfo *GpuClamp::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const ClampAttributes &attributes)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_LOG_PARAMS(src, attributes);
@@ -139,18 +133,16 @@
     GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
 
     // CLAMP operator is implemented as LU_BOUNDED_RELU with the alpha and beta variables swapped
-    const ClComponentActivation::Attributes act_info
-    {
-        ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, attributes.max_val(), attributes.min_val()
-    };
+    const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                     attributes.max_val(), attributes.min_val()};
 
     const auto *const sketch_ctx = sketch.implementation().context();
 
-    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+    if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
     {
         // Add Activation Component
         auto properties = IGpuKernelComponent::Properties();
-        properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+        properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
         ArgumentPack<ITensorInfo> arguments;
         arguments.add_const_tensor(ACL_SRC, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
index cb270ed..aaeec54 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuConv2d.cpp
@@ -24,15 +24,15 @@
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h"
 
 #include "arm_compute/core/KernelDescriptors.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 #include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h"
 #include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
@@ -45,24 +45,30 @@
 {
 namespace
 {
-DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo
+config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
 {
     // Get GPU target
     GPUTarget gpu_target = CLScheduler::get().target();
 
-    std::unique_ptr<arm_compute::cl_direct_conv::IClDirectConvKernelConfig> t = arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target);
+    std::unique_ptr<arm_compute::cl_direct_conv::IClDirectConvKernelConfig> t =
+        arm_compute::cl_direct_conv::ClDirectConvKernelConfigurationFactory::create(gpu_target);
 
     return t->configure(src, weights, conv_info);
 }
 
-void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const Conv2dAttributes &attributes)
+void calculate_and_init_dst_if_empty(ITensorInfo            *dst,
+                                     const ITensorInfo      *src,
+                                     const ITensorInfo      *wei,
+                                     const Conv2dAttributes &attributes)
 {
-    if(dst->total_size() == 0U)
+    if (dst->total_size() == 0U)
     {
-        const auto shape = misc::shape_calculator::compute_deep_convolution_shape(src->tensor_shape(), src->data_layout(), wei->tensor_shape(),
-                                                                                  PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
-                                                                                                attributes.pad().right,
-                                                                                                attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
+        const auto shape = misc::shape_calculator::compute_deep_convolution_shape(
+            src->tensor_shape(), src->data_layout(), wei->tensor_shape(),
+            PadStrideInfo(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+                          attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+                          DimensionRoundingType::FLOOR)); // use the default DimensionRoundingType
 
         auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
     }
@@ -83,7 +89,7 @@
     TensorInfo         dst_info_to_validate;
     const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
 
-    if(dst != nullptr)
+    if (dst != nullptr)
     {
         dst_info_to_validate_ptr = dst;
     }
@@ -98,18 +104,20 @@
 
     // Check components
     const auto gpu_target = context.gpu_target();
-    if(context.gpu_language() == GpuLanguage::OpenCL)
+    if (context.gpu_language() == GpuLanguage::OpenCL)
     {
         const auto cl_compile_ctx = context.cl_compile_context();
         ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
         // Validate Direct Conv2d Component
         {
-            const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
-            auto       settings   = ClComponentDirectConv2d::Settings();
+            const auto properties =
+                IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+            auto settings = ClComponentDirectConv2d::Settings();
 
             settings.fast_relaxed_math(
-                (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
-                && (dst_info_to_validate_ptr->data_type() == DataType::F32 || dst_info_to_validate_ptr->data_type() == DataType::F16));
+                (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+                (dst_info_to_validate_ptr->data_type() == DataType::F32 ||
+                 dst_info_to_validate_ptr->data_type() == DataType::F16));
 
             ArgumentPack<ITensorInfo> arguments;
             arguments.add_const_tensor(ACL_SRC_0, src);
@@ -142,14 +150,14 @@
                               const ITensorInfo       *src,
                               const ITensorInfo       *wei,
                               const ITensorInfo       *bia,
-                              const Conv2dAttributes &attributes)
+                              const Conv2dAttributes  &attributes)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, wei);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!wei->are_values_constant(), "Dynamic weights are not supported");
 
     // Check if tensors have valid id. I.e. they are created from a sketch
     ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id());
-    if(bia != nullptr)
+    if (bia != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
     }
@@ -178,16 +186,13 @@
     return is_supported_op_helper(*sketch.gpu_context(), src, wei, bia, &dst_info_to_validate, attributes);
 }
 
-ITensorInfo *GpuConv2d::create_op(GpuWorkloadSketch      &sketch,
-                                  ITensorInfo            *src,
-                                  ITensorInfo            *wei,
-                                  ITensorInfo            *bia,
-                                  const Conv2dAttributes &attributes)
+ITensorInfo *GpuConv2d::create_op(
+    GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *wei, ITensorInfo *bia, const Conv2dAttributes &attributes)
 {
     ARM_COMPUTE_LOG_PARAMS(src, wei, bia, attributes);
     PadStrideInfo conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
-                            attributes.pad().right,
-                            attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR);
+                            attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
+                            DimensionRoundingType::FLOOR);
     // Initialize the direct convolution descriptor
     const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, wei, conv_info);
 
@@ -207,7 +212,7 @@
 
     const auto gpu_target = sketch_ctx->gpu_target();
 
-    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+    if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
     {
         const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
         ARM_COMPUTE_ERROR_ON(cl_compile_ctx == nullptr);
@@ -216,17 +221,17 @@
         // Add Direct Conv2d Component
         {
             auto properties = IGpuKernelComponent::Properties();
-            properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+            properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
             auto settings = ClComponentDirectConv2d::Settings();
 
             settings.fast_relaxed_math(
-                (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
-                && (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16));
+                (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+                (dst->data_type() == DataType::F32 || dst->data_type() == DataType::F16));
 
             settings.direct_conv_descriptor(desc);
 
-            if(settings.export_to_cl_image())
+            if (settings.export_to_cl_image())
             {
                 arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
             }
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
index c72098e..e2b673b 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuDepthwiseConv2d.cpp
@@ -28,8 +28,8 @@
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 #include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"
 #include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
@@ -42,20 +42,20 @@
 {
 namespace
 {
-void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ITensorInfo *wei, const DepthwiseConv2dAttributes &attributes)
+void calculate_and_init_dst_if_empty(ITensorInfo                     *dst,
+                                     const ITensorInfo               *src,
+                                     const ITensorInfo               *wei,
+                                     const DepthwiseConv2dAttributes &attributes)
 {
-    if(dst->total_size() == 0U)
+    if (dst->total_size() == 0U)
     {
-        const PadStrideInfo pad_stride_info(attributes.stride().x(),
-                                            attributes.stride().y(),
-                                            attributes.pad().left,
-                                            attributes.pad().right,
-                                            attributes.pad().top,
-                                            attributes.pad().bottom,
+        const PadStrideInfo pad_stride_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
+                                            attributes.pad().right, attributes.pad().top, attributes.pad().bottom,
                                             attributes.dimension_rounding_type());
 
-        const ConvolutionInfo conv_info{ pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(), attributes.dilation() };
-        const TensorShape     shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info);
+        const ConvolutionInfo conv_info{pad_stride_info, attributes.depth_multiplier(), ActivationLayerInfo(),
+                                        attributes.dilation()};
+        const TensorShape shape = misc::shape_calculator::compute_depthwise_convolution_shape(*src, *wei, conv_info);
 
         auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
     }
@@ -76,7 +76,7 @@
     TensorInfo         dst_info_to_validate;
     const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
 
-    if(dst != nullptr)
+    if (dst != nullptr)
     {
         dst_info_to_validate_ptr = dst;
     }
@@ -91,40 +91,44 @@
 
     const GpuTarget gpu_target = context.gpu_target();
 
-    if(context.gpu_language() == GpuLanguage::OpenCL)
+    if (context.gpu_language() == GpuLanguage::OpenCL)
     {
         const CLCompileContext *cl_compile_ctx = context.cl_compile_context();
         ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
 
         // Validate Depthwise Conv2d Component
         {
-            const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
-            auto       settings   = ClComponentDepthwiseConv2d::Settings();
+            const auto properties =
+                IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+            auto settings = ClComponentDepthwiseConv2d::Settings();
 
-            const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
-                                                 attributes.pad().right,
-                                                 attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR);
+            const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(),
+                                                 attributes.pad().left, attributes.pad().right, attributes.pad().top,
+                                                 attributes.pad().bottom, DimensionRoundingType::FLOOR);
 
             // Get the depthwise convolution compute parameters
-            auto                       t        = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
-            const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
+            auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+            const DWCComputeKernelInfo dwc_info =
+                t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
 
             settings.fast_relaxed_math(
-                (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
-                && (dst_info_to_validate_ptr->data_type() == DataType::F32 || dst_info_to_validate_ptr->data_type() == DataType::F16));
+                (gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+                (dst_info_to_validate_ptr->data_type() == DataType::F32 ||
+                 dst_info_to_validate_ptr->data_type() == DataType::F16));
 
             settings.is_fma_available(get_arch_from_target(gpu_target) == GPUTarget::MIDGARD)
-            .m0(dwc_info.m0)
-            .n0(dwc_info.n0)
-            .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
-            .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
+                .m0(dwc_info.m0)
+                .n0(dwc_info.n0)
+                .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
+                .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
 
             ArgumentPack<ITensorInfo> arguments;
             arguments.add_const_tensor(ACL_SRC_0, src);
             arguments.add_const_tensor(ACL_SRC_1, wei);
             arguments.add_const_tensor(ACL_SRC_2, bia);
             arguments.add_const_tensor(ACL_DST_0, dst_info_to_validate_ptr);
-            ARM_COMPUTE_RETURN_ON_ERROR(ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                ClComponentDepthwiseConv2d::validate(properties, arguments, attributes, settings));
         }
     }
     else
@@ -158,7 +162,7 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !wei->has_valid_id());
 
-    if(bia != nullptr)
+    if (bia != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(!bia->has_valid_id());
     }
@@ -205,35 +209,37 @@
     const auto              *sketch_ctx = sketch.implementation().context();
     const GpuTarget          gpu_target = sketch_ctx->gpu_target();
 
-    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+    if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context());
 
         // Add Depthwise Conv2d Component
         {
-            const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
-            auto       settings   = ClComponentDepthwiseConv2d::Settings();
+            const auto properties =
+                IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
+            auto settings = ClComponentDepthwiseConv2d::Settings();
 
-            const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(), attributes.pad().left,
-                                                 attributes.pad().right,
-                                                 attributes.pad().top, attributes.pad().bottom, DimensionRoundingType::FLOOR);
+            const PadStrideInfo legacy_conv_info(attributes.stride().x(), attributes.stride().y(),
+                                                 attributes.pad().left, attributes.pad().right, attributes.pad().top,
+                                                 attributes.pad().bottom, DimensionRoundingType::FLOOR);
 
             // Get the depthwise convolution compute parameters
-            auto                       t        = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
-            const DWCComputeKernelInfo dwc_info = t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
+            auto t = arm_compute::cl_dwc::ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+            const DWCComputeKernelInfo dwc_info =
+                t->configure(src, wei, legacy_conv_info, attributes.dilation(), attributes.depth_multiplier());
 
             settings.is_fma_available(get_arch_from_target(gpu_target) != GPUTarget::MIDGARD)
-            .m0(dwc_info.m0)
-            .n0(dwc_info.n0)
-            .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
-            .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
+                .m0(dwc_info.m0)
+                .n0(dwc_info.n0)
+                .export_input_to_cl_image(dwc_info.export_input_to_cl_image)
+                .export_weights_to_cl_image(dwc_info.export_weights_to_cl_image);
 
-            if(settings.export_input_to_cl_image())
+            if (settings.export_input_to_cl_image())
             {
                 arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(src);
             }
 
-            if(settings.export_weights_to_cl_image())
+            if (settings.export_weights_to_cl_image())
             {
                 arm_compute::opencl::kernels::gemm::update_padding_for_cl_image(wei);
             }
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp
index 464a32c..b871171 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuMul.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuMul.h"
+
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
 
 #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
@@ -32,9 +33,7 @@
 {
 namespace dynamic_fusion
 {
-Status GpuMul::validate_op(const GpuWorkloadSketch &sketch,
-                           const ITensorInfo       *lhs,
-                           const ITensorInfo       *rhs)
+Status GpuMul::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
@@ -46,9 +45,7 @@
     return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
 }
 
-Status GpuMul::is_supported_op(const GpuWorkloadContext &context,
-                               const ITensorInfo        *lhs,
-                               const ITensorInfo        *rhs)
+Status GpuMul::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32);
@@ -60,9 +57,7 @@
     return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
 }
 
-ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch,
-                               ITensorInfo       *lhs,
-                               ITensorInfo       *rhs)
+ITensorInfo *GpuMul::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
 {
     // Set the elementwise operation to Mul then call the elementwise common create_op
     ElementwiseBinaryCommonAttributes common_attributes{};
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
index 107a5e5..f0d368d 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuOutput.cpp
@@ -26,10 +26,9 @@
 
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
-
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentStore.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/dynamic_fusion/utils/Utils.h"
 
 namespace arm_compute
@@ -43,9 +42,7 @@
 constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
 } // namespace
 
-Status GpuOutput::is_supported_op(const GpuWorkloadContext &context,
-                                  const ITensorInfo        *src,
-                                  const ITensorInfo        *dst)
+Status GpuOutput::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
 
@@ -60,9 +57,7 @@
     return Status{};
 }
 
-Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch,
-                              const ITensorInfo       *src,
-                              const ITensorInfo       *dst)
+Status GpuOutput::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
@@ -90,9 +85,7 @@
     return status;
 }
 
-void GpuOutput::create_op(GpuWorkloadSketch &sketch,
-                          ITensorInfo       *src,
-                          ITensorInfo       *dst)
+void GpuOutput::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst)
 {
     ARM_COMPUTE_LOG_PARAMS(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(GpuOutput::validate_op(sketch, src, dst));
@@ -104,14 +97,14 @@
     auto      &comp_graph = sketch.implementation().component_graph();
     const auto sketch_ctx = sketch.implementation().context();
 
-    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+    if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
     {
         ARM_COMPUTE_ERROR_ON(sketch_ctx->cl_compile_context() == nullptr);
 
         // Add store component
         {
             IGpuKernelComponent::Properties properties;
-            properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+            properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
             ArgumentPack<ITensorInfo> arguments;
             arguments.add_const_tensor(ACL_SRC_0, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
index 7ecfa01..55c604a 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuPool2d.cpp
@@ -22,20 +22,21 @@
  * SOFTWARE.
  */
 
+#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
 #include "arm_compute/core/CL/CLCompileContext.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h"
-
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
-#include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSourceCode.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
 #include "src/dynamic_fusion/utils/Utils.h"
 
 namespace arm_compute
@@ -46,11 +47,15 @@
 {
 namespace
 {
-void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const Pool2dAttributes &attributes, const GpuPool2dSettings &settings)
+void calculate_and_init_dst_if_empty(ITensorInfo             *dst,
+                                     const ITensorInfo       *src,
+                                     const Pool2dAttributes  &attributes,
+                                     const GpuPool2dSettings &settings)
 {
-    if(dst->total_size() == 0U)
+    if (dst->total_size() == 0U)
     {
-        auto shape = misc::shape_calculator::compute_pool_shape(*src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()));
+        auto shape = misc::shape_calculator::compute_pool_shape(
+            *src, convert_pool_attr_to_pool_info(attributes, settings.mixed_precision()));
         auto_init_if_empty(*dst, src->clone()->set_tensor_shape(shape));
     }
 }
@@ -82,7 +87,7 @@
 
 Status GpuPool2d::validate_op(const GpuWorkloadSketch &sketch,
                               const ITensorInfo       *src,
-                              const Pool2dAttributes &attributes,
+                              const Pool2dAttributes  &attributes,
                               const GpuPool2dSettings &settings)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
@@ -110,7 +115,7 @@
 Status GpuPool2d::is_supported_op(const GpuWorkloadContext &context,
                                   const ITensorInfo        *src,
                                   const Pool2dAttributes   &attributes,
-                                  const GpuPool2dSettings &settings)
+                                  const GpuPool2dSettings  &settings)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
     // Data type
@@ -118,7 +123,8 @@
     // Data layout
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
     // Check exclude padding is not false
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(), "Exclude padding must be set to true in Attributes!");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!attributes.exclude_padding(),
+                                    "Exclude padding must be set to true in Attributes!");
 
     // Auto initialize dst tensor info
     TensorInfo dst_info_to_validate;
@@ -126,14 +132,15 @@
     calculate_and_init_dst_if_empty(&dst_info_to_validate, src, attributes, settings);
 
     // Check components
-    if(context.gpu_language() == GpuLanguage::OpenCL)
+    if (context.gpu_language() == GpuLanguage::OpenCL)
     {
         const auto cl_compile_ctx = context.cl_compile_context();
         ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
 
         // Validate Component
         {
-            const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+            const KernelProperties properties =
+                IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
             ArgumentPack<ITensorInfo> arguments;
             arguments.add_const_tensor(ACL_SRC_0, src);
@@ -148,10 +155,10 @@
     return Status{};
 }
 
-ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch  &sketch,
-                          ITensorInfo                *src,
-                          const Pool2dAttributes     &attributes,
-                          const GpuPool2dSettings    &settings)
+ITensorInfo *GpuPool2d::create_op(GpuWorkloadSketch       &sketch,
+                                  ITensorInfo             *src,
+                                  const Pool2dAttributes  &attributes,
+                                  const GpuPool2dSettings &settings)
 {
     // Assert validation
     ARM_COMPUTE_ERROR_THROW_ON(GpuPool2d::validate_op(sketch, src, attributes, settings));
@@ -168,7 +175,7 @@
 
     const auto sketch_ctx = sketch.implementation().context();
 
-    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+    if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
     {
         const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
         ARM_COMPUTE_UNUSED(cl_compile_ctx);
@@ -177,7 +184,7 @@
         // Add Component
         {
             auto properties = IGpuKernelComponent::Properties();
-            properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+            properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
             ArgumentPack<ITensorInfo> arguments;
             arguments.add_const_tensor(ACL_SRC_0, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp
index 0f43a57..3def7a1 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuReshape.cpp
@@ -22,12 +22,14 @@
  * SOFTWARE.
  */
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuReshape.h"
+
 #include "arm_compute/core/Error.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 
 namespace arm_compute
 {
@@ -40,14 +42,14 @@
 Status is_supported_op_helper(const GpuWorkloadContext &context,
                               const ITensorInfo        *src,
                               const ITensorInfo        *dst,
-                              const ReshapeAttributes &attributes)
+                              const ReshapeAttributes  &attributes)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
 
     TensorInfo         dst_info_to_validate;
     const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
 
-    if(dst != nullptr)
+    if (dst != nullptr)
     {
         dst_info_to_validate_ptr = dst;
     }
@@ -55,7 +57,7 @@
     auto_init_if_empty(dst_info_to_validate, src->clone()->set_tensor_shape(attributes.shape()));
 
     // Check components
-    if(context.gpu_language() == GpuLanguage::OpenCL)
+    if (context.gpu_language() == GpuLanguage::OpenCL)
     {
         const auto cl_compile_ctx = context.cl_compile_context();
         ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
@@ -78,16 +80,13 @@
 GpuOperatorType operator_type = GpuOperatorType::Complex;
 } // namespace
 
-Status GpuReshape::is_supported_op(const GpuWorkloadContext &context,
-                                   const ITensorInfo        *src,
-                                   const Attributes         &attributes)
+Status
+GpuReshape::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes)
 {
     return is_supported_op_helper(context, src, nullptr, attributes);
 }
 
-Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch,
-                               const ITensorInfo       *src,
-                               const Attributes        &attributes)
+Status GpuReshape::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const Attributes &attributes)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
@@ -111,9 +110,7 @@
     return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
 }
 
-ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch,
-                                   ITensorInfo       *src,
-                                   const Attributes &attributes)
+ITensorInfo *GpuReshape::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const Attributes &attributes)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_LOG_PARAMS(src, attributes.shape());
@@ -127,7 +124,7 @@
     // Translate into components and add to component graph
     auto      &comp_graph = sketch.implementation().component_graph();
     const auto sketch_ctx = sketch.implementation().context();
-    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+    if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
     {
         const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
         ARM_COMPUTE_UNUSED(cl_compile_ctx);
@@ -136,7 +133,7 @@
         // Add ElementwiseBinary Component
         {
             auto properties = IGpuKernelComponent::Properties();
-            properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+            properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
             ArgumentPack<ITensorInfo> arguments;
             arguments.add_const_tensor(ACL_SRC_0, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
index 5f52eea..fb09875 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuResize.cpp
@@ -26,12 +26,12 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentResize.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 
 namespace arm_compute
 {
@@ -43,7 +43,7 @@
 {
 void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *src, const ResizeAttributes &attributes)
 {
-    if(dst->total_size() == 0U)
+    if (dst->total_size() == 0U)
     {
         TensorShape out_shape = src->tensor_shape();
 
@@ -64,7 +64,7 @@
     TensorInfo         dst_info_to_validate;
     const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
 
-    if(dst != nullptr)
+    if (dst != nullptr)
     {
         dst_info_to_validate_ptr = dst;
     }
@@ -73,22 +73,25 @@
 
     // Check support level
     // Data type
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::U8, DataType::S16, DataType::F16, DataType::F32);
     // Data layout
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
     // Interpolation policy
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.interpolation_policy() != InterpolationPolicy::NEAREST_NEIGHBOR && attributes.interpolation_policy() != InterpolationPolicy::BILINEAR,
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(attributes.interpolation_policy() != InterpolationPolicy::NEAREST_NEIGHBOR &&
+                                        attributes.interpolation_policy() != InterpolationPolicy::BILINEAR,
                                     "Interpolation policy must be NEAREST_NEIGHBOR or BILINEAR");
 
     // Check components
-    if(context.gpu_language() == GpuLanguage::OpenCL)
+    if (context.gpu_language() == GpuLanguage::OpenCL)
     {
         const auto cl_compile_ctx = context.cl_compile_context();
         ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
 
         // Validate Activation Component
         {
-            const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+            const KernelProperties properties =
+                IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
             ArgumentPack<ITensorInfo> arguments;
             arguments.add_const_tensor(ACL_SRC_0, src);
@@ -107,16 +110,14 @@
 constexpr GpuOperatorType operator_type = GpuOperatorType::Complex;
 } // namespace
 
-Status GpuResize::is_supported_op(const GpuWorkloadContext &context,
-                                  const ITensorInfo        *src,
-                                  const Attributes         &attributes)
+Status
+GpuResize::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src, const Attributes &attributes)
 {
     return is_supported_op_helper(context, src, nullptr, attributes);
 }
 
-Status GpuResize::validate_op(const GpuWorkloadSketch     &sketch,
-                              const ITensorInfo           *src,
-                              const GpuResize::Attributes &attributes)
+Status
+GpuResize::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src, const GpuResize::Attributes &attributes)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id());
@@ -141,9 +142,7 @@
     return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
 }
 
-ITensorInfo *GpuResize::create_op(GpuWorkloadSketch           &sketch,
-                                  ITensorInfo                 *src,
-                                  const GpuResize::Attributes &attributes)
+ITensorInfo *GpuResize::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, const GpuResize::Attributes &attributes)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_LOG_PARAMS(src, attributes);
@@ -159,13 +158,14 @@
     GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
     const auto              *sketch_ctx = sketch.implementation().context();
 
-    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+    if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context());
 
         // Add Resize Component
         {
-            const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+            const auto properties =
+                IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
             ArgumentPack<ITensorInfo> arguments;
             arguments.add_const_tensor(ACL_SRC_0, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp
index 09debad..a2260c8 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.cpp
@@ -23,14 +23,15 @@
  */
 
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSigmoid.h"
+
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 
 namespace arm_compute
 {
@@ -40,9 +41,7 @@
 {
 namespace
 {
-Status is_supported_op_helper(const GpuWorkloadContext &context,
-                              const ITensorInfo        *src,
-                              const ITensorInfo        *dst)
+Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
@@ -50,20 +49,21 @@
     TensorInfo         dst_info_to_validate;
     const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
 
-    if(dst != nullptr)
+    if (dst != nullptr)
     {
         dst_info_to_validate_ptr = dst;
     }
 
     auto_init_if_empty(dst_info_to_validate, *src->clone());
 
-    const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::LOGISTIC };
+    const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC};
 
     // Check components
-    if(context.gpu_language() == GpuLanguage::OpenCL)
+    if (context.gpu_language() == GpuLanguage::OpenCL)
     {
         // Validate Activation Component
-        const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+        const auto properties =
+            IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
         ArgumentPack<ITensorInfo> arguments;
         arguments.add_const_tensor(ACL_SRC, src);
@@ -80,14 +80,12 @@
 constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
 } // namespace
 
-Status GpuSigmoid::is_supported_op(const GpuWorkloadContext &context,
-                                   const ITensorInfo        *src)
+Status GpuSigmoid::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src)
 {
     return is_supported_op_helper(context, src, nullptr);
 }
 
-Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch,
-                               const ITensorInfo       *src)
+Status GpuSigmoid::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
 
@@ -112,8 +110,7 @@
     return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate);
 }
 
-ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch,
-                                   ITensorInfo       *src)
+ITensorInfo *GpuSigmoid::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_LOG_PARAMS(src);
@@ -128,15 +125,15 @@
     // Translate into components and add to component graph
     GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
 
-    const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::LOGISTIC };
+    const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::LOGISTIC};
 
     const auto *const sketch_ctx = sketch.implementation().context();
 
-    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+    if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
     {
         // Add Activation Component
         auto properties = IGpuKernelComponent::Properties();
-        properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+        properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
         ArgumentPack<ITensorInfo> arguments;
         arguments.add_const_tensor(ACL_SRC, src);
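
Note: the include reordering in the GpuSigmoid hunk above (GpuWorkloadSketchImpl.h moving
below components/cl/ClComponentActivation.h) is what a case-insensitive sort produces: with
case folded, "components" sorts before "gpuworkloadsketchimpl", whereas a byte-wise sort
puts 'G' (0x47) ahead of 'c' (0x63). The same swap repeats in GpuSoftmax, GpuTanh and the
template writers further down:

    // Byte-wise (case-sensitive) order, as before this patch:
    #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
    #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"

    // Case-insensitive order, as applied throughout this patch:
    #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
    #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
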
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
index ffc4553..c87b282 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.cpp
@@ -22,13 +22,14 @@
  * SOFTWARE.
  */
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSoftmax.h"
+
 #include "arm_compute/core/Error.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h"
 
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DMaxShiftExpSum.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentLogits1DNorm.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuOperatorProperties.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 
@@ -52,7 +53,7 @@
     TensorInfo dst_info_to_validate;
 
     // Auto initialize dst tensor info
-    if(dst != nullptr)
+    if (dst != nullptr)
     {
         dst_info_to_validate = *dst;
     }
@@ -61,11 +62,12 @@
         auto_init_if_empty(dst_info_to_validate, *src->clone());
     }
     // Check components
-    if(context.gpu_language() == GpuLanguage::OpenCL)
+    if (context.gpu_language() == GpuLanguage::OpenCL)
     {
         const auto cl_compile_ctx = context.cl_compile_context();
         ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
-        const KernelProperties properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+        const KernelProperties properties =
+            IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
         TensorShape logits_sum_shape = src->tensor_shape();
         TensorInfo  logits(src->clone()->set_tensor_shape(logits_sum_shape));
@@ -86,7 +88,8 @@
         arguments_norm.add_const_tensor(ACL_SRC_1, &sum);
         arguments_norm.add_const_tensor(ACL_DST_0, &dst_info_to_validate);
 
-        ARM_COMPUTE_RETURN_ON_ERROR(ClComponentLogits1DMaxShiftExpSum::validate(properties, arguments_exp_sum, attributes));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            ClComponentLogits1DMaxShiftExpSum::validate(properties, arguments_exp_sum, attributes));
         ARM_COMPUTE_RETURN_ON_ERROR(ClComponentLogits1DNorm::validate(properties, arguments_norm, attributes));
     }
     else
@@ -105,14 +108,16 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(!src->has_valid_id() || !dst->has_valid_id());
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 4, "Only up to 4 dimensions are supported");
-    ARM_COMPUTE_RETURN_ERROR_ON(attributes.axis() < static_cast<int32_t>(-src->num_dimensions()) || static_cast<int32_t>(src->num_dimensions()) <= attributes.axis());
+    ARM_COMPUTE_RETURN_ERROR_ON(attributes.axis() < static_cast<int32_t>(-src->num_dimensions()) ||
+                                static_cast<int32_t>(src->num_dimensions()) <= attributes.axis());
 
     // Auto initialize dst tensor info
     TensorInfo dst_info_to_validate = *dst;
     auto_init_if_empty(dst_info_to_validate, *src->clone());
 
-    const size_t actual_axis   = static_cast<size_t>(wrap_around(attributes.axis(), static_cast<int32_t>(src->num_dimensions())));
-    const bool   needs_permute = actual_axis != 0;
+    const size_t actual_axis =
+        static_cast<size_t>(wrap_around(attributes.axis(), static_cast<int32_t>(src->num_dimensions())));
+    const bool needs_permute = actual_axis != 0;
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(needs_permute, "Dynamic fusion softmax on axis!=0 not supported yet.");
 
     // Perform fusion test and check if the operator meets the fusion constraints
@@ -128,17 +133,16 @@
     return is_supported_op(*sketch.gpu_context(), src, &dst_info_to_validate, attributes);
 }
 
-void GpuSoftmax::create_op(GpuWorkloadSketch &sketch,
-                           ITensorInfo       *src,
-                           ITensorInfo       *dst,
-                           const Attributes &attributes)
+void GpuSoftmax::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src, ITensorInfo *dst, const Attributes &attributes)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_LOG_PARAMS(src, dst, attributes);
     TensorShape  logits_sum_shape = src->tensor_shape();
-    ITensorInfo *logits           = sketch.implementation().create_auxiliary_tensor(src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape));
+    ITensorInfo *logits           = sketch.implementation().create_auxiliary_tensor(
+                  src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape));
     logits_sum_shape.set(0, 1);
-    ITensorInfo *sum = sketch.implementation().create_auxiliary_tensor(src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape));
+    ITensorInfo *sum = sketch.implementation().create_auxiliary_tensor(
+        src->clone()->set_id(ITensorInfo::invalid_tensor_id).set_tensor_shape(logits_sum_shape));
 
     // Auto initialize dst tensor info and the auxiliary tensor infos as well
     auto_init_if_empty(*dst, *src->clone());
@@ -151,7 +155,7 @@
     auto      &comp_graph = sketch.implementation().component_graph();
     const auto sketch_ctx = sketch.implementation().context();
 
-    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+    if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
     {
         const auto cl_compile_ctx = sketch_ctx->cl_compile_context();
         ARM_COMPUTE_UNUSED(cl_compile_ctx);
@@ -160,7 +164,7 @@
         // Add Direct Conv2d Component
         {
             auto properties = IGpuKernelComponent::Properties();
-            properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+            properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
             ArgumentPack<ITensorInfo> arguments_exp_sum;
             ArgumentPack<ITensorInfo> arguments_norm;
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
index 8240008..e5d62c9 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuSub.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuSub.h"
+
 #include "arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h"
 
 #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
@@ -32,12 +33,11 @@
 {
 namespace dynamic_fusion
 {
-Status GpuSub::validate_op(const GpuWorkloadSketch &sketch,
-                           const ITensorInfo       *lhs,
-                           const ITensorInfo       *rhs)
+Status GpuSub::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *lhs, const ITensorInfo *rhs)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
+                                                         DataType::S16, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
 
     // Set the elementwise operation to Sub then call the elementwise common validate_op
@@ -46,12 +46,11 @@
     return GpuElementwiseBinaryCommon::validate_op(sketch, lhs, rhs, common_attributes);
 }
 
-Status GpuSub::is_supported_op(const GpuWorkloadContext &context,
-                               const ITensorInfo        *lhs,
-                               const ITensorInfo        *rhs)
+Status GpuSub::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *lhs, const ITensorInfo *rhs)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8, DataType::S16, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F16, DataType::F32, DataType::U8,
+                                                         DataType::S16, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs->data_type() != rhs->data_type(), "Input tensors must be the same data type");
 
     // Set the elementwise operation to Sub then call the elementwise common is_supported_op
@@ -60,9 +59,7 @@
     return GpuElementwiseBinaryCommon::is_supported_op(context, lhs, rhs, common_attributes);
 }
 
-ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch,
-                               ITensorInfo       *lhs,
-                               ITensorInfo       *rhs)
+ITensorInfo *GpuSub::create_op(GpuWorkloadSketch &sketch, ITensorInfo *lhs, ITensorInfo *rhs)
 {
     // No need to log or validate as they'll be handled inside GpuElementwiseBinaryCommon::create_op()
     // Set the elementwise operation to Sub then call the elementwise common create_op
diff --git a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
index c00716c..bf0f274 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/GpuTanh.cpp
@@ -23,14 +23,15 @@
  */
 
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuTanh.h"
+
 #include "arm_compute/core/experimental/Types.h"
 
-#include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
-#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h"
-#include "src/core/helpers/AutoConfiguration.h"
 #include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/dynamic_fusion/sketch/ArgumentPack.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
+#include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h"
 
 namespace arm_compute
 {
@@ -40,9 +41,7 @@
 {
 namespace
 {
-Status is_supported_op_helper(const GpuWorkloadContext &context,
-                              const ITensorInfo        *src,
-                              const ITensorInfo        *dst)
+Status is_supported_op_helper(const GpuWorkloadContext &context, const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
@@ -50,20 +49,21 @@
     TensorInfo         dst_info_to_validate;
     const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
 
-    if(dst != nullptr)
+    if (dst != nullptr)
     {
         dst_info_to_validate_ptr = dst;
     }
 
     auto_init_if_empty(dst_info_to_validate, *src->clone());
 
-    const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::TANH };
+    const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH};
 
     // Check components
-    if(context.gpu_language() == GpuLanguage::OpenCL)
+    if (context.gpu_language() == GpuLanguage::OpenCL)
     {
         // Validate Activation Component
-        const auto properties = IGpuKernelComponent::Properties().stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+        const auto properties =
+            IGpuKernelComponent::Properties().stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
         ArgumentPack<ITensorInfo> arguments;
         arguments.add_const_tensor(ACL_SRC, src);
@@ -80,14 +80,12 @@
 constexpr GpuOperatorType operator_type = GpuOperatorType::Simple;
 } // namespace
 
-Status GpuTanh::is_supported_op(const GpuWorkloadContext &context,
-                                const ITensorInfo        *src)
+Status GpuTanh::is_supported_op(const GpuWorkloadContext &context, const ITensorInfo *src)
 {
     return is_supported_op_helper(context, src, nullptr);
 }
 
-Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch,
-                            const ITensorInfo       *src)
+Status GpuTanh::validate_op(const GpuWorkloadSketch &sketch, const ITensorInfo *src)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
 
@@ -112,8 +110,7 @@
     return is_supported_op_helper(*sketch.gpu_context(), src, &dst_info_to_validate);
 }
 
-ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch     &sketch,
-                                ITensorInfo           *src)
+ITensorInfo *GpuTanh::create_op(GpuWorkloadSketch &sketch, ITensorInfo *src)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_LOG_PARAMS(src);
@@ -128,15 +125,15 @@
     // Translate into components and add to component graph
     GpuKernelComponentGraph &comp_graph = sketch.implementation().component_graph();
 
-    const ClComponentActivation::Attributes act_info{ ActivationLayerInfo::ActivationFunction::TANH };
+    const ClComponentActivation::Attributes act_info{ActivationLayerInfo::ActivationFunction::TANH};
 
     const auto *const sketch_ctx = sketch.implementation().context();
 
-    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+    if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
     {
         // Add Activation Component
         auto properties = IGpuKernelComponent::Properties();
-        properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+        properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
         ArgumentPack<ITensorInfo> arguments;
         arguments.add_const_tensor(ACL_SRC, src);
diff --git a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp
index 7c087c9..d79a4c4 100644
--- a/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp
+++ b/src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.cpp
@@ -22,11 +22,12 @@
  * SOFTWARE.
  */
 #include "src/dynamic_fusion/sketch/gpu/operators/internal/GpuElementwiseBinaryCommon.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
-#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuWorkloadSketchImpl.h"
 
 namespace arm_compute
 {
@@ -38,9 +39,10 @@
 {
 void calculate_and_init_dst_if_empty(ITensorInfo *dst, const ITensorInfo *lhs, const ITensorInfo *rhs)
 {
-    if(dst->total_size() == 0U)
+    if (dst->total_size() == 0U)
     {
-        const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs);
+        const std::pair<TensorShape, ValidRegion> broadcast_pair =
+            ITensorInfo::broadcast_shape_and_valid_region(*lhs, *rhs);
         auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(broadcast_pair.first));
     }
 }
@@ -56,7 +58,7 @@
     TensorInfo         dst_info_to_validate;
     const ITensorInfo *dst_info_to_validate_ptr = &dst_info_to_validate;
 
-    if(dst != nullptr)
+    if (dst != nullptr)
     {
         dst_info_to_validate_ptr = dst;
     }
@@ -64,7 +66,7 @@
     calculate_and_init_dst_if_empty(&dst_info_to_validate, lhs, rhs);
 
     // Check components
-    if(context.gpu_language() == GpuLanguage::OpenCL)
+    if (context.gpu_language() == GpuLanguage::OpenCL)
     {
         const auto cl_compile_ctx = context.cl_compile_context();
         ARM_COMPUTE_RETURN_ERROR_ON(cl_compile_ctx == nullptr);
@@ -90,7 +92,8 @@
 GpuOperatorType operator_type = GpuOperatorType::Simple;
 } // namespace
 
-ElementwiseBinaryCommonAttributes &ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation)
+ElementwiseBinaryCommonAttributes &
+ElementwiseBinaryCommonAttributes::operation(const ElementwiseBinaryCommonAttributes::ElementwiseOp &operation)
 {
     _operation = operation;
     return *this;
@@ -157,14 +160,14 @@
 
     const auto sketch_ctx = sketch.implementation().context();
 
-    if(sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
+    if (sketch_ctx->gpu_language() == GpuLanguage::OpenCL)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(sketch_ctx->cl_compile_context());
 
         // Add ElementwiseBinary Component
         {
             auto properties = IGpuKernelComponent::Properties();
-            properties.stage(UnitWorkloadStage{ UnitWorkloadStage::Stage::Run });
+            properties.stage(UnitWorkloadStage{UnitWorkloadStage::Stage::Run});
 
             ArgumentPack<ITensorInfo> arguments;
             arguments.add_const_tensor(ACL_SRC_0, lhs);
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
index 0972b4e..775b0a0 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.cpp
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "GpuKernelVariableTable.h"
+
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/ITensorInfo.h"
+
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 
 namespace arm_compute
@@ -32,14 +34,17 @@
 {
 namespace dynamic_fusion
 {
-void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias)
+void GpuKernelVariableTable::declare_variable(const GpuKernelComponentGroup &comp_group,
+                                              const ITensorInfo             *tensor,
+                                              GpuKernelArgumentInfo          argument_info,
+                                              const std::string             &alias)
 {
     ARM_COMPUTE_ERROR_ON_MSG(!tensor->has_valid_id(), "Tensor info with valid id expected");
 
     // Do not re-declare if the variable associated with the tensor has already been declared
     auto it = _vars.find(tensor->id());
 
-    if(it != _vars.end())
+    if (it != _vars.end())
     {
         ARM_COMPUTE_ERROR_ON(!(it->second.kernel_argument_info == argument_info));
         return;
@@ -47,14 +52,12 @@
 
     const auto target = comp_group.get_tile_for_tensor(tensor);
 
-    if(target != tensor)
+    if (target != tensor)
     {
         // If the tensor uses a shared tile, don't declare another variable.
         it = _vars.find(target->id());
 
-        ARM_COMPUTE_ERROR_ON_MSG(
-            it == _vars.end(),
-            "The variable used for this tensor must have been declared.");
+        ARM_COMPUTE_ERROR_ON_MSG(it == _vars.end(), "The variable used for this tensor must have been declared.");
 
         _vars[tensor->id()] = it->second;
     }
@@ -64,7 +67,7 @@
         std::stringstream ss;
         ss << alias << "_t" << abs(tensor->id());
         const auto     uniq_name = ss.str();
-        TensorVariable var{ tensor->id(), uniq_name, argument_info };
+        TensorVariable var{tensor->id(), uniq_name, argument_info};
 
         _vars.emplace(tensor->id(), var);
     }
@@ -76,12 +79,13 @@
     return var;
 }
 
-GpuKernelVariableTable::VariableList GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const
+GpuKernelVariableTable::VariableList
+GpuKernelVariableTable::get_variable_list(const std::vector<const ITensorInfo *> &tensors) const
 {
     VariableList vars{};
-    for(const auto &tensor : tensors)
+    for (const auto &tensor : tensors)
     {
-        if(!tensor->has_valid_id())
+        if (!tensor->has_valid_id())
         {
             continue;
         }
@@ -90,23 +94,19 @@
     return vars;
 }
 
-TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var)
-    : value{ var.uniq_name }
+TagVal::TagVal(const GpuKernelVariableTable::TensorVariable &var) : value{var.uniq_name}
 {
 }
 
-TagVal::TagVal(const std::string &val)
-    : value{ val }
+TagVal::TagVal(const std::string &val) : value{val}
 {
 }
 
-TagVal::TagVal(const char *val)
-    : value{ std::string(val) }
+TagVal::TagVal(const char *val) : value{std::string(val)}
 {
 }
 
-TagVal::TagVal(const DataType &data_type)
-    : value{ get_cl_type_from_data_type(data_type) }
+TagVal::TagVal(const DataType &data_type) : value{get_cl_type_from_data_type(data_type)}
 {
 }
 } // namespace dynamic_fusion
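
Note: the TagVal constructors above show how constructor initializers are now packed,
consistent with clang-format 14's PackConstructorInitializers: NextLine (inferred; the
shipped configuration may differ):

    // 1) The whole definition fits within the column limit: initializers stay on
    //    the signature line.
    TagVal::TagVal(const char *val) : value{std::string(val)}
    {
    }

    // 2) It does not fit there, but all initializers fit on one continuation line
    //    (cf. the ClTemplateCast constructor below):
    ClTemplateCast::ClTemplateCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
        : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}
    {
    }

If neither fits, each initializer goes on its own line, as in the ClTemplateDepthwiseConv2d
and ClTemplateDirectConv2d constructors further down.
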
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
index a49d38e..c17f131 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_GPUKERNELVARIABLETABLE
 
 #include "arm_compute/core/ITensorInfo.h"
+
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelArgument.h"
 #include "support/AclRequires.h"
 #include "support/StringSupport.h"
@@ -55,11 +56,11 @@
     struct TensorVariable
     {
     public:
-        TensorVariable()                       = default;
-        TensorVariable(const TensorVariable &) = default;
+        TensorVariable()                                        = default;
+        TensorVariable(const TensorVariable &)                  = default;
         TensorVariable       &operator=(const TensorVariable &) = default;
-        ITensorInfo::Id       id{ ITensorInfo::invalid_tensor_id };
-        std::string           uniq_name{ "empty" }; // Unique name, also the final variable name used in the built code
+        ITensorInfo::Id       id{ITensorInfo::invalid_tensor_id};
+        std::string           uniq_name{"empty"}; // Unique name, also the final variable name used in the built code
         GpuKernelArgumentInfo kernel_argument_info{};
         bool                  has_valid_id() const
         {
@@ -76,7 +77,10 @@
      * @param[in] argument_info Kernel argument information
      * @param[in] alias         Alias for the variable. Will be used as part of the variable name
      */
-    void declare_variable(const GpuKernelComponentGroup &comp_group, const ITensorInfo *tensor, GpuKernelArgumentInfo argument_info, const std::string &alias = "unnamed");
+    void declare_variable(const GpuKernelComponentGroup &comp_group,
+                          const ITensorInfo             *tensor,
+                          GpuKernelArgumentInfo          argument_info,
+                          const std::string             &alias = "unnamed");
     /** Get the @ref TensorVariable associated with @p tensor
      *
      * @param[in] tensor Tensor info to be queried
@@ -106,8 +110,7 @@
     TagVal(const GpuKernelVariableTable::TensorVariable &var);
     /** Construct a @ref TagVal from an integral type */
     template <typename T, ARM_COMPUTE_REQUIRES_TA(std::is_integral<T>::value)>
-    TagVal(T val)
-        : value{ support::cpp11::to_string(val) }
+    TagVal(T val) : value{support::cpp11::to_string(val)}
     {
     }
     /** Construct a @ref TagVal from a string */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
index 4a1fb14..9d0b4f5 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/CLCompileContext.h"
 #include "arm_compute/core/ITensorInfo.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/dynamic_fusion/sketch/ArgumentPack.h"
 #include "src/dynamic_fusion/sketch/gpu/components/Types.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
@@ -57,8 +58,7 @@
      * @param[in] id      Component id
      * @param[in] tensors Tensor arguments to the components
      */
-    IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
-        : _id{ id }, _tensors{ tensors }
+    IGpuTemplateComponentWriter(ComponentId id, const ArgumentPack<ITensorInfo> &tensors) : _id{id}, _tensors{tensors}
     {
     }
     /** Destructor */
@@ -112,7 +112,7 @@
     /** Generate the header list used in the component */
     virtual std::set<std::string> get_headers_list() const
     {
-        return std::set<std::string> {};
+        return std::set<std::string>{};
     }
     /** Generate the execution window for the component */
     virtual Window get_window() const
@@ -131,7 +131,7 @@
     }
 
 private:
-    ComponentId               _id{ -1 };
+    ComponentId               _id{-1};
     ArgumentPack<ITensorInfo> _tensors{};
 };
 } // namespace dynamic_fusion
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
index 3c7c843..c165fb5 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "support/StringSupport.h"
@@ -39,10 +40,7 @@
 ClTemplateActivation::ClTemplateActivation(ComponentId                      id,
                                            const ArgumentPack<ITensorInfo> &tensors,
                                            const Attributes                &attributes)
-    : IGpuTemplateComponentWriter{ id, tensors },
-      _src{},
-      _dst{},
-      _attributes{ attributes }
+    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC);
     _dst = this->tensors().get_const_tensor(TensorType::ACL_DST);
@@ -62,7 +60,7 @@
     code = R"_(
 //------------------ START KERNEL {{meta_kernel_id}} ---------------------
 )_";
-    if(is_root)
+    if (is_root)
     {
         code += R"_(
 // IN(src)              {{src}}
@@ -104,17 +102,11 @@
 
 void ClTemplateActivation::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
 {
-    vtable.declare_variable(
-        comp_group,
-        _src,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "src");
+    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "src");
 
-    vtable.declare_variable(
-        comp_group,
-        _dst,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "dst");
+    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "dst");
 }
 
 TagLUT ClTemplateActivation::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
@@ -173,7 +165,7 @@
 
 std::set<std::string> ClTemplateActivation::get_headers_list() const
 {
-    return std::set<std::string>{ "helpers.h", "tile_helpers.h", "activation_float_helpers.h" };
+    return std::set<std::string>{"helpers.h", "tile_helpers.h", "activation_float_helpers.h"};
 }
 
 Window ClTemplateActivation::get_window() const
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h
index ec78cf6..88ee370 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateActivation.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentActivation.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
index 4956879..0da3a73 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateCast.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 
@@ -35,7 +36,7 @@
 namespace dynamic_fusion
 {
 ClTemplateCast::ClTemplateCast(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes)
-    : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}, _attributes{ attributes }
+    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -62,7 +63,7 @@
 //------------------ START KERNEL {{meta_kernel_id}} CAST ---------------------
 )_";
 
-    if(is_root)
+    if (is_root)
     {
         code += R"_(
 // IN_0(src)            {{src}}
@@ -82,14 +83,15 @@
     {
 )_";
 
-    if(kernel_name == "cast_down" && is_data_type_quantized(_src->data_type()))
+    if (kernel_name == "cast_down" && is_data_type_quantized(_src->data_type()))
     {
         code += R"_(
     {{tmp}}[m0].v ^= (VEC_DATA_TYPE({{DATA_TYPE_IN}}, N0))0x80;
 )_";
     }
 
-    if(kernel_name == "cast_down" && (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
+    if (kernel_name == "cast_down" &&
+        (is_data_type_float(_src->data_type()) || _attributes.convert_policy() == ConvertPolicy::SATURATE))
     {
         code += R"_(
     {{dst}}[m0].v = CONVERT_SAT({{tmp}}[m0].v, VEC_DATA_TYPE({{DATA_TYPE_OUT}}, N0));
@@ -106,7 +108,7 @@
     })
 )_";
 
-    if(is_root)
+    if (is_root)
     {
         code += R"_(
     LOOP_UNROLLING(int, i, 0, 1, M0,
@@ -128,17 +130,11 @@
 
 void ClTemplateCast::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
 {
-    vtable.declare_variable(
-        comp_group,
-        _src,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "src");
+    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "src");
 
-    vtable.declare_variable(
-        comp_group,
-        _dst,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "dst");
+    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "dst");
 }
 
 TagLUT ClTemplateCast::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
@@ -199,7 +195,7 @@
 
 std::set<std::string> ClTemplateCast::get_headers_list() const
 {
-    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
 }
 
 Window ClTemplateCast::get_window() const
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
index ab7cc9f..8380620 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.cpp
@@ -36,17 +36,17 @@
                                                      const ArgumentPack<ITensorInfo> &tensors,
                                                      const Attributes                &attributes,
                                                      const Settings                  &settings)
-    : IGpuTemplateComponentWriter{ id, tensors },
+    : IGpuTemplateComponentWriter{id, tensors},
       _src{},
       _weight{},
       _bias{},
       _dst{},
-      _attributes{ attributes },
-      _settings{ settings }
+      _attributes{attributes},
+      _settings{settings}
 {
     _src    = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
-    if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
+    if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
     {
         _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
     }
@@ -71,7 +71,7 @@
 // IN_1(wei)            {{weight}}
 )_";
 
-    if(_bias != nullptr && _bias->has_valid_id())
+    if (_bias != nullptr && _bias->has_valid_id())
     {
         code += R"_(
 // IN_1(bia)            {{bias}}
@@ -113,7 +113,7 @@
     })
 )_";
 
-    if(_weight->dimension(height_idx) < 5)
+    if (_weight->dimension(height_idx) < 5)
     {
         code += R"_(
     LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
@@ -147,7 +147,7 @@
             {
 )_";
 
-    if(!_settings.is_fma_available())
+    if (!_settings.is_fma_available())
     {
         code += R"_(
                 {{dst}}[m0].v += a[xk + m0].v * b[xk].v;
@@ -166,14 +166,14 @@
     }
 )_";
 
-    if(_weight->dimension(height_idx) < 5)
+    if (_weight->dimension(height_idx) < 5)
     {
         code += R"_(
     )
 )_";
     }
 
-    if(_bias && _bias->has_valid_id())
+    if (_bias && _bias->has_valid_id())
     {
         code += R"_(
         TILE({{BIA_DATA_TYPE}}, 1, N0, {{bias}});
@@ -198,44 +198,31 @@
     return code;
 }
 
-void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+void ClTemplateDepthwiseConv2d::declare_variables(GpuKernelVariableTable &vtable,
+                                                  const ComponentGroup   &comp_group) const
 {
-    const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() ?
-                                                       GpuKernelArgumentInfo::Type::Tensor_4D_t_Image :
-                                                       GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
+    const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image()
+                                                       ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
+                                                       : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
 
-    vtable.declare_variable(
-        comp_group,
-        _src,
-        GpuKernelArgumentInfo(input_type),
-        "src");
+    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(input_type), "src");
 
-    const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image() ?
-                                                        GpuKernelArgumentInfo::Type::Tensor_4D_t_Image :
-                                                        GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
+    const GpuKernelArgumentInfo::Type weight_type = _settings.export_weights_to_cl_image()
+                                                        ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
+                                                        : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
 
-    vtable.declare_variable(
-        comp_group,
-        _weight,
-        GpuKernelArgumentInfo(weight_type),
-        "weight");
+    vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight");
 
-    if(_bias != nullptr && _bias->has_valid_id()) // optional bias
+    if (_bias != nullptr && _bias->has_valid_id()) // optional bias
     {
-        vtable.declare_variable(
-            comp_group,
-            _bias,
-            GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
-            "bias");
+        vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias");
     }
-    vtable.declare_variable(
-        comp_group,
-        _dst,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "dst");
+    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "dst");
 }
 
-TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+TagLUT ClTemplateDepthwiseConv2d::get_tag_lut(const GpuKernelVariableTable &vtable,
+                                              const ComponentGroup         &comp_group) const
 {
     TagLUT lut{};
 
@@ -243,7 +230,7 @@
     lut["src"]    = vtable.get_variable(_src);
     lut["weight"] = vtable.get_variable(_weight);
 
-    if(_bias != nullptr && _bias->has_valid_id()) // optional bias
+    if (_bias != nullptr && _bias->has_valid_id()) // optional bias
     {
         lut["bias"]          = vtable.get_variable(_bias);
         lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
@@ -259,7 +246,7 @@
     lut["SRC_DATA_TYPE"]  = _src->data_type();
     lut["WEI_DATA_TYPE"]  = _weight->data_type();
 
-    switch(vtable.get_variable(_src).kernel_argument_info.type)
+    switch (vtable.get_variable(_src).kernel_argument_info.type)
     {
         case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
         case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
@@ -271,7 +258,7 @@
             break;
     }
 
-    switch(vtable.get_variable(_weight).kernel_argument_info.type)
+    switch (vtable.get_variable(_weight).kernel_argument_info.type)
     {
         case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
         case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
@@ -318,7 +305,7 @@
 
     CLBuildOptions build_opts{};
 
-    if(_settings.fast_relaxed_math())
+    if (_settings.fast_relaxed_math())
     {
         build_opts.add_option("-cl-fast-relaxed-math");
     }
@@ -361,7 +348,7 @@
 
 std::set<std::string> ClTemplateDepthwiseConv2d::get_headers_list() const
 {
-    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
 }
 
 Window ClTemplateDepthwiseConv2d::get_window() const
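
Note: the declare_variables hunk above also changes how long conditional expressions wrap:
'?' and ':' now lead their continuation lines instead of trailing the previous one,
consistent with BreakBeforeTernaryOperators: true (the LLVM default; inferred, as the
configuration is not shipped):

    // Before: operators trail the broken lines
    const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image() ?
                                                       GpuKernelArgumentInfo::Type::Tensor_4D_t_Image :
                                                       GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;

    // After: operators lead the continuation lines
    const GpuKernelArgumentInfo::Type input_type = _settings.export_input_to_cl_image()
                                                       ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
                                                       : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
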
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h
index 84b689e..5d04c68 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDepthwiseConv2d.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEDEPTHWISECONV2D
 
 #include "arm_compute/dynamic_fusion/sketch/attributes/DepthwiseConv2dAttributes.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDepthwiseConv2d.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
 
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
index 3322487..f6a7a58 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.cpp
@@ -23,14 +23,13 @@
  */
 #include "ClTemplateDirectConv2d.h"
 
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
-
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
-#include "src/core/helpers/WindowHelpers.h"
 
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -43,17 +42,17 @@
                                                const ArgumentPack<ITensorInfo> &tensors,
                                                const Attributes                &attributes,
                                                const Settings                  &settings)
-    : IGpuTemplateComponentWriter{ id, tensors },
+    : IGpuTemplateComponentWriter{id, tensors},
       _src{},
       _weight{},
       _bias{},
       _dst{},
-      _attributes{ attributes },
-      _settings{ settings }
+      _attributes{attributes},
+      _settings{settings}
 {
     _src    = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _weight = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
-    if(this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
+    if (this->tensors().get_const_tensor(TensorType::ACL_SRC_2))
     {
         _bias = this->tensors().get_const_tensor(TensorType::ACL_SRC_2);
     }
@@ -79,7 +78,7 @@
 // IN_0(src)            {{src}}
 // IN_1(wei)            {{weight}}
 )_";
-    if(_bias && _bias->has_valid_id())
+    if (_bias && _bias->has_valid_id())
     {
         code += R"_(
 // IN_1(bia)            {{bias}}
@@ -161,7 +160,7 @@
         }
 )_";
 
-    if(leftover_loop)
+    if (leftover_loop)
     {
         code += R"_(
         for(; ck < _ISRC_CHANNELS; ++ck)
@@ -186,9 +185,9 @@
             T_MMUL({{SRC_DATA_TYPE}}, {{WEI_DATA_TYPE}}, {{ACC_DATA_TYPE}}, M0, N0, 1, NT, T, a, b, {{dst}});
         }
     )_";
-}
+    }
 
-code += R"_(
+    code += R"_(
 #undef _I_WEI_WIDTH
 #undef _I_WEI_HEIGHT
 #undef _ISRC_WIDTH
@@ -202,7 +201,7 @@
     }
 )_";
 
-    if(_bias && _bias->has_valid_id())
+    if (_bias && _bias->has_valid_id())
     {
         code += R"_(
         TILE({{BIA_DATA_TYPE}}, 1, N0, bias0);
@@ -211,9 +210,9 @@
 
         T_ELTWISE_BROADCAST_ADD_X({{ACC_DATA_TYPE}}, M0, N0, {{dst}}, bias0, {{dst}});
     )_";
-}
+    }
 
-code += R"_(
+    code += R"_(
     LOOP_UNROLLING(int, i, 0, 1, M0,
     {
         g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{DST_WIDTH}} * {{DST_HEIGHT}}) - 1);
@@ -227,32 +226,19 @@
 
 void ClTemplateDirectConv2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
 {
-    vtable.declare_variable(
-        comp_group,
-        _src,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "src");
+    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "src");
 
-    const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image() ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
-    vtable.declare_variable(
-        comp_group,
-        _weight,
-        GpuKernelArgumentInfo(weight_type),
-        "weight");
+    const GpuKernelArgumentInfo::Type weight_type = _settings.export_to_cl_image()
+                                                        ? GpuKernelArgumentInfo::Type::Tensor_4D_t_Image
+                                                        : GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer;
+    vtable.declare_variable(comp_group, _weight, GpuKernelArgumentInfo(weight_type), "weight");
 
-    if(_bias && _bias->has_valid_id()) // optional bias
+    if (_bias && _bias->has_valid_id()) // optional bias
     {
-        vtable.declare_variable(
-            comp_group,
-            _bias,
-            GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector),
-            "bias");
+        vtable.declare_variable(comp_group, _bias, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Vector), "bias");
     }
-    vtable.declare_variable(
-        comp_group,
-        _dst,
-        GpuKernelArgumentInfo(common_tensor_type),
-        "dst");
+    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst");
 }
 
 TagLUT ClTemplateDirectConv2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
@@ -262,7 +248,7 @@
     lut["src"]    = vtable.get_variable(_src);
     lut["weight"] = vtable.get_variable(_weight);
 
-    if(_bias && _bias->has_valid_id()) // optional bias
+    if (_bias && _bias->has_valid_id()) // optional bias
     {
         lut["bias"]          = vtable.get_variable(_bias);
         lut["BIA_DATA_TYPE"] = get_cl_type_from_data_type(_bias->data_type());
@@ -279,34 +265,34 @@
     lut["WEI_DATA_TYPE"]  = _weight->data_type();
 
     lut["SRC_TENSOR_TYPE"] = "BUFFER";
-    switch(vtable.get_variable(_weight).kernel_argument_info.type)
+    switch (vtable.get_variable(_weight).kernel_argument_info.type)
     {
         case GpuKernelArgumentInfo::Type::Image_Export_To_ClImage2D:
         case GpuKernelArgumentInfo::Type::Image_3D_Export_To_ClImage2D:
         case GpuKernelArgumentInfo::Type::Tensor_4D_t_Image:
-    {
-        lut["WEI_TENSOR_TYPE"] = "IMAGE";
-        break;
-    }
+        {
+            lut["WEI_TENSOR_TYPE"] = "IMAGE";
+            break;
+        }
         default:
-    {
-        lut["WEI_TENSOR_TYPE"] = "BUFFER";
-        break;
+        {
+            lut["WEI_TENSOR_TYPE"] = "BUFFER";
+            break;
+        }
     }
-    }
-    const auto width_idx  = 1;
-    const auto height_idx = 2;
+    const auto width_idx   = 1;
+    const auto height_idx  = 2;
     const auto channel_idx = 0;
 
-    lut["SRC_WIDTH"] = _src->dimension(width_idx);
-    lut["SRC_HEIGHT"] = _src->dimension(height_idx);
+    lut["SRC_WIDTH"]    = _src->dimension(width_idx);
+    lut["SRC_HEIGHT"]   = _src->dimension(height_idx);
     lut["SRC_CHANNELS"] = _src->dimension(channel_idx);
 
-    lut["WEI_WIDTH"]      = _weight->dimension(width_idx);
-    lut["WEI_HEIGHT"]     = _weight->dimension(height_idx);
+    lut["WEI_WIDTH"]  = _weight->dimension(width_idx);
+    lut["WEI_HEIGHT"] = _weight->dimension(height_idx);
 
-    lut["DST_WIDTH"] = _dst->dimension(width_idx);
-    lut["DST_HEIGHT"] = _dst->dimension(height_idx);
+    lut["DST_WIDTH"]    = _dst->dimension(width_idx);
+    lut["DST_HEIGHT"]   = _dst->dimension(height_idx);
     lut["DST_CHANNELS"] = _dst->dimension(channel_idx);
 
     lut["STRIDE_X"] = _attributes.stride().x();
@@ -324,14 +310,14 @@
 {
     const unsigned int channel_idx = get_data_layout_dimension_index(_src->data_layout(), DataLayoutDimension::CHANNEL);
 
-    const auto         root_window      = comp_group.get_root_component()->template_writer()->get_window();
-    const unsigned int n0               = root_window.x().step();
-    const unsigned int m0               = root_window.y().step();
-    const unsigned int k0               = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
+    const auto         root_window = comp_group.get_root_component()->template_writer()->get_window();
+    const unsigned int n0          = root_window.x().step();
+    const unsigned int m0          = root_window.y().step();
+    const unsigned int k0 = adjust_vec_size(_settings.direct_conv_descriptor().k0, _src->dimension(channel_idx));
     const unsigned int partial_store_n0 = _dst->dimension(0) % n0;
 
     CLBuildOptions build_opts{};
-    if(_settings.fast_relaxed_math())
+    if (_settings.fast_relaxed_math())
     {
         build_opts.add_option("-cl-fast-relaxed-math");
     }
@@ -379,7 +365,7 @@
 
 std::set<std::string> ClTemplateDirectConv2d::get_headers_list() const
 {
-    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
 }
 
 Window ClTemplateDirectConv2d::get_window() const
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
index 8988d3c..03c8cd2 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateDirectConv2d.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/Conv2dAttributes.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
index c0481ae..78bff3c 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.cpp
@@ -23,14 +23,13 @@
  */
 #include "ClTemplateElementwiseBinary.h"
 
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
-
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
-#include "src/core/helpers/WindowHelpers.h"
 
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -44,11 +43,7 @@
 ClTemplateElementwiseBinary::ClTemplateElementwiseBinary(ComponentId                      id,
                                                          const ArgumentPack<ITensorInfo> &tensors,
                                                          const Attributes                &attributes)
-    : IGpuTemplateComponentWriter{ id, tensors },
-      _lhs{},
-      _rhs{},
-      _dst{},
-      _attributes{ attributes }
+    : IGpuTemplateComponentWriter{id, tensors}, _lhs{}, _rhs{}, _dst{}, _attributes{attributes}
 {
     _lhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _rhs = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
@@ -69,67 +64,67 @@
     const bool  is_rhs_input = comp_group.is_input_tensor(_rhs);
 
     code =
-R"_(
+        R"_(
     //------------------ START KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} ---------------------
 )_";
 
-    if(is_root)
+    if (is_root)
     {
         code +=
-R"_(
+            R"_(
     TILE(uint, M0, 1, g_dst_indirect_y);
 )_";
     }
 
-    if(is_lhs_input)
+    if (is_lhs_input)
     {
         code +=
-R"_(
+            R"_(
     TILE({{DATA_TYPE}}, {{lhs_m0}}, N0, {{lhs}});
 )_";
     }
 
-    if(is_rhs_input)
+    if (is_rhs_input)
     {
         code +=
-R"_(
+            R"_(
     TILE({{DATA_TYPE}}, {{rhs_m0}}, N0, {{rhs}});
 )_";
     }
 
     code +=
-R"_(
+        R"_(
     {
 )_";
 
-    if(is_lhs_input)
+    if (is_lhs_input)
     {
         code +=
-R"_(
+            R"_(
         {{lhs}}_offset_first_element_in_bytes += g_ind_2 * {{lhs}}_stride_w;
         T_LOAD({{DATA_TYPE}}, {{lhs_m0}}, {{lhs_n0}}, BUFFER, {{lhs}}, {{lhs_start_ind_0}}, {{lhs_start_ind_1}}, 1, {{lhs}}_stride_y, {{lhs}});
 )_";
     }
 
-    if(is_rhs_input)
+    if (is_rhs_input)
     {
         code +=
-R"_(
+            R"_(
         {{rhs}}_offset_first_element_in_bytes += g_ind_2 * {{rhs}}_stride_w;
         T_LOAD({{DATA_TYPE}}, {{rhs_m0}}, {{rhs_n0}}, BUFFER, {{rhs}}, {{rhs_start_ind_0}}, {{rhs_start_ind_1}}, 1, {{rhs}}_stride_y, {{rhs}});
 )_";
     }
 
     code +=
-R"_(
+        R"_(
         T_ELTWISE_{{BROADCAST_OP}}{{ELTWISE_OP}}({{DATA_TYPE}}, M0, N0, {{lhs}}, {{rhs}}, {{dst}});
 )_";
 
-    if(is_root)
+    if (is_root)
     {
         // Calculate the destination indirect Y
         code +=
-R"_(
+            R"_(
         LOOP_UNROLLING(int, i, 0, 1, M0,
         {
             g_dst_indirect_y[i].v = (uint)min(g_ind_1 + i, (int)({{arg_dst}}_w * {{arg_dst}}_h) - 1);
@@ -139,7 +134,7 @@
     }
 
     code +=
-R"_(
+        R"_(
     }
     //------------------ END KERNEL {{meta_kernel_id}} {{ELTWISE_OP}} ---------------------
 )_";
@@ -147,28 +142,18 @@
     return code;
 }
 
-void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+void ClTemplateElementwiseBinary::declare_variables(GpuKernelVariableTable &vtable,
+                                                    const ComponentGroup   &comp_group) const
 {
-    vtable.declare_variable(
-        comp_group,
-        _lhs,
-        GpuKernelArgumentInfo(common_tensor_type),
-        "lhs");
+    vtable.declare_variable(comp_group, _lhs, GpuKernelArgumentInfo(common_tensor_type), "lhs");
 
-    vtable.declare_variable(
-        comp_group,
-        _rhs,
-        GpuKernelArgumentInfo(common_tensor_type),
-        "rhs");
+    vtable.declare_variable(comp_group, _rhs, GpuKernelArgumentInfo(common_tensor_type), "rhs");
 
-    vtable.declare_variable(
-        comp_group,
-        _dst,
-        GpuKernelArgumentInfo(common_tensor_type),
-        "dst");
+    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst");
 }
 
-TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+TagLUT ClTemplateElementwiseBinary::get_tag_lut(const GpuKernelVariableTable &vtable,
+                                                const ComponentGroup         &comp_group) const
 {
     TagLUT lut{};
 
@@ -182,7 +167,7 @@
     lut["dst"]     = vtable.get_variable(_dst);
     lut["arg_dst"] = vtable.get_variable(comp_group.get_any_dst_tensor());
 
-    switch(_attributes.operation())
+    switch (_attributes.operation())
     {
         case Attributes::ElementwiseOp::Add:
             lut["ELTWISE_OP"] = "ADD";
@@ -197,10 +182,10 @@
             ARM_COMPUTE_ERROR("Arithmetic Operation not supported");
     }
 
-    ARM_COMPUTE_ERROR_ON(
-        comp_group.is_intermediate_tensor(_lhs) && detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0));
-    ARM_COMPUTE_ERROR_ON(
-        comp_group.is_intermediate_tensor(_rhs) && detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0));
+    ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_lhs) &&
+                         detail::have_different_dimensions(_lhs->tensor_shape(), _dst->tensor_shape(), 0));
+    ARM_COMPUTE_ERROR_ON(comp_group.is_intermediate_tensor(_rhs) &&
+                         detail::have_different_dimensions(_rhs->tensor_shape(), _dst->tensor_shape(), 0));
 
     // Set broadcast parameters
     // PRE: All tensors are broadcast-compatible
@@ -228,9 +213,7 @@
     lut["rhs_m0"]          = (rhs_broadcast_yz) ? "1" : "M0";
     lut["rhs_start_ind_1"] = (rhs_broadcast_yz) ? "0" : "g_ind_1";
 
-    lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" :
-                          (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" :
-                                               "";
+    lut["BROADCAST_OP"] = (lhs_broadcast_yz) ? "BROADCAST_LHS_X_" : (rhs_broadcast_yz) ? "BROADCAST_RHS_X_" : "";
 
     return lut;
 }
@@ -268,7 +251,7 @@
 
 std::set<std::string> ClTemplateElementwiseBinary::get_headers_list() const
 {
-    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
 }
 
 Window ClTemplateElementwiseBinary::get_window() const
@@ -279,8 +262,9 @@
     // Collapse Dim 1 (W) and Dim 2 (H) together, leave Dim 0 (C) and upper dimensions unchanged
     // This is in line with the collapsing convention used by operators like Conv2d
     output_shape.collapse(2U, 1U);
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
-    Window             win                               = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / _dst->element_size(), _dst->dimension(0));
+    Window win = calculate_max_window(output_shape, Steps(num_elems_processed_per_iteration));
 
     return win;
 }
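
Reviewer note on ClTemplateElementwiseBinary::get_tag_lut above (sketch only, not part of this change): a Y/Z-broadcast operand contributes a single row loaded at row 0, a full operand contributes M0 rows starting at g_ind_1, and BROADCAST_OP picks the macro suffix. The same selection in standalone form, with hypothetical names:

    #include <string>

    struct OperandTags
    {
        std::string m0;          // rows the T_LOAD reads for this operand
        std::string start_ind_1; // first row index in the source tensor
    };

    OperandTags tags_for(bool broadcast_yz)
    {
        return broadcast_yz ? OperandTags{"1", "0"} : OperandTags{"M0", "g_ind_1"};
    }

    std::string broadcast_op(bool lhs_bcast_yz, bool rhs_bcast_yz)
    {
        // Mirrors the ternary written into lut["BROADCAST_OP"] above.
        return lhs_bcast_yz ? "BROADCAST_LHS_X_" : rhs_bcast_yz ? "BROADCAST_RHS_X_" : "";
    }
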
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h
index 8cca954..991c0ec 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateElementwiseBinary.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATEELEMENTWISEBINARY
 
 #include "arm_compute/core/experimental/Types.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentElementwiseBinary.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
@@ -48,9 +49,7 @@
      * @param[in] tensors    Tensor arguments to the components
      * @param[in] attributes Component attributes
      */
-    ClTemplateElementwiseBinary(ComponentId                      id,
-                                const ArgumentPack<ITensorInfo> &tensors,
-                                const Attributes                &attributes);
+    ClTemplateElementwiseBinary(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
     /** Prevent instances of this class from being copy constructed */
     ClTemplateElementwiseBinary(const ClTemplateElementwiseBinary &elementwise) = delete;
     /** Prevent instances of this class from being copied */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
index a8d8d32..522c33a 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.cpp
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "support/StringSupport.h"
@@ -38,16 +39,12 @@
 {
 namespace
 {
-    constexpr unsigned int serial_vector_size = 8;
+constexpr unsigned int serial_vector_size = 8;
 } // namespace
 ClTemplateLogits1DMaxShiftExpSum::ClTemplateLogits1DMaxShiftExpSum(ComponentId                      id,
                                                                    const ArgumentPack<ITensorInfo> &tensors,
                                                                    const Attributes                &attributes)
-    : IGpuTemplateComponentWriter{ id, tensors },
-      _src{},
-      _sum{},
-      _dst{},
-      _attributes{ attributes }
+    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _sum = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -79,7 +76,7 @@
 
     const bool beta_defined = (_attributes.beta() != 1.f);
 
-    if(beta_defined)
+    if (beta_defined)
     {
         code += R"_(
     VEC_TYPE beta = (VEC_TYPE){{BETA}};
@@ -91,7 +88,7 @@
     const unsigned int     vector_size         = adjust_vec_size(_serial_vector_size, reduction_dim_size);
     const bool             non_multiple_of_n0  = ((reduction_dim_size % vector_size) != 0);
 
-    if(non_multiple_of_n0)
+    if (non_multiple_of_n0)
     {
         code += R"_(
     VEC_TYPE data    = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr);
@@ -111,19 +108,19 @@
     VEC_TYPE sum1D = 0;
 )_";
 
-    if(non_multiple_of_n0)
+    if (non_multiple_of_n0)
     {
         code += R"_(
     data -= max_val;
 )_";
-        if(beta_defined)
+        if (beta_defined)
         {
             code += R"_(
     data *= beta;
 )_";
         }
 
-        if(_attributes.is_log_softmax())
+        if (_attributes.is_log_softmax())
         {
             code += R"_(
     VSTORE_PARTIAL(N0, PARTIAL_N0)
@@ -153,14 +150,14 @@
         data -= max_val;
 )_";
 
-    if(beta_defined)
+    if (beta_defined)
     {
         code += R"_(
     data *= beta;
 )_";
     }
 
-    if(_attributes.is_log_softmax())
+    if (_attributes.is_log_softmax())
     {
         code += R"_(
     VSTORE(N0)
@@ -191,28 +188,18 @@
     return code;
 }
 
-void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+void ClTemplateLogits1DMaxShiftExpSum::declare_variables(GpuKernelVariableTable &vtable,
+                                                         const ComponentGroup   &comp_group) const
 {
-    vtable.declare_variable(
-        comp_group,
-        _src,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
-        "src");
+    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src");
 
-    vtable.declare_variable(
-        comp_group,
-        _sum,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
-        "sum");
+    vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum");
 
-    vtable.declare_variable(
-        comp_group,
-        _dst,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
-        "dst");
+    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst");
 }
 
-TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
+TagLUT ClTemplateLogits1DMaxShiftExpSum::get_tag_lut(const GpuKernelVariableTable &vtable,
+                                                     const ComponentGroup         &comp_group) const
 {
     ARM_COMPUTE_UNUSED(comp_group);
 
@@ -241,8 +228,8 @@
     ARM_COMPUTE_UNUSED(comp_group);
     CLBuildOptions build_opts{};
 
-    const unsigned int     reduction_dim_size = _src->dimension(0);
-    const unsigned int     vector_size        = adjust_vec_size(serial_vector_size, reduction_dim_size);
+    const unsigned int reduction_dim_size = _src->dimension(0);
+    const unsigned int vector_size        = adjust_vec_size(serial_vector_size, reduction_dim_size);
 
     build_opts.add_option("-DN0=" + support::cpp11::to_string(vector_size));
     build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string((reduction_dim_size % vector_size)));
@@ -264,7 +251,7 @@
 
 std::set<std::string> ClTemplateLogits1DMaxShiftExpSum::get_headers_list() const
 {
-    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
 }
 
 Window ClTemplateLogits1DMaxShiftExpSum::get_window() const
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h
index 5d232c0..ac9ddaa 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DMaxShiftExpSum.h
@@ -46,7 +46,9 @@
      * @param[in] tensors    Tensor arguments to the components
      * @param[in] attributes Component attributes
      */
-    ClTemplateLogits1DMaxShiftExpSum(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const Attributes &attributes);
+    ClTemplateLogits1DMaxShiftExpSum(ComponentId                      id,
+                                     const ArgumentPack<ITensorInfo> &tensors,
+                                     const Attributes                &attributes);
     /** Prevent instances of this class from being copy constructed */
     ClTemplateLogits1DMaxShiftExpSum(const ClTemplateLogits1DMaxShiftExpSum &) = delete;
     /** Prevent instances of this class from being copied */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp
index 056e570..7d7c3e6 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.cpp
@@ -25,6 +25,7 @@
 #include "src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateLogits1DNorm.h"
 
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "support/StringSupport.h"
@@ -38,11 +39,7 @@
 ClTemplateLogits1DNorm::ClTemplateLogits1DNorm(ComponentId                      id,
                                                const ArgumentPack<ITensorInfo> &tensors,
                                                const Attributes                &attributes)
-    : IGpuTemplateComponentWriter{ id, tensors },
-      _src{},
-      _sum{},
-      _dst{},
-      _attributes{ attributes }
+    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _sum{}, _dst{}, _attributes{attributes}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _sum = this->tensors().get_const_tensor(TensorType::ACL_SRC_1);
@@ -76,7 +73,7 @@
     data0 = VLOAD(N0)(0, (__global {{DATA_TYPE}} *)src_addr);
 )_";
 
-    if(_attributes.is_log_softmax())
+    if (_attributes.is_log_softmax())
     {
         code += R"_(
     sum_val = log(sum_val);
@@ -101,23 +98,11 @@
 
 void ClTemplateLogits1DNorm::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
 {
-    vtable.declare_variable(
-        comp_group,
-        _src,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
-        "src");
+    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "src");
 
-    vtable.declare_variable(
-        comp_group,
-        _sum,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
-        "sum");
+    vtable.declare_variable(comp_group, _sum, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "sum");
 
-    vtable.declare_variable(
-        comp_group,
-        _dst,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D),
-        "dst");
+    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_3D), "dst");
 }
 
 TagLUT ClTemplateLogits1DNorm::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
@@ -168,14 +153,14 @@
 
 std::set<std::string> ClTemplateLogits1DNorm::get_headers_list() const
 {
-    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
 }
 
 Window ClTemplateLogits1DNorm::get_window() const
 {
     ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
     constexpr unsigned int serial_vector_size = 16;
-    const unsigned int vector_size = adjust_vec_size(serial_vector_size, _src->dimension(0));
+    const unsigned int     vector_size        = adjust_vec_size(serial_vector_size, _src->dimension(0));
 
     Window win = calculate_max_window(*_src, Steps(vector_size));
     return win.collapse(win, Window::DimZ);
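
Reviewer note on the two softmax writers above (MaxShiftExpSum, then Norm): together they implement the numerically stable decomposition — subtract the row max, scale by beta, exponentiate and accumulate the sum, then either divide by the sum or, for log-softmax, subtract log(sum). A scalar CPU reference of that semantics (illustrative only; the OpenCL templates do this on tiles):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    // Assumes x is non-empty. Pass 1 = max-shift-exp-sum, pass 2 = norm.
    void softmax_1d(std::vector<float> &x, float beta, bool log_softmax)
    {
        const float max_val = *std::max_element(x.begin(), x.end());
        float       sum     = 0.f;
        for (float &v : x)
        {
            v = (v - max_val) * beta; // shift (and optional beta scale)
            sum += std::exp(v);
        }
        for (float &v : x)
        {
            v = log_softmax ? v - std::log(sum) : std::exp(v) / sum;
        }
    }
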
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
index 34840c2..ebb0374 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.cpp
@@ -23,14 +23,13 @@
  */
 #include "ClTemplatePool2d.h"
 
-#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
-#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
-#include "src/core/helpers/WindowHelpers.h"
 
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentDirectConv2d.h"
+#include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -50,11 +49,7 @@
                                    const ArgumentPack<ITensorInfo> &tensors,
                                    const Attributes                &attributes,
                                    const Settings                  &settings)
-    : IGpuTemplateComponentWriter{ id, tensors },
-      _src{},
-      _dst{},
-      _attributes{ attributes },
-      _settings{ settings }
+    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}, _settings{settings}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -71,7 +66,7 @@
     ARM_COMPUTE_UNUSED(comp_group);
 
     // Condition to use 2x2 optimized kernel
-    if(_attributes.pool_size() == Size2D(2, 2))
+    if (_attributes.pool_size() == Size2D(2, 2))
     {
         return get_2x2_kernel_code();
     }
@@ -83,11 +78,13 @@
 
 std::string ClTemplatePool2d::get_MxN_kernel_code() const
 {
-    const auto pool_type          = _attributes.pool_type();
-    const bool fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
+    const auto pool_type = _attributes.pool_type();
+    const bool fp_mixed_precision =
+        (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
 
     // Define pool op macro.
-    std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
+    std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
+                                                          : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
 
     // Kernel start
     // Note: If C is not multiple of N0, we shift back of PARTIAL_N0 elements to compute the leftover elements for get_global_id(0) == 0
@@ -129,7 +126,7 @@
 )_";
 
     // Determine filter size depending on if padding is excluded or not
-    if(_attributes.exclude_padding())
+    if (_attributes.exclude_padding())
     {
         code += R"_(
     const int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
@@ -144,7 +141,8 @@
 
     // Loop through pool size
     // if global pooling
-    if(_attributes.pool_size().x() == _src->dimension(width_idx) && _attributes.pool_size().y() == _src->dimension(height_idx))
+    if (_attributes.pool_size().x() == _src->dimension(width_idx) &&
+        _attributes.pool_size().y() == _src->dimension(height_idx))
     {
         // Begin loop
         code += R"_(
@@ -173,7 +171,7 @@
 
     // if condition inside loop - use 32bit acc if mixed_precision.
     // End loop through pooling section.
-    if(fp_mixed_precision)
+    if (fp_mixed_precision)
     {
         // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
         code += R"_(
@@ -194,7 +192,7 @@
     }
 
     // For Pool AVG ONLY, divide pool output by filter size
-    if(pool_type == PoolingType::AVG)
+    if (pool_type == PoolingType::AVG)
     {
         code += R"_(
     res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
@@ -202,7 +200,7 @@
     }
 
     // If mixed precision convert datatype before storing. Then end kernel.
-    if(fp_mixed_precision)
+    if (fp_mixed_precision)
     {
         code += R"_(
     VEC_DATA_TYPE({{DATA_TYPE}}, N0)
@@ -228,9 +226,11 @@
 
 std::string ClTemplatePool2d::get_2x2_kernel_code() const
 {
-    const auto  pool_type          = _attributes.pool_type();
-    const bool  fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
-    std::string pool_op            = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_" : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
+    const auto pool_type = _attributes.pool_type();
+    const bool fp_mixed_precision =
+        (_src->data_type() == DataType::F16) && _settings.mixed_precision() && pool_type != PoolingType::MAX;
+    std::string pool_op = (pool_type == PoolingType::AVG) ? R"_(#define POOL_OP(x,y) ((x) + (y)))_"
+                                                          : R"_(#define POOL_OP(x,y) (fmax((x), (y))) )_";
 
     std::string code = R"_(
 //------------------ START KERNEL {{meta_kernel_id}} ---------------------
@@ -274,7 +274,7 @@
     REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0), data, 0);
 )_";
 
-    if(fp_mixed_precision)
+    if (fp_mixed_precision)
     {
         // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
         code += R"_(
@@ -294,7 +294,7 @@
 )_";
     }
 
-    if(pool_type != PoolingType::MAX)
+    if (pool_type != PoolingType::MAX)
     {
         // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
         code += R"_(
@@ -321,10 +321,10 @@
     res0 = POOL_OP(res0, data3);
 )_";
 
-    if(pool_type == PoolingType::AVG)
+    if (pool_type == PoolingType::AVG)
     {
         // If avg pooling divide result accordingly.
-        if(_attributes.exclude_padding())
+        if (_attributes.exclude_padding())
         {
             code += R"_(
     res0 /= (VEC_DATA_TYPE({{ACC_DATA_TYPE}}, N0))filter_size;
@@ -339,7 +339,7 @@
     }
 
     // Store result
-    if(fp_mixed_precision)
+    if (fp_mixed_precision)
     {
         code += R"_(
     VEC_DATA_TYPE({{DATA_TYPE}}, N0)
@@ -365,17 +365,11 @@
 
 void ClTemplatePool2d::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
 {
-    vtable.declare_variable(
-        comp_group,
-        _src,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "src");
+    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "src");
 
-    vtable.declare_variable(
-        comp_group,
-        _dst,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "dst");
+    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "dst");
 }
 
 TagLUT ClTemplatePool2d::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
@@ -391,12 +385,15 @@
     lut["meta_kernel_id"] = id();
 
     // Retrieve relevant data
-    const auto        padding                = _attributes.pad();
-    const auto        stride                 = _attributes.stride();
-    const auto        pool_size              = _attributes.pool_size();
-    const auto        data_type              = _src->data_type();
-    const auto        use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() && _attributes.pool_type() != PoolingType::MAX;
-    const std::string max_initial_value      = _settings.use_inf_as_limit() ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
+    const auto padding                = _attributes.pad();
+    const auto stride                 = _attributes.stride();
+    const auto pool_size              = _attributes.pool_size();
+    const auto data_type              = _src->data_type();
+    const auto use_fp_mixed_precision = (_src->data_type() == DataType::F16) && _settings.mixed_precision() &&
+                                        _attributes.pool_type() != PoolingType::MAX;
+    const std::string max_initial_value =
+        _settings.use_inf_as_limit() ? "(-INFINITY)"
+                                     : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
 
     // pool specific
     lut["STRIDE_X"]    = stride.x();
@@ -407,7 +404,8 @@
     lut["POOL_SIZE_Y"] = pool_size.height;
 
     // Datatypes and variables
-    lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type((use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use.
+    lut["ACC_DATA_TYPE"] = get_cl_type_from_data_type(
+        (use_fp_mixed_precision) ? (DataType::F32) : (data_type)); // Type of accumulators to use.
     lut["DATA_TYPE"]     = get_cl_type_from_data_type(data_type);
     lut["SRC_WIDTH"]     = _src->dimension(width_idx);
     lut["SRC_HEIGHT"]    = _src->dimension(height_idx);
@@ -454,14 +452,14 @@
 
 std::set<std::string> ClTemplatePool2d::get_headers_list() const
 {
-    return std::set<std::string>{ "helpers.h", "tile_helpers.h", "repeat.h" };
+    return std::set<std::string>{"helpers.h", "tile_helpers.h", "repeat.h"};
 }
 
 Window ClTemplatePool2d::get_window() const
 {
     ARM_COMPUTE_ERROR_ON_MSG(_dst->tensor_shape().total_size() == 0U, "Destination tensor is not initialized");
     const auto         output_shape = _dst->tensor_shape();
-    const unsigned int vec_size     = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
+    const unsigned int vec_size = adjust_vec_size(((_dst->data_type() == DataType::F32) ? 2 : 4), _dst->dimension(0));
 
     // Create and configure kernel window
     auto win = calculate_max_window(output_shape, Steps(vec_size));
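
Reviewer note on the exclude_padding branch in get_MxN_kernel_code above: when padding is excluded, `filter_size` is recomputed per output point from the clamped pool bounds, so padded taps do not dilute the average. A rough host-side reconstruction of that divisor choice (hypothetical helper, not an ACL function; window origin is in input coordinates and may be negative because of padding):

    #include <algorithm>

    int avg_pool_divisor(int win_x, int win_y, int pool_w, int pool_h,
                         int src_w, int src_h, bool exclude_padding)
    {
        if (!exclude_padding)
        {
            return pool_w * pool_h; // padded taps count toward the average
        }
        // Clamp the pooling window to the valid input region and count
        // only the taps that actually read input data.
        const int x_s = std::max(win_x, 0), x_e = std::min(win_x + pool_w, src_w);
        const int y_s = std::max(win_y, 0), y_e = std::min(win_y + pool_h, src_h);
        return (x_e - x_s) * (y_e - y_s);
    }
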
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h
index ef1c100..d1d3c01 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplatePool2d.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/experimental/Types.h"
 #include "arm_compute/dynamic_fusion/sketch/attributes/Pool2dAttributes.h"
 #include "arm_compute/dynamic_fusion/sketch/gpu/operators/GpuPool2d.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentPool2d.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp
index 8b50f1e..c882353 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
 
@@ -36,11 +37,8 @@
 {
 constexpr unsigned int vector_size_byte_opencl = 16;
 
-ClTemplateReshape::ClTemplateReshape(ComponentId                      id,
-                                     const ArgumentPack<ITensorInfo> &tensors)
-    : IGpuTemplateComponentWriter{ id, tensors },
-      _src{},
-      _dst{}
+ClTemplateReshape::ClTemplateReshape(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
+    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -97,23 +95,17 @@
 
 void ClTemplateReshape::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
 {
-    vtable.declare_variable(
-        comp_group,
-        _src,
-        GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D
-        "src");
+    vtable.declare_variable(comp_group, _src,
+                            GpuKernelArgumentInfo(common_tensor_type), // GpuKernelArgumentInfo::Type::Image_3D
+                            "src");
 
-    vtable.declare_variable(
-        comp_group,
-        _dst,
-        GpuKernelArgumentInfo(common_tensor_type),
-        "dst");
+    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(common_tensor_type), "dst");
 }
 
 TagLUT ClTemplateReshape::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
 {
     ARM_COMPUTE_UNUSED(comp_group);
-    TagLUT     lut{};
+    TagLUT lut{};
 
     // Arguments and global shared variables
     lut["src"]            = vtable.get_variable(_src);
@@ -153,7 +145,7 @@
 
 std::set<std::string> ClTemplateReshape::get_headers_list() const
 {
-    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
 }
 
 Window ClTemplateReshape::get_window() const
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h
index 56b6585..838a21d 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateReshape.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATERESHAPE
 
 #include "arm_compute/core/experimental/Types.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/cl/ClComponentReshape.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
 
@@ -42,8 +43,7 @@
      * @param[in] id      Component id
      * @param[in] tensors Tensor arguments to the components
      */
-    ClTemplateReshape(ComponentId                      id,
-                      const ArgumentPack<ITensorInfo> &tensors);
+    ClTemplateReshape(ComponentId id, const ArgumentPack<ITensorInfo> &tensors);
     /** Prevent instances of this class from being copy constructed */
     ClTemplateReshape(const ClTemplateReshape &reshape) = delete;
     /** Prevent instances of this class from being copied */
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
index aaed1d9..846c712 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateResize.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "src/dynamic_fusion/sketch/gpu/GpuKernelComponentGroup.h"
@@ -37,8 +38,10 @@
 {
 namespace dynamic_fusion
 {
-ClTemplateResize::ClTemplateResize(ComponentId id, const ArgumentPack<ITensorInfo> &tensors, const ClTemplateResize::Attributes &attributes)
-    : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}, _attributes{ attributes }
+ClTemplateResize::ClTemplateResize(ComponentId                         id,
+                                   const ArgumentPack<ITensorInfo>    &tensors,
+                                   const ClTemplateResize::Attributes &attributes)
+    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}, _attributes{attributes}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -63,9 +66,9 @@
     const int bout = g_ind_2 / {{arg_dst}}_h;
 )_";
 
-    if(_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR)
+    if (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR)
     {
-        if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
+        if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
         {
             code += R"_(
     float xi_f = (g_ind_1 * {{SCALE_X}});
@@ -80,7 +83,7 @@
 )_";
         }
 
-        if(_attributes.align_corners())
+        if (_attributes.align_corners())
         {
             code += R"_(
     xi_f = round(xi_f);
@@ -95,9 +98,9 @@
     T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi0, xi0, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, {{dst}});
 )_";
     }
-    else if(_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR)
+    else if (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR)
     {
-        if(_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
+        if (_attributes.sampling_policy() == SamplingPolicy::TOP_LEFT)
         {
             code += R"_(
     float xi_f = (g_ind_1 * {{SCALE_X}});
@@ -137,7 +140,7 @@
     T_LOAD_NHWC_WITH_DILATION({{SRC_DATA_TYPE}}, 1, 1, N0, {{SRC_TENSOR_TYPE}}, {{src}}, bout, yi1, xi1, g_ind_0, {{src}}_w, {{src}}_h, 1, 1, false, in11);
 )_";
 
-        if(is_data_type_float(_src->data_type()))
+        if (is_data_type_float(_src->data_type()))
         {
             code += R"_(
     const {{SRC_DATA_TYPE}} a  = ({{SRC_DATA_TYPE}})(xi_f - (float)xi);
@@ -158,9 +161,9 @@
     const float b1 = (1.f - a1);
 
     {{dst}}[0].v = CONVERT_SAT(
-        (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) + 
+        (CONVERT(in00[0].v, VEC_DATA_TYPE(float, N0)) * b * b1) +
         (CONVERT(in01[0].v, VEC_DATA_TYPE(float, N0)) * a * b1) +
-        (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) + 
+        (CONVERT(in10[0].v, VEC_DATA_TYPE(float, N0)) * b * a1) +
         (CONVERT(in11[0].v, VEC_DATA_TYPE(float, N0)) * a * a1), VEC_DATA_TYPE({{DST_DATA_TYPE}}, N0));
 )_";
         }
@@ -179,22 +182,18 @@
     return code;
 }
 
-void ClTemplateResize::declare_variables(GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
+void ClTemplateResize::declare_variables(GpuKernelVariableTable                            &vtable,
+                                         const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
 {
-    vtable.declare_variable(
-        comp_group,
-        _src,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "src");
+    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "src");
 
-    vtable.declare_variable(
-        comp_group,
-        _dst,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "dst");
+    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "dst");
 }
 
-TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable &vtable, const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
+TagLUT ClTemplateResize::get_tag_lut(const GpuKernelVariableTable                      &vtable,
+                                     const IGpuTemplateComponentWriter::ComponentGroup &comp_group) const
 {
     TagLUT lut{};
 
@@ -212,8 +211,10 @@
     lut["DST_DATA_TYPE"]   = get_cl_type_from_data_type(_dst->data_type());
     lut["CONSTANT_VALUE"]  = string_from_pixel_value(0, _src->data_type());
 
-    const float scale_x = scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners());
-    const float scale_y = scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners());
+    const float scale_x =
+        scale_utils::calculate_resize_ratio(_src->dimension(1), _dst->dimension(1), _attributes.align_corners());
+    const float scale_y =
+        scale_utils::calculate_resize_ratio(_src->dimension(2), _dst->dimension(2), _attributes.align_corners());
 
     lut["SCALE_X"] = float_to_string_with_full_precision(scale_x);
     lut["SCALE_Y"] = float_to_string_with_full_precision(scale_y);
@@ -242,7 +243,8 @@
     std::string config_id{};
 
     config_id += "resize_";
-    config_id += (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : "");
+    config_id +=
+        (_attributes.interpolation_policy() == InterpolationPolicy::NEAREST_NEIGHBOR ? "NEAREST_NEIGHBOR" : "");
     config_id += (_attributes.interpolation_policy() == InterpolationPolicy::BILINEAR ? "BILINEAR" : "");
     config_id += "_";
     config_id += (_attributes.sampling_policy() == SamplingPolicy::CENTER ? "center" : "topleft");
@@ -260,7 +262,7 @@
 
 std::set<std::string> ClTemplateResize::get_headers_list() const
 {
-    return std::set<std::string>{ "helpers.h", "tile_helpers.h" };
+    return std::set<std::string>{"helpers.h", "tile_helpers.h"};
 }
 
 Window ClTemplateResize::get_window() const
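
Reviewer note on the bilinear path above: the emitted kernel blends four neighbours with weights a/b (horizontal) and a1/b1 (vertical), where `scale` comes from calculate_resize_ratio (commonly in/out, or (in-1)/(out-1) when align_corners is set). A scalar sketch of the float TOP_LEFT case, with clamp-style border handling for illustration:

    #include <algorithm>
    #include <cmath>

    float resize_bilinear_at(const float *src, int src_w, int src_h,
                             int out_x, int out_y, float scale_x, float scale_y)
    {
        const float xi_f = out_x * scale_x, yi_f = out_y * scale_y;
        const int   xi = static_cast<int>(std::floor(xi_f));
        const int   yi = static_cast<int>(std::floor(yi_f));
        const float a = xi_f - xi, b = 1.f - a;    // horizontal weights
        const float a1 = yi_f - yi, b1 = 1.f - a1; // vertical weights
        auto at = [&](int x, int y) {
            x = std::min(std::max(x, 0), src_w - 1); // keep reads in bounds
            y = std::min(std::max(y, 0), src_h - 1);
            return src[y * src_w + x];
        };
        // Same 4-tap blend as the CONVERT_SAT expression in the template.
        return at(xi, yi) * b * b1 + at(xi + 1, yi) * a * b1 +
               at(xi, yi + 1) * b * a1 + at(xi + 1, yi + 1) * a * a1;
    }
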
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
index 217214c..d0ec91e 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.cpp
@@ -32,7 +32,7 @@
 namespace dynamic_fusion
 {
 ClTemplateStore::ClTemplateStore(ComponentId id, const ArgumentPack<ITensorInfo> &tensors)
-    : IGpuTemplateComponentWriter{ id, tensors }, _src{}, _dst{}
+    : IGpuTemplateComponentWriter{id, tensors}, _src{}, _dst{}
 {
     _src = this->tensors().get_const_tensor(TensorType::ACL_SRC_0);
     _dst = this->tensors().get_const_tensor(TensorType::ACL_DST_0);
@@ -61,16 +61,10 @@
 
 void ClTemplateStore::declare_variables(GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
 {
-    vtable.declare_variable(
-        comp_group,
-        _src,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "src");
-    vtable.declare_variable(
-        comp_group,
-        _dst,
-        GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
-        "dst");
+    vtable.declare_variable(comp_group, _src, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "src");
+    vtable.declare_variable(comp_group, _dst, GpuKernelArgumentInfo(GpuKernelArgumentInfo::Type::Tensor_4D_t_Buffer),
+                            "dst");
 }
 
 TagLUT ClTemplateStore::get_tag_lut(const GpuKernelVariableTable &vtable, const ComponentGroup &comp_group) const
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
index 3f97a82..b8c82ce 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateStore.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_GPU_TEMPLATE_WRITER_CL_CLTEMPLATESTORE
 
 #include "arm_compute/core/experimental/Types.h"
+
 #include "src/dynamic_fusion/sketch/gpu/template_writer/GpuKernelVariableTable.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
 
diff --git a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
index eda15f1..d3d7c8d 100644
--- a/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
+++ b/src/dynamic_fusion/sketch/gpu/template_writer/cl/ClTemplateWriter.cpp
@@ -24,6 +24,7 @@
 #include "ClTemplateWriter.h"
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+
 #include "src/dynamic_fusion/sketch/gpu/components/IGpuKernelComponent.h"
 #include "src/dynamic_fusion/sketch/gpu/template_writer/IGpuTemplateComponentWriter.h"
 
@@ -39,11 +40,11 @@
     std::string replaced_code    = "";
     bool        scanning_pattern = false;
     std::string pattern_found    = "";
-    for(size_t i = 0; i < code_template.size() - 1; ++i)
+    for (size_t i = 0; i < code_template.size() - 1; ++i)
     {
-        if(!scanning_pattern)
+        if (!scanning_pattern)
         {
-            if(code_template[i] == '{' && code_template[i + 1] == '{')
+            if (code_template[i] == '{' && code_template[i + 1] == '{')
             {
                 i += 1;
                 scanning_pattern = true;
@@ -56,7 +57,7 @@
         }
         else
         {
-            if(code_template[i] == '}' && code_template[i + 1] == '}')
+            if (code_template[i] == '}' && code_template[i + 1] == '}')
             {
                 i += 1;
                 scanning_pattern = false;
@@ -76,8 +77,7 @@
 ClTemplateWriter::~ClTemplateWriter()
 {
 }
-ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components)
-    : _components{ components }
+ClTemplateWriter::ClTemplateWriter(const GpuKernelComponentGroup &components) : _components{components}
 {
 }
 std::string ClTemplateWriter::get_name()
@@ -91,7 +91,7 @@
 std::string ClTemplateWriter::get_config_id()
 {
     std::string config_id = get_name();
-    for(const auto &comp : _components)
+    for (const auto &comp : _components)
     {
         config_id += "--" + comp->template_writer()->get_config_id() + "--";
     }
@@ -103,7 +103,7 @@
 {
     CLBuildOptions build_opts{};
 
-    for(const auto &comp : _components)
+    for (const auto &comp : _components)
     {
         build_opts.add_options(comp->template_writer()->get_build_options(_components).options());
     }
@@ -122,11 +122,9 @@
 {
     // Assemble GpuKernelArguments
     std::map<ITensorInfo::Id, GpuKernelArgument> tensors;
-    for(const auto t : _components.get_argument_tensors())
+    for (const auto t : _components.get_argument_tensors())
     {
-        tensors.emplace(
-            t->id(),
-            GpuKernelArgument{ *t, _vtable.get_variable(t).kernel_argument_info });
+        tensors.emplace(t->id(), GpuKernelArgument{*t, _vtable.get_variable(t).kernel_argument_info});
     }
     return tensors;
 }
@@ -141,22 +139,24 @@
     std::vector<std::string> component_codes{}; // vector because order matters
 
     // Pass 1: Declare all kernel variables
-    for(auto &component : _components)
+    for (auto &component : _components)
     {
         component->template_writer()->declare_variables(_vtable, _components);
     }
     // Pass 2: Generate component codes
-    for(auto &component : _components)
+    for (auto &component : _components)
     {
         const auto component_writer       = component->template_writer();
         auto       curr_headers_list      = component_writer->get_headers_list();
         auto       curr_additional_macros = component_writer->get_additional_macros();
         auto       curr_component_code    = component_writer->get_component_code(_components);
-        const auto var_lut                = component_writer->get_tag_lut(_vtable, _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
+        const auto var_lut                = component_writer->get_tag_lut(
+                           _vtable,
+                           _components); // Ideally can be merged with get_component_code once we have finer-grained code generation technique
         component_codes.push_back(replace_tags(curr_component_code, var_lut));
 
         headers_list.insert(curr_headers_list.begin(), curr_headers_list.end());
-        if(!additional_macros.empty()) // Some components might not have any
+        if (!additional_macros.empty()) // Some components might not have any
         {
             additional_macros.insert(replace_tags(curr_additional_macros, var_lut));
         }
@@ -165,7 +165,7 @@
     // Step 3: Assemble the data gathered by traversing the graph into the string "code"
     std::string code = "";
 
-    for(auto &header : headers_list)
+    for (auto &header : headers_list)
     {
 #if defined(EMBEDDED_KERNELS)
         code += CLKernelLibrary::get().get_program(header).first;
@@ -174,16 +174,14 @@
 #endif // defined(EMBEDDED_KERNELS)
     }
 
-    for(auto &macros : additional_macros)
+    for (auto &macros : additional_macros)
     {
         code += macros;
     }
 
     auto arguments = _components.get_argument_tensors();
-    std::sort(arguments.begin(), arguments.end(), [](const ITensorInfo * l, const ITensorInfo * r)
-    {
-        return l->id() < r->id();
-    });
+    std::sort(arguments.begin(), arguments.end(),
+              [](const ITensorInfo *l, const ITensorInfo *r) { return l->id() < r->id(); });
     code += write_kernel_signature(_vtable.get_variable_list(arguments));
 
     code += "\n{\n\n";
@@ -198,7 +196,7 @@
 
         tiles_ss << "    //------------------ START TILE DECLARATION ---------------------\n";
 
-        for(auto tile : tiles)
+        for (auto tile : tiles)
         {
             const auto var       = _vtable.get_variable(tile);
             const auto data_type = get_cl_type_from_data_type(tile->data_type());
@@ -212,7 +210,7 @@
         code += tiles_ss.str();
     }
 
-    for(const auto &component_code : component_codes)
+    for (const auto &component_code : component_codes)
     {
         code += component_code;
         code += "\n";
@@ -231,7 +229,8 @@
     auto       leftover_w = dst_w % tile_w;
 
     std::string code = "";
-    code += std::string("    int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " + std::to_string(leftover_w) + ");\n";
+    code += std::string("    int g_ind_0 = GET_SPATIAL_IDX(0, ") + std::to_string(tile_w) + ", " +
+            std::to_string(leftover_w) + ");\n";
     code += std::string("    int g_ind_1 = GET_SPATIAL_IDX(1, ") + std::to_string(tile_h) + ", " + "0);\n";
     code += std::string("    int g_ind_2 = GET_SPATIAL_IDX(2, 1, 0);\n\n");
 
@@ -243,7 +242,7 @@
 std::string ClTemplateWriter::write_argument_declaration(const GpuKernelVariableTable::TensorVariable &var) const
 {
     std::string code;
-    switch(var.kernel_argument_info.type)
+    switch (var.kernel_argument_info.type)
     {
         case GpuKernelArgumentInfo::Type::Vector:
         {
@@ -293,11 +292,11 @@
 {
     std::string code = "\n__kernel void " + write_kernel_name() + "(";
 
-    for(int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i)
+    for (int i = 0; i < static_cast<int>(argument_list.size()) - 1; ++i)
     {
         code += write_argument_declaration(argument_list[i]) + ",";
     }
-    if(static_cast<int>(argument_list.size()) - 1 >= 0)
+    if (static_cast<int>(argument_list.size()) - 1 >= 0)
     {
         code += write_argument_declaration(argument_list[argument_list.size() - 1]);
     }
@@ -308,12 +307,12 @@
 }
 std::string ClTemplateWriter::write_kernel_name() const
 {
-    if(_components.empty())
+    if (_components.empty())
     {
         return "empty_kernel";
     }
     std::string name = _components.empty() ? "" : _components[0]->template_writer()->get_name();
-    for(size_t i = 1; i < _components.size(); ++i)
+    for (size_t i = 1; i < _components.size(); ++i)
     {
         name += "___";
         name += _components[i]->template_writer()->get_name();
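
Reviewer note on ClTemplateWriter::replace_tags, reformatted above: it scans the kernel template for "{{name}}" markers and splices in the value from the tag LUT. A standalone sketch of the same scanning idea (standard library only, simplified error handling; not ACL's exact API):

    #include <map>
    #include <string>

    std::string replace_tags_sketch(const std::string &tmpl,
                                    const std::map<std::string, std::string> &lut)
    {
        std::string out;
        for (size_t i = 0; i < tmpl.size(); ++i)
        {
            if (tmpl.compare(i, 2, "{{") == 0)
            {
                const size_t end = tmpl.find("}}", i + 2);
                if (end == std::string::npos)
                {
                    out += tmpl.substr(i); // unterminated tag: keep literally
                    break;
                }
                out += lut.at(tmpl.substr(i + 2, end - i - 2)); // throws on unknown tag
                i = end + 1; // loop's ++i then resumes after "}}"
            }
            else
            {
                out += tmpl[i];
            }
        }
        return out;
    }
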
diff --git a/src/dynamic_fusion/sketch/utils/DependencyGraph.h b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
index c891e76..c157c2b 100644
--- a/src/dynamic_fusion/sketch/utils/DependencyGraph.h
+++ b/src/dynamic_fusion/sketch/utils/DependencyGraph.h
@@ -25,6 +25,7 @@
 #define SRC_DYNAMIC_FUSION_SKETCH_UTILS_DEPENDENCYGRAPH
 
 #include "arm_compute/core/Error.h"
+
 #include <cstdint>
 #include <map>
 #include <set>
@@ -68,12 +69,10 @@
         OperatorId            op{};
         std::vector<TensorId> inputs{};
         std::vector<TensorId> outputs{};
-        friend bool operator==(const OpPack &opp0, const OpPack &opp1)
+        friend bool           operator==(const OpPack &opp0, const OpPack &opp1)
         {
-            return std::make_tuple(
-                       opp0.op, opp0.inputs, opp0.outputs)
-                   == std::make_tuple(
-                       opp1.op, opp1.inputs, opp1.outputs);
+            return std::make_tuple(opp0.op, opp0.inputs, opp0.outputs) ==
+                   std::make_tuple(opp1.op, opp1.inputs, opp1.outputs);
         }
     };
 
@@ -95,10 +94,13 @@
      * @return true  If the operator can be added while keeping the graph as a linear sequence
      * @return false  Otherwise
      */
-    bool try_add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs, bool is_output = false) const
+    bool try_add_operator_as_linear(OperatorId                   op,
+                                    const std::vector<TensorId> &inputs,
+                                    const std::vector<TensorId> &outputs,
+                                    bool                         is_output = false) const
     {
         ARM_COMPUTE_UNUSED(op, is_output);
-        if(all_ops().empty())
+        if (all_ops().empty())
         {
             return true;
         }
@@ -106,25 +108,25 @@
         // If the new operator is not the first operator, at least one input tensor must be
         // the output tensor of the last non-output operator. All other input tensors must be
         // the global input of the graph (i.e. not the output of any operator).
-        if(_last_op_available)
+        if (_last_op_available)
         {
             auto use_input_from_last_op = false;
 
-            for(auto src_tensor : inputs)
+            for (auto src_tensor : inputs)
             {
                 const auto src_ops = _adj_src_ops.find(src_tensor);
 
-                if(src_ops != _adj_src_ops.end())
+                if (src_ops != _adj_src_ops.end())
                 {
                     ARM_COMPUTE_ERROR_ON(src_ops->second.size() > 1);
 
-                    if(!src_ops->second.empty())
+                    if (!src_ops->second.empty())
                     {
                         const auto src_op = src_ops->second[0];
 
-                        if(src_op == _last_op)
+                        if (src_op == _last_op)
                         {
-                            if(use_input_from_last_op)
+                            if (use_input_from_last_op)
                             {
                                 // To be safe, we also forbid using the output tensor
                                 // of the last operator twice.
@@ -143,7 +145,7 @@
                 }
             }
 
-            if(!use_input_from_last_op)
+            if (!use_input_from_last_op)
             {
                 // At least one input tensor must be the output tensor of the last non-output operator.
                 return false;
@@ -152,9 +154,9 @@
 
         // The output tensor of the new operator must not be the input tensor of any previously
         // added operator.
-        for(auto dst_tensor : outputs)
+        for (auto dst_tensor : outputs)
         {
-            if(_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end())
+            if (_adj_dst_ops.find(dst_tensor) != _adj_dst_ops.end())
             {
                 return false;
             }
@@ -168,7 +170,10 @@
      * INVARIANT: The list can only grow from head to tail
      * INVARIANT: POSTCONDITION: The graph is linear
      */
-    void add_operator_as_linear(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs, bool is_output = false)
+    void add_operator_as_linear(OperatorId                   op,
+                                const std::vector<TensorId> &inputs,
+                                const std::vector<TensorId> &outputs,
+                                bool                         is_output = false)
     {
         const auto success = add_operator(op, inputs, outputs, is_output);
         ARM_COMPUTE_UNUSED(success);
@@ -183,24 +188,27 @@
      * @param[in] outputs   Output tensors to the operator
      * @param[in] is_output Whether this is an output operator
      */
-    bool add_operator(OperatorId op, const std::vector<TensorId> &inputs, const std::vector<TensorId> &outputs, bool is_output = false)
+    bool add_operator(OperatorId                   op,
+                      const std::vector<TensorId> &inputs,
+                      const std::vector<TensorId> &outputs,
+                      bool                         is_output = false)
     {
-        if(operator_exists(op))
+        if (operator_exists(op))
         {
             return false;
         }
         _adj_src_tensors[op] = {};
         _adj_dst_tensors[op] = {};
-        for(auto in_tensor : inputs)
+        for (auto in_tensor : inputs)
         {
             // Linking input tensor to operator node will never create a cycle / loop because we guarantee
             // each op is newly created, so every <input, op> pair / edge is new
             link_input(op, in_tensor);
         }
-        for(auto out_tensor : outputs)
+        for (auto out_tensor : outputs)
         {
             // If there exists a back path from op's output tensor to op already, then linking the two will create a loop / cycle
-            if(path_exists_from_tensor_to_op(out_tensor, op))
+            if (path_exists_from_tensor_to_op(out_tensor, op))
             {
                 remove_operator(op);
                 return false;
@@ -211,10 +219,10 @@
             }
         }
 
-        if(!is_output)
+        if (!is_output)
         {
             _last_op_available = true;
-            _last_op = op;
+            _last_op           = op;
         }
 
         return true;
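
Aside — a hedged usage sketch of add_operator above; the operator/tensor ids and the default-constructed graph are illustrative assumptions, not taken from this patch:

    DependencyGraph g{};
    g.add_operator(0, /* inputs */ {10}, /* outputs */ {11}); // true
    g.add_operator(1, /* inputs */ {11}, /* outputs */ {12}); // true
    // Rejected: tensor 10 already reaches op 2 (10 -> op 0 -> 11 -> op 1 -> 12 -> op 2),
    // so writing to it would close a cycle; add_operator removes op 2 again and returns false.
    g.add_operator(2, /* inputs */ {12}, /* outputs */ {10});
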
@@ -230,16 +238,16 @@
     std::vector<OpPack> build_operators_sequence() const
     {
         std::vector<OpPack> ops_seq;
-        std::set<Id> done_ops;
-        std::set<Id> done_tensors;
+        std::set<Id>        done_ops;
+        std::set<Id>        done_tensors;
 
         const auto input_tensors = global_src_tensors();
 
-        for(auto tensor : input_tensors)
+        for (auto tensor : input_tensors)
         {
             done_tensors.insert(tensor);
 
-            for(auto op : _adj_dst_ops.at(tensor))
+            for (auto op : _adj_dst_ops.at(tensor))
             {
                 build_operators_sequence_from_op(op, ops_seq, done_ops, done_tensors);
             }
@@ -260,10 +268,8 @@
     friend bool operator==(const DependencyGraph &g0, const DependencyGraph &g1)
     {
         // Do not compare id allocators
-        return std::make_tuple(
-                   g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops)
-               == std::make_tuple(
-                   g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops);
+        return std::make_tuple(g0._adj_src_tensors, g0._adj_dst_tensors, g0._adj_src_ops, g0._adj_dst_ops) ==
+               std::make_tuple(g1._adj_src_tensors, g1._adj_dst_tensors, g1._adj_src_ops, g1._adj_dst_ops);
     }
     std::vector<OperatorId> src_ops_from_tensor(TensorId tensor) const
     {
@@ -280,10 +286,8 @@
     std::vector<TensorId> all_tensors() const
     {
         std::vector<TensorId> tensors{};
-        std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors), [](const auto & it)
-        {
-            return it.first;
-        });
+        std::transform(std::begin(_adj_src_ops), std::end(_adj_src_ops), std::back_inserter(tensors),
+                       [](const auto &it) { return it.first; });
         return tensors;
     }
     /** Get source tensors of the whole graph
@@ -293,9 +297,9 @@
     std::vector<TensorId> global_src_tensors() const
     {
         std::vector<TensorId> tensors;
-        for(auto tensor_src_ops : _adj_src_ops)
+        for (auto tensor_src_ops : _adj_src_ops)
         {
-            if(tensor_src_ops.second.empty())
+            if (tensor_src_ops.second.empty())
             {
                 tensors.push_back(tensor_src_ops.first);
             }
@@ -309,9 +313,9 @@
     std::vector<TensorId> global_dst_tensors() const
     {
         std::vector<TensorId> tensors;
-        for(auto tensor_dst_ops : _adj_dst_ops)
+        for (auto tensor_dst_ops : _adj_dst_ops)
         {
-            if(tensor_dst_ops.second.empty())
+            if (tensor_dst_ops.second.empty())
             {
                 tensors.push_back(tensor_dst_ops.first);
             }
@@ -328,14 +332,14 @@
 
         // If a tensor is used to connect the input of an operator and the output of another operator,
         // it is not allocated in memory; it exists only as a temporary variable.
-        for(auto src_tensor : _adj_src_ops)
+        for (auto src_tensor : _adj_src_ops)
         {
-            if(!src_tensor.second.empty())
+            if (!src_tensor.second.empty())
             {
                 const auto dst_tensor = _adj_dst_ops.find(src_tensor.first);
-                if(dst_tensor != _adj_dst_ops.end())
+                if (dst_tensor != _adj_dst_ops.end())
                 {
-                    if(!dst_tensor->second.empty())
+                    if (!dst_tensor->second.empty())
                     {
                         tensors.push_back(src_tensor.first);
                     }
@@ -354,9 +358,9 @@
         std::vector<OperatorId> ops{};
         const auto              op_list = all_ops();
 
-        for(auto op : op_list)
+        for (auto op : op_list)
         {
-            if(src_ops(op).empty())
+            if (src_ops(op).empty())
             {
                 ops.emplace_back(op);
             }
@@ -368,7 +372,7 @@
     void link_input(OperatorId op, TensorId in_tensor)
     {
         ARM_COMPUTE_ERROR_ON(!operator_exists(op));
-        if(!tensor_exists(in_tensor))
+        if (!tensor_exists(in_tensor))
         {
             insert_new_tensor(in_tensor);
         }
@@ -379,7 +383,7 @@
     void link_output(OperatorId op, TensorId out_tensor)
     {
         ARM_COMPUTE_ERROR_ON(!operator_exists(op));
-        if(!tensor_exists(out_tensor))
+        if (!tensor_exists(out_tensor))
         {
             insert_new_tensor(out_tensor);
         }
@@ -392,7 +396,7 @@
     {
         ARM_COMPUTE_ERROR_ON(!operator_exists(op));
         std::vector<OperatorId> ops{};
-        for(TensorId src_tensor : src_tensors(op))
+        for (TensorId src_tensor : src_tensors(op))
         {
             ops.insert(ops.end(), std::begin(_adj_src_ops.at(src_tensor)), std::end(_adj_src_ops.at(src_tensor)));
         }
@@ -402,7 +406,7 @@
     {
         ARM_COMPUTE_ERROR_ON(!operator_exists(op));
         std::vector<OperatorId> ops{};
-        for(TensorId dst_tensor : _adj_dst_tensors.at(op))
+        for (TensorId dst_tensor : _adj_dst_tensors.at(op))
         {
             ops.insert(ops.end(), std::begin(_adj_dst_ops.at(dst_tensor)), std::end(_adj_dst_ops.at(dst_tensor)));
         }
@@ -436,10 +440,8 @@
     std::vector<OperatorId> all_ops() const
     {
         std::vector<OperatorId> ops{};
-        std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops), [](const auto & it)
-        {
-            return it.first;
-        });
+        std::transform(std::begin(_adj_src_tensors), std::end(_adj_src_tensors), std::back_inserter(ops),
+                       [](const auto &it) { return it.first; });
         return ops;
     }
     /** Remove an operator from graph.
@@ -448,25 +450,21 @@
      */
     void remove_operator(OperatorId op)
     {
-        for(auto src_tensor : _adj_src_tensors.at(op))
+        for (auto src_tensor : _adj_src_tensors.at(op))
         {
             auto &dst_ops = _adj_dst_ops.at(src_tensor);
-            dst_ops.erase(
-                std::remove(std::begin(dst_ops), std::end(dst_ops), op),
-                std::end(dst_ops));
+            dst_ops.erase(std::remove(std::begin(dst_ops), std::end(dst_ops), op), std::end(dst_ops));
         }
-        for(auto dst_tensor : _adj_dst_tensors.at(op))
+        for (auto dst_tensor : _adj_dst_tensors.at(op))
         {
             auto &src_ops = _adj_src_ops.at(dst_tensor);
-            src_ops.erase(
-                std::remove(std::begin(src_ops), std::end(src_ops), op),
-                std::end(src_ops));
+            src_ops.erase(std::remove(std::begin(src_ops), std::end(src_ops), op), std::end(src_ops));
         }
         // Remove any isolated tensors
         // An isolated tensor is one where both its _adj_src_ops and _adj_dst_ops are empty
-        for(auto t : all_tensors())
+        for (auto t : all_tensors())
         {
-            if(_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty())
+            if (_adj_src_ops.at(t).empty() && _adj_dst_ops.at(t).empty())
             {
                 _adj_src_ops.erase(t);
                 _adj_dst_ops.erase(t);
@@ -486,11 +484,12 @@
     }
     bool operator_exists(OperatorId op) const
     {
-        return _adj_src_tensors.find(op) != _adj_src_tensors.end() && _adj_dst_tensors.find(op) != _adj_dst_tensors.end();
+        return _adj_src_tensors.find(op) != _adj_src_tensors.end() &&
+               _adj_dst_tensors.find(op) != _adj_dst_tensors.end();
     }
     bool is_src_tensor_of(OperatorId op, TensorId tensor) const
     {
-        if(!operator_exists(op) || !tensor_exists(tensor))
+        if (!operator_exists(op) || !tensor_exists(tensor))
         {
             return false;
         }
@@ -499,7 +498,7 @@
     }
     bool is_dst_tensor_of(OperatorId op, TensorId tensor) const
     {
-        if(!operator_exists(op) || !tensor_exists(tensor))
+        if (!operator_exists(op) || !tensor_exists(tensor))
         {
             return false;
         }
@@ -525,9 +524,9 @@
         std::vector<OperatorId> ops{};
         const auto              op_list = all_ops();
 
-        for(auto op : op_list)
+        for (auto op : op_list)
         {
-            if(is_dst_op(op))
+            if (is_dst_op(op))
             {
                 ops.emplace_back(op);
             }
@@ -536,13 +535,13 @@
     }
     bool path_exists_from_tensor_to_op(TensorId src_tensor, OperatorId dst_op) const
     {
-        if(!tensor_exists(src_tensor) || !operator_exists(dst_op))
+        if (!tensor_exists(src_tensor) || !operator_exists(dst_op))
         {
             return false;
         }
-        for(auto child_op : dst_ops_from_tensor(src_tensor))
+        for (auto child_op : dst_ops_from_tensor(src_tensor))
         {
-            if(path_exists_from_op_to_op(child_op, dst_op))
+            if (path_exists_from_op_to_op(child_op, dst_op))
             {
                 return true;
             }
@@ -552,21 +551,21 @@
 
     bool path_exists_from_op_to_op(OperatorId src_op, OperatorId dst_op) const
     {
-        if(!operator_exists(src_op) || !operator_exists(dst_op))
+        if (!operator_exists(src_op) || !operator_exists(dst_op))
         {
             return false;
         }
-        if(src_op == dst_op)
+        if (src_op == dst_op)
         {
             return true;
         }
-        if(is_in(src_op, get_dst_ops()))
+        if (is_in(src_op, get_dst_ops()))
         {
             return false;
         }
-        for(auto child_tensor : dst_tensors(src_op))
+        for (auto child_tensor : dst_tensors(src_op))
         {
-            if(path_exists_from_tensor_to_op(child_tensor, dst_op))
+            if (path_exists_from_tensor_to_op(child_tensor, dst_op))
             {
                 return true;
             }
@@ -574,16 +573,15 @@
         return false;
     }
 
-    void build_operators_sequence_from_op(
-        Id op,
-        std::vector<OpPack> &ops_seq,
-        std::set<Id> &done_ops,
-        std::set<Id> &done_tensors) const
+    void build_operators_sequence_from_op(Id                   op,
+                                          std::vector<OpPack> &ops_seq,
+                                          std::set<Id>        &done_ops,
+                                          std::set<Id>        &done_tensors) const
     {
-        while(true)
+        while (true)
         {
             // If the operator has been added to the sequence, ignore it.
-            if(done_ops.find(op) != done_ops.end())
+            if (done_ops.find(op) != done_ops.end())
             {
                 return;
             }
@@ -593,9 +591,9 @@
             // is added to the sequence.
             const auto src_tensors = _adj_src_tensors.at(op);
 
-            for(auto src : src_tensors)
+            for (auto src : src_tensors)
             {
-                if(done_tensors.find(src) == done_tensors.end())
+                if (done_tensors.find(src) == done_tensors.end())
                 {
                     return;
                 }
@@ -606,24 +604,24 @@
 
             done_ops.insert(op);
 
-            OpPack pack{ op, src_tensors, dst_tensors };
+            OpPack pack{op, src_tensors, dst_tensors};
             ops_seq.push_back(pack);
 
             done_tensors.insert(dst_tensors.begin(), dst_tensors.end());
 
             // Visit all the sink operators.
             // Call this function recursively unless there is only one sink.
-            if(dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1)
+            if (dst_tensors.size() == 1 && _adj_dst_ops.at(dst_tensors[0]).size() == 1)
             {
                 op = _adj_dst_ops.at(dst_tensors[0])[0];
             }
             else
             {
-                for(auto dst_tensor : dst_tensors)
+                for (auto dst_tensor : dst_tensors)
                 {
                     const auto dst_ops = _adj_dst_ops.at(dst_tensor);
 
-                    for(auto dst_op : dst_ops)
+                    for (auto dst_op : dst_ops)
                     {
                         build_operators_sequence_from_op(dst_op, ops_seq, done_ops, done_tensors);
                     }
@@ -640,8 +638,8 @@
     AdjList _adj_src_ops{};
     AdjList _adj_dst_ops{};
 
-    bool _last_op_available{ false };
-    OperatorId _last_op{ 0 };
+    bool       _last_op_available{false};
+    OperatorId _last_op{0};
 };
 
 } // namespace dynamic_fusion
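
Aside — how the graph is consumed end to end, as a hedged sketch; OpPack's member names are not visible in this hunk, so only the pack order is asserted:

    DependencyGraph g{};
    g.add_operator_as_linear(0, {10}, {11});
    g.add_operator_as_linear(1, {11}, {12}, /* is_output */ true);
    // Traversal starts from the global source tensor 10: op 0 is packed first,
    // and op 1 follows once its only source tensor (11) is marked done.
    const std::vector<OpPack> seq = g.build_operators_sequence(); // {op 0, op 1}
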
diff --git a/src/dynamic_fusion/utils/Utils.h b/src/dynamic_fusion/utils/Utils.h
index c9fc2c6..3f4a2ed 100644
--- a/src/dynamic_fusion/utils/Utils.h
+++ b/src/dynamic_fusion/utils/Utils.h
@@ -63,17 +63,21 @@
 
 /** Inline function to convert @ref Pool2dAttributes to PoolingLayerInfo
 */
-inline PoolingLayerInfo convert_pool_attr_to_pool_info(const Pool2dAttributes &pool_attr, bool mixed_precision = false, DataLayout data_layout = DataLayout::NHWC)
+inline PoolingLayerInfo convert_pool_attr_to_pool_info(const Pool2dAttributes &pool_attr,
+                                                       bool                    mixed_precision = false,
+                                                       DataLayout              data_layout     = DataLayout::NHWC)
 {
     // Create PadStrideInfo
     const Size2D        stride  = pool_attr.stride();
     const Padding2D     padding = pool_attr.pad();
-    const PadStrideInfo pad_stride(stride.x(), stride.y(), padding.left, padding.top, arm_compute::DimensionRoundingType::FLOOR);
+    const PadStrideInfo pad_stride(stride.x(), stride.y(), padding.left, padding.top,
+                                   arm_compute::DimensionRoundingType::FLOOR);
 
-    return PoolingLayerInfo(pool_attr.pool_type(), pool_attr.pool_size(), data_layout, pad_stride, pool_attr.exclude_padding(), mixed_precision);
+    return PoolingLayerInfo(pool_attr.pool_type(), pool_attr.pool_size(), data_layout, pad_stride,
+                            pool_attr.exclude_padding(), mixed_precision);
 }
-}
-}
-}
+} // namespace dynamic_fusion
+} // namespace experimental
+} // namespace arm_compute
 
 #endif /* SRC_DYNAMIC_FUSION_UTILS_UTILS */
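
Aside — a hedged call site for the helper above; attr is assumed to be an already-populated Pool2dAttributes, since its setters are not part of this hunk:

    // Reads stride/pad/pool_type/pool_size/exclude_padding from attr;
    // data_layout defaults to NHWC and padding is rounded with FLOOR.
    const PoolingLayerInfo info = convert_pool_attr_to_pool_info(attr, /* mixed_precision */ true);
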
diff --git a/src/gpu/cl/ClContext.cpp b/src/gpu/cl/ClContext.cpp
index d8ef18e..611c1cb 100644
--- a/src/gpu/cl/ClContext.cpp
+++ b/src/gpu/cl/ClContext.cpp
@@ -23,11 +23,11 @@
  */
 #include "src/gpu/cl/ClContext.h"
 
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+
 #include "src/gpu/cl/ClQueue.h"
 #include "src/gpu/cl/ClTensor.h"
 
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-
 namespace arm_compute
 {
 namespace gpu
@@ -41,7 +41,7 @@
     bool                 status = false;
     mlgo::MLGOHeuristics heuristics;
 
-    if(filename != nullptr)
+    if (filename != nullptr)
     {
         status = heuristics.reload_from_file(filename);
     }
@@ -50,12 +50,9 @@
 } // namespace
 
 ClContext::ClContext(const AclContextOptions *options)
-    : IContext(Target::GpuOcl),
-      _mlgo_heuristics(),
-      _cl_ctx(),
-      _cl_dev()
+    : IContext(Target::GpuOcl), _mlgo_heuristics(), _cl_ctx(), _cl_dev()
 {
-    if(options != nullptr)
+    if (options != nullptr)
     {
         _mlgo_heuristics = populate_mlgo(options->kernel_config_file);
     }
@@ -80,7 +77,7 @@
 
 bool ClContext::set_cl_ctx(::cl::Context ctx)
 {
-    if(this->refcount() == 0)
+    if (this->refcount() == 0)
     {
         _cl_ctx = ctx;
         CLScheduler::get().set_context(ctx);
@@ -92,7 +89,7 @@
 ITensorV2 *ClContext::create_tensor(const AclTensorDescriptor &desc, bool allocate)
 {
     ClTensor *tensor = new ClTensor(this, desc);
-    if(tensor != nullptr && allocate)
+    if (tensor != nullptr && allocate)
     {
         tensor->allocate();
     }
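
Aside — a hedged sketch of how populate_mlgo above is reached; kernel_config_file appears in this hunk, but its exact type and the path below are assumptions:

    AclContextOptions opts{};
    opts.kernel_config_file = "heuristics.mlgo"; // reload_from_file() is attempted on this path
    ClContext ctx(&opts); // presumably keeps default heuristics if the reload fails
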
diff --git a/src/gpu/cl/ClContext.h b/src/gpu/cl/ClContext.h
index a50b031..2c67ccf 100644
--- a/src/gpu/cl/ClContext.h
+++ b/src/gpu/cl/ClContext.h
@@ -24,11 +24,11 @@
 #ifndef SRC_GPU_CLCONTEXT_H
 #define SRC_GPU_CLCONTEXT_H
 
+#include "arm_compute/core/CL/OpenCL.h"
+
 #include "src/common/IContext.h"
 #include "src/runtime/CL/mlgo/MLGOHeuristics.h"
 
-#include "arm_compute/core/CL/OpenCL.h"
-
 namespace arm_compute
 {
 namespace gpu
@@ -74,9 +74,9 @@
     bool set_cl_ctx(::cl::Context ctx);
 
     // Inherited methods overridden
-    ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
-    IQueue *create_queue(const AclQueueOptions *options) override;
-    std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
+    ITensorV2                          *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
+    IQueue                             *create_queue(const AclQueueOptions *options) override;
+    std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor     &src,
                                                           const AclTensorDescriptor     &dst,
                                                           const AclActivationDescriptor &act,
                                                           bool                           is_validate) override;
@@ -90,4 +90,4 @@
 } // namespace gpu
 } // namespace arm_compute
 
-#endif /* SRC_GPU_CLCONTEXT_H */
\ No newline at end of file
+#endif /* SRC_GPU_CLCONTEXT_H */
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index 73bb962..bcade94 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -37,24 +37,16 @@
 namespace
 {
 /* Decoding table */
-constexpr std::array<uint8_t, 256> b64_invtab =
-{
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63,
-    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0,
-    0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
-    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0,
-    0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
-    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+constexpr std::array<uint8_t, 256> b64_invtab = {
+    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0,  0,  0,  0,  62, 0,  0,  0,  63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+    0,  0,  0,  0,  0,  0,  0,  0, 1, 2, 3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+    22, 23, 24, 25, 0,  0,  0,  0, 0, 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+    45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0, 0, 0, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 };
 
 /** Decode a base64 encoded string
@@ -68,13 +60,13 @@
     constexpr const char pad_char = '=';
 
     // Handle empty string
-    if(str.empty())
+    if (str.empty())
     {
         return {};
     }
 
     // Base64 encoded string has size multiple of 4
-    if(str.length() % 4)
+    if (str.length() % 4)
     {
         return {};
     }
@@ -92,7 +84,7 @@
     // Block decoding function (exclude padding)
     int       c   = 0;
     const int end = str_len - 4 - padding;
-    for(; c <= end; c += 4)
+    for (; c <= end; c += 4)
     {
         const int byte0 = b64_invtab[str[c]];
         const int byte1 = b64_invtab[str[c + 1]];
@@ -105,7 +97,7 @@
     }
 
     // Last step that might contain padding symbols
-    if(padding == 1)
+    if (padding == 1)
     {
         const int byte0 = b64_invtab[str[c]];
         const int byte1 = b64_invtab[str[c + 1]];
@@ -114,7 +106,7 @@
         dec_b64.push_back((byte0 << 2) | (byte1 >> 4));
         dec_b64.push_back((byte1 << 4) | (byte2 >> 2));
     }
-    else if(padding == 2)
+    else if (padding == 2)
     {
         const int byte0 = b64_invtab[str[c]];
         const int byte1 = b64_invtab[str[c + 1]];
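
Aside — a worked quartet for the decoder above, using values from the inverse table ('T' -> 19, 'Q' -> 16); a standalone check, not part of the patch:

    // "TQ==" carries one payload byte: (19 << 2) | (16 >> 4) == 76 | 1 == 77 == 'M'.
    static_assert(((19 << 2) | (16 >> 4)) == 'M', "\"TQ==\" decodes to \"M\"");
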
@@ -135,7 +127,7 @@
 {
     // Create and initialize decompression stream
     z_stream ds{};
-    if(inflateInit(&ds) != Z_OK)
+    if (inflateInit(&ds) != Z_OK)
     {
         return std::string();
     }
@@ -152,16 +144,15 @@
         ds.next_out  = reinterpret_cast<Bytef *>(roll_buff);
 
         status = inflate(&ds, 0);
-        if(inflated_str.size() < ds.total_out)
+        if (inflated_str.size() < ds.total_out)
         {
             inflated_str.append(roll_buff, ds.total_out - inflated_str.size());
         }
-    }
-    while(status == Z_OK);
+    } while (status == Z_OK);
 
     // Finalize decompression stream
     inflateEnd(&ds);
-    if(status != Z_STREAM_END)
+    if (status != Z_STREAM_END)
     {
         return std::string();
     }
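
Aside — the two helpers above presumably compose where kernel sources are embedded compressed; a hedged sketch (the variable names are made up, and whether a given build embeds compressed sources is configuration-dependent):

    const std::string decoded  = decode_base64(embedded_program); // handles '=' padding
    const std::string inflated = decompress_zlib(decoded);        // empty string on failure
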
@@ -175,323 +166,321 @@
 {
 namespace opencl
 {
-const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
-{
+const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map = {
     // Common Kernels
-    { "activation_layer", "common/activation_layer.cl" },
-    { "activation_layer_quant", "common/activation_layer_quant.cl" },
-    { "activation_layer_quant_f32", "common/activation_layer_quant.cl" },
-    { "arg_min_max_x", "common/arg_min_max.cl" },
-    { "arg_min_max_y", "common/arg_min_max.cl" },
-    { "arg_min_max_z", "common/arg_min_max.cl" },
-    { "arg_min_max_w", "common/arg_min_max.cl" },
-    { "bitwise_or", "common/bitwise_op.cl" },
-    { "bitwise_and", "common/bitwise_op.cl" },
-    { "bitwise_xor", "common/bitwise_op.cl" },
-    { "bitwise_not", "common/bitwise_op.cl" },
-    { "bounding_box_transform", "common/bounding_box_transform.cl" },
-    { "bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl" },
-    { "compare_equal", "common/comparisons.cl" },
-    { "compare_equal_quantized", "common/comparisons.cl" },
-    { "compare_notequal", "common/comparisons.cl" },
-    { "compare_notequal_quantized", "common/comparisons.cl" },
-    { "compare_greater", "common/comparisons.cl" },
-    { "compare_greater_quantized", "common/comparisons.cl" },
-    { "compare_greaterequal", "common/comparisons.cl" },
-    { "compare_greaterequal_quantized", "common/comparisons.cl" },
-    { "compare_less", "common/comparisons.cl" },
-    { "compare_less_quantized", "common/comparisons.cl" },
-    { "compare_lessequal", "common/comparisons.cl" },
-    { "compare_lessequal_quantized", "common/comparisons.cl" },
-    { "concatenate", "common/concatenate.cl" },
-    { "concatenate_width", "common/concatenate.cl" },
-    { "concatenate_height", "common/concatenate.cl" },
-    { "concatenate_width_x2", "common/concatenate.cl" },
-    { "concatenate_width_x4", "common/concatenate.cl" },
-    { "col2im", "common/col2im.cl" },
-    { "cast_down", "common/cast.cl" },
-    { "cast_up", "common/cast.cl" },
-    { "convert_fc_weights", "common/convert_fc_weights.cl" },
-    { "copy_tensor", "common/copy_tensor.cl" },
-    { "crop_tensor", "common/crop_tensor.cl" },
-    { "deconvolution_reshape", "common/deconvolution_layer.cl" },
-    { "deconvolution_upsample", "common/deconvolution_layer.cl" },
-    { "dequantization_layer", "common/dequantization_layer.cl" },
-    { "elementwise_operation_ADD", "common/elementwise_operation.cl" },
-    { "elementwise_operation_SUB", "common/elementwise_operation.cl" },
-    { "elementwise_operation_MAX", "common/elementwise_operation.cl" },
-    { "elementwise_operation_MIN", "common/elementwise_operation.cl" },
-    { "elementwise_operation_DIV", "common/elementwise_operation.cl" },
-    { "elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl" },
-    { "elementwise_operation_POWER", "common/elementwise_operation.cl" },
-    { "elementwise_operation_PRELU", "common/elementwise_operation.cl" },
-    { "elementwise_operation_AND", "common/elementwise_operation.cl" },
-    { "elementwise_operation_OR", "common/elementwise_operation.cl" },
-    { "elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl" },
-    { "elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl" },
-    { "elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl" },
-    { "elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl" },
-    { "elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl" },
-    { "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" },
-    { "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" },
-    { "elementwise_unary", "common/elementwise_unary.cl" },
-    { "elementwise_unary_quantized", "common/elementwise_unary_quantized.cl" },
-    { "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" },
-    { "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" },
-    { "fft_radix_2_first_stage_axis_0", "common/fft.cl" },
-    { "fft_radix_2_first_stage_axis_1", "common/fft.cl" },
-    { "fft_radix_2_axis_0", "common/fft.cl" },
-    { "fft_radix_2_axis_1", "common/fft.cl" },
-    { "fft_radix_3_first_stage_axis_0", "common/fft.cl" },
-    { "fft_radix_3_first_stage_axis_1", "common/fft.cl" },
-    { "fft_radix_3_axis_0", "common/fft.cl" },
-    { "fft_radix_3_axis_1", "common/fft.cl" },
-    { "fft_radix_4_first_stage_axis_0", "common/fft.cl" },
-    { "fft_radix_4_first_stage_axis_1", "common/fft.cl" },
-    { "fft_radix_4_axis_0", "common/fft.cl" },
-    { "fft_radix_4_axis_1", "common/fft.cl" },
-    { "fft_radix_5_first_stage_axis_0", "common/fft.cl" },
-    { "fft_radix_5_first_stage_axis_1", "common/fft.cl" },
-    { "fft_radix_5_axis_0", "common/fft.cl" },
-    { "fft_radix_5_axis_1", "common/fft.cl" },
-    { "fft_radix_7_first_stage_axis_0", "common/fft.cl" },
-    { "fft_radix_7_first_stage_axis_1", "common/fft.cl" },
-    { "fft_radix_7_axis_0", "common/fft.cl" },
-    { "fft_radix_7_axis_1", "common/fft.cl" },
-    { "fft_radix_8_first_stage_axis_0", "common/fft.cl" },
-    { "fft_radix_8_first_stage_axis_1", "common/fft.cl" },
-    { "fft_radix_8_axis_0", "common/fft.cl" },
-    { "fft_radix_8_axis_1", "common/fft.cl" },
-    { "fft_scale_conj", "common/fft_scale.cl" },
-    { "fill_image_borders_constant", "common/fill_border.cl" },
-    { "fill_image_borders_replicate", "common/fill_border.cl" },
-    { "floor_layer", "common/floor.cl" },
-    { "fuse_batchnormalization_layer", "common/batchnormalization_layer.cl" },
-    { "gather", "common/gather.cl" },
-    { "gemm_ma_f16", "common/gemm.cl" },
-    { "gemm_ma_f32", "common/gemm.cl" },
-    { "gemm_mv", "common/gemv.cl" },
-    { "gemm_mv_quantized", "common/gemv.cl" },
-    { "gemm_mm_native", "common/gemm.cl" },
-    { "gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl" },
-    { "gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl" },
-    { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" },
-    { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" },
-    { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" },
-    { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" },
-    { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" },
-    { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" },
-    { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" },
-    { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" },
-    { "gemm_lc_vm_f32", "common/gemm.cl" },
-    { "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" },
-    { "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" },
-    { "gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl" },
-    { "gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl" },
-    { "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" },
-    { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" },
-    { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" },
-    { "gemmlowp_mm_native", "common/gemmlowp.cl" },
-    { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" },
-    { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" },
-    { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" },
-    { "gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl" },
-    { "gemmlowp_offset_contribution", "common/gemmlowp.cl" },
-    { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" },
-    { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" },
-    { "gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl" },
-    { "gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl" },
-    { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl" },
-    { "gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl" },
-    { "generate_proposals_compute_all_anchors", "common/generate_proposals.cl" },
-    { "generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl" },
-    { "instance_normalization", "common/instance_normalization.cl" },
-    { "compute_mean_var", "common/instance_normalization.cl" },
-    { "l2_normalize_x", "common/l2_normalize.cl" },
-    { "l2_normalize_y", "common/l2_normalize.cl" },
-    { "l2_normalize_z", "common/l2_normalize.cl" },
-    { "mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl" },
-    { "mat_mul_native_mmul_t_nt", "common/mat_mul_mmul.cl" },
-    { "mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl" },
-    { "mat_mul_native_mmul_t_t", "common/mat_mul_mmul.cl" },
-    { "mat_mul_native_nt_nt", "common/mat_mul.cl" },
-    { "mat_mul_native_nt_t", "common/mat_mul.cl" },
-    { "mat_mul_native_t_nt", "common/mat_mul.cl" },
-    { "mat_mul_native_t_t", "common/mat_mul.cl" },
-    { "mat_mul_native_quantized_nt_nt", "common/mat_mul_quantized.cl" },
-    { "mat_mul_native_quantized_nt_t", "common/mat_mul_quantized.cl" },
-    { "mat_mul_native_quantized_t_nt", "common/mat_mul_quantized.cl" },
-    { "mat_mul_native_quantized_t_t", "common/mat_mul_quantized.cl" },
-    { "mat_mul_native_quantized_mmul_nt_nt", "common/mat_mul_quantized_mmul.cl" },
-    { "mat_mul_native_quantized_mmul_nt_t", "common/mat_mul_quantized_mmul.cl" },
-    { "mat_mul_native_quantized_mmul_t_nt", "common/mat_mul_quantized_mmul.cl" },
-    { "mat_mul_native_quantized_mmul_t_t", "common/mat_mul_quantized_mmul.cl" },
-    { "max_unpooling_layer_2", "common/unpooling_layer.cl" },
-    { "mean_stddev_normalization", "common/mean_stddev_normalization.cl" },
-    { "memset", "common/memset.cl" },
-    { "minmax_layer", "common/minmax_layer.cl" },
-    { "non_max_suppression", "common/nonmax.cl" },
-    { "pad_layer_constant", "common/pad_layer.cl" },
-    { "pad_layer_symmetric_reflect", "common/pad_layer.cl" },
-    { "permute", "common/permute.cl" },
-    { "pixelwise_mul_complex", "common/pixelwise_mul_float.cl" },
-    { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" },
-    { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" },
-    { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" },
-    { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" },
-    { "quantization_layer", "common/quantization_layer.cl" },
-    { "range", "common/range.cl" },
-    { "range_quantized", "common/range.cl" },
-    { "reduction_operation_x", "common/reduction_operation.cl" },
-    { "reduction_operation_non_parallel_x", "common/reduction_operation.cl" },
-    { "reduction_operation_y", "common/reduction_operation.cl" },
-    { "reduction_operation_z", "common/reduction_operation.cl" },
-    { "reduction_operation_w", "common/reduction_operation.cl" },
-    { "reshape_layer", "common/reshape_layer.cl" },
-    { "reshape_to_columns", "common/convolution_layer.cl" },
-    { "reverse", "common/reverse.cl" },
-    { "roi_align_layer", "common/roi_align_layer.cl" },
-    { "roi_align_layer_quantized", "common/roi_align_layer_quantized.cl" },
-    { "roi_pooling_layer", "common/roi_pooling_layer.cl" },
-    { "select_same_rank", "common/select.cl" },
-    { "select_different_rank_2", "common/select.cl" },
-    { "select_different_rank_n", "common/select.cl" },
-    { "softmax_layer_norm", "common/softmax_layer.cl" },
-    { "softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl" },
-    { "softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl" },
-    { "softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl" },
-    { "softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl" },
-    { "softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl" },
-    { "stack_layer", "common/stack_layer.cl" },
-    { "strided_slice", "common/slice_ops.cl" },
-    { "tile", "common/tile.cl" },
-    { "transpose", "common/transpose.cl" },
+    {"activation_layer", "common/activation_layer.cl"},
+    {"activation_layer_quant", "common/activation_layer_quant.cl"},
+    {"activation_layer_quant_f32", "common/activation_layer_quant.cl"},
+    {"arg_min_max_x", "common/arg_min_max.cl"},
+    {"arg_min_max_y", "common/arg_min_max.cl"},
+    {"arg_min_max_z", "common/arg_min_max.cl"},
+    {"arg_min_max_w", "common/arg_min_max.cl"},
+    {"bitwise_or", "common/bitwise_op.cl"},
+    {"bitwise_and", "common/bitwise_op.cl"},
+    {"bitwise_xor", "common/bitwise_op.cl"},
+    {"bitwise_not", "common/bitwise_op.cl"},
+    {"bounding_box_transform", "common/bounding_box_transform.cl"},
+    {"bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl"},
+    {"compare_equal", "common/comparisons.cl"},
+    {"compare_equal_quantized", "common/comparisons.cl"},
+    {"compare_notequal", "common/comparisons.cl"},
+    {"compare_notequal_quantized", "common/comparisons.cl"},
+    {"compare_greater", "common/comparisons.cl"},
+    {"compare_greater_quantized", "common/comparisons.cl"},
+    {"compare_greaterequal", "common/comparisons.cl"},
+    {"compare_greaterequal_quantized", "common/comparisons.cl"},
+    {"compare_less", "common/comparisons.cl"},
+    {"compare_less_quantized", "common/comparisons.cl"},
+    {"compare_lessequal", "common/comparisons.cl"},
+    {"compare_lessequal_quantized", "common/comparisons.cl"},
+    {"concatenate", "common/concatenate.cl"},
+    {"concatenate_width", "common/concatenate.cl"},
+    {"concatenate_height", "common/concatenate.cl"},
+    {"concatenate_width_x2", "common/concatenate.cl"},
+    {"concatenate_width_x4", "common/concatenate.cl"},
+    {"col2im", "common/col2im.cl"},
+    {"cast_down", "common/cast.cl"},
+    {"cast_up", "common/cast.cl"},
+    {"convert_fc_weights", "common/convert_fc_weights.cl"},
+    {"copy_tensor", "common/copy_tensor.cl"},
+    {"crop_tensor", "common/crop_tensor.cl"},
+    {"deconvolution_reshape", "common/deconvolution_layer.cl"},
+    {"deconvolution_upsample", "common/deconvolution_layer.cl"},
+    {"dequantization_layer", "common/dequantization_layer.cl"},
+    {"elementwise_operation_ADD", "common/elementwise_operation.cl"},
+    {"elementwise_operation_SUB", "common/elementwise_operation.cl"},
+    {"elementwise_operation_MAX", "common/elementwise_operation.cl"},
+    {"elementwise_operation_MIN", "common/elementwise_operation.cl"},
+    {"elementwise_operation_DIV", "common/elementwise_operation.cl"},
+    {"elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl"},
+    {"elementwise_operation_POWER", "common/elementwise_operation.cl"},
+    {"elementwise_operation_PRELU", "common/elementwise_operation.cl"},
+    {"elementwise_operation_AND", "common/elementwise_operation.cl"},
+    {"elementwise_operation_OR", "common/elementwise_operation.cl"},
+    {"elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl"},
+    {"elementwise_unary", "common/elementwise_unary.cl"},
+    {"elementwise_unary_quantized", "common/elementwise_unary_quantized.cl"},
+    {"fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl"},
+    {"fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl"},
+    {"fft_radix_2_first_stage_axis_0", "common/fft.cl"},
+    {"fft_radix_2_first_stage_axis_1", "common/fft.cl"},
+    {"fft_radix_2_axis_0", "common/fft.cl"},
+    {"fft_radix_2_axis_1", "common/fft.cl"},
+    {"fft_radix_3_first_stage_axis_0", "common/fft.cl"},
+    {"fft_radix_3_first_stage_axis_1", "common/fft.cl"},
+    {"fft_radix_3_axis_0", "common/fft.cl"},
+    {"fft_radix_3_axis_1", "common/fft.cl"},
+    {"fft_radix_4_first_stage_axis_0", "common/fft.cl"},
+    {"fft_radix_4_first_stage_axis_1", "common/fft.cl"},
+    {"fft_radix_4_axis_0", "common/fft.cl"},
+    {"fft_radix_4_axis_1", "common/fft.cl"},
+    {"fft_radix_5_first_stage_axis_0", "common/fft.cl"},
+    {"fft_radix_5_first_stage_axis_1", "common/fft.cl"},
+    {"fft_radix_5_axis_0", "common/fft.cl"},
+    {"fft_radix_5_axis_1", "common/fft.cl"},
+    {"fft_radix_7_first_stage_axis_0", "common/fft.cl"},
+    {"fft_radix_7_first_stage_axis_1", "common/fft.cl"},
+    {"fft_radix_7_axis_0", "common/fft.cl"},
+    {"fft_radix_7_axis_1", "common/fft.cl"},
+    {"fft_radix_8_first_stage_axis_0", "common/fft.cl"},
+    {"fft_radix_8_first_stage_axis_1", "common/fft.cl"},
+    {"fft_radix_8_axis_0", "common/fft.cl"},
+    {"fft_radix_8_axis_1", "common/fft.cl"},
+    {"fft_scale_conj", "common/fft_scale.cl"},
+    {"fill_image_borders_constant", "common/fill_border.cl"},
+    {"fill_image_borders_replicate", "common/fill_border.cl"},
+    {"floor_layer", "common/floor.cl"},
+    {"fuse_batchnormalization_layer", "common/batchnormalization_layer.cl"},
+    {"gather", "common/gather.cl"},
+    {"gemm_ma_f16", "common/gemm.cl"},
+    {"gemm_ma_f32", "common/gemm.cl"},
+    {"gemm_mv", "common/gemv.cl"},
+    {"gemm_mv_quantized", "common/gemv.cl"},
+    {"gemm_mm_native", "common/gemm.cl"},
+    {"gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl"},
+    {"gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl"},
+    {"gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl"},
+    {"gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl"},
+    {"gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl"},
+    {"gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl"},
+    {"gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl"},
+    {"gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl"},
+    {"gemm_mm_reshaped_only_rhs_t", "common/gemm.cl"},
+    {"gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl"},
+    {"gemm_lc_vm_f32", "common/gemm.cl"},
+    {"gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl"},
+    {"gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl"},
+    {"gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl"},
+    {"gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl"},
+    {"gemmlowp_matrix_a_reduction", "common/gemmlowp.cl"},
+    {"gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl"},
+    {"gemmlowp_matrix_b_reduction", "common/gemmlowp.cl"},
+    {"gemmlowp_mm_native", "common/gemmlowp.cl"},
+    {"gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl"},
+    {"gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl"},
+    {"gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl"},
+    {"gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl"},
+    {"gemmlowp_offset_contribution", "common/gemmlowp.cl"},
+    {"gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl"},
+    {"gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl"},
+    {"gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl"},
+    {"gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl"},
+    {"gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl"},
+    {"gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl"},
+    {"generate_proposals_compute_all_anchors", "common/generate_proposals.cl"},
+    {"generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl"},
+    {"instance_normalization", "common/instance_normalization.cl"},
+    {"compute_mean_var", "common/instance_normalization.cl"},
+    {"l2_normalize_x", "common/l2_normalize.cl"},
+    {"l2_normalize_y", "common/l2_normalize.cl"},
+    {"l2_normalize_z", "common/l2_normalize.cl"},
+    {"mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl"},
+    {"mat_mul_native_mmul_t_nt", "common/mat_mul_mmul.cl"},
+    {"mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl"},
+    {"mat_mul_native_mmul_t_t", "common/mat_mul_mmul.cl"},
+    {"mat_mul_native_nt_nt", "common/mat_mul.cl"},
+    {"mat_mul_native_nt_t", "common/mat_mul.cl"},
+    {"mat_mul_native_t_nt", "common/mat_mul.cl"},
+    {"mat_mul_native_t_t", "common/mat_mul.cl"},
+    {"mat_mul_native_quantized_nt_nt", "common/mat_mul_quantized.cl"},
+    {"mat_mul_native_quantized_nt_t", "common/mat_mul_quantized.cl"},
+    {"mat_mul_native_quantized_t_nt", "common/mat_mul_quantized.cl"},
+    {"mat_mul_native_quantized_t_t", "common/mat_mul_quantized.cl"},
+    {"mat_mul_native_quantized_mmul_nt_nt", "common/mat_mul_quantized_mmul.cl"},
+    {"mat_mul_native_quantized_mmul_nt_t", "common/mat_mul_quantized_mmul.cl"},
+    {"mat_mul_native_quantized_mmul_t_nt", "common/mat_mul_quantized_mmul.cl"},
+    {"mat_mul_native_quantized_mmul_t_t", "common/mat_mul_quantized_mmul.cl"},
+    {"max_unpooling_layer_2", "common/unpooling_layer.cl"},
+    {"mean_stddev_normalization", "common/mean_stddev_normalization.cl"},
+    {"memset", "common/memset.cl"},
+    {"minmax_layer", "common/minmax_layer.cl"},
+    {"non_max_suppression", "common/nonmax.cl"},
+    {"pad_layer_constant", "common/pad_layer.cl"},
+    {"pad_layer_symmetric_reflect", "common/pad_layer.cl"},
+    {"permute", "common/permute.cl"},
+    {"pixelwise_mul_complex", "common/pixelwise_mul_float.cl"},
+    {"pixelwise_mul_float", "common/pixelwise_mul_float.cl"},
+    {"pixelwise_mul_int", "common/pixelwise_mul_int.cl"},
+    {"pixelwise_mul_quantized", "common/pixelwise_mul_int.cl"},
+    {"qlstm_layer_normalization", "common/qlstm_layer_normalization.cl"},
+    {"quantization_layer", "common/quantization_layer.cl"},
+    {"range", "common/range.cl"},
+    {"range_quantized", "common/range.cl"},
+    {"reduction_operation_x", "common/reduction_operation.cl"},
+    {"reduction_operation_non_parallel_x", "common/reduction_operation.cl"},
+    {"reduction_operation_y", "common/reduction_operation.cl"},
+    {"reduction_operation_z", "common/reduction_operation.cl"},
+    {"reduction_operation_w", "common/reduction_operation.cl"},
+    {"reshape_layer", "common/reshape_layer.cl"},
+    {"reshape_to_columns", "common/convolution_layer.cl"},
+    {"reverse", "common/reverse.cl"},
+    {"roi_align_layer", "common/roi_align_layer.cl"},
+    {"roi_align_layer_quantized", "common/roi_align_layer_quantized.cl"},
+    {"roi_pooling_layer", "common/roi_pooling_layer.cl"},
+    {"select_same_rank", "common/select.cl"},
+    {"select_different_rank_2", "common/select.cl"},
+    {"select_different_rank_n", "common/select.cl"},
+    {"softmax_layer_norm", "common/softmax_layer.cl"},
+    {"softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl"},
+    {"softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl"},
+    {"softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl"},
+    {"softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl"},
+    {"softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl"},
+    {"stack_layer", "common/stack_layer.cl"},
+    {"strided_slice", "common/slice_ops.cl"},
+    {"tile", "common/tile.cl"},
+    {"transpose", "common/transpose.cl"},
 #ifdef ENABLE_NCHW_KERNELS
-    { "batch_to_space_nchw", "nchw/batch_to_space.cl" },
-    { "batch_to_space_static_nchw", "nchw/batch_to_space.cl" },
-    { "batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl" },
-    { "channel_shuffle_nchw", "nchw/channel_shuffle.cl" },
-    { "depth_to_space_nchw", "nchw/depth_to_space.cl" },
-    { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" },
-    { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" },
-    { "direct_convolution_nchw", "nchw/direct_convolution.cl" },
+    {"batch_to_space_nchw", "nchw/batch_to_space.cl"},
+    {"batch_to_space_static_nchw", "nchw/batch_to_space.cl"},
+    {"batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl"},
+    {"channel_shuffle_nchw", "nchw/channel_shuffle.cl"},
+    {"depth_to_space_nchw", "nchw/depth_to_space.cl"},
+    {"dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl"},
+    {"direct_convolution1x1", "nchw/direct_convolution1x1.cl"},
+    {"direct_convolution_nchw", "nchw/direct_convolution.cl"},
 
-    { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" },
-    { "im2col3x3_nchw", "nchw/im2col.cl" },
-    { "im2col5x5_nchw", "nchw/im2col.cl" },
-    { "im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl" },
-    { "im2col_generic_nchw", "nchw/im2col.cl" },
-    { "im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl" },
-    { "normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl" },
-    { "normalization_layer_in_map_nchw", "nchw/normalization_layer.cl" },
-    { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" },
-    { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" },
-    { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" },
-    { "pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl" },
-    { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" },
-    { "reorg_layer_nchw", "nchw/reorg_layer.cl" },
-    { "scale_nearest_neighbour_nchw", "nchw/scale.cl" },
-    { "scale_bilinear_nchw", "nchw/scale.cl" },
-    { "space_to_batch_nchw", "nchw/space_to_batch.cl" },
-    { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" },
-    { "space_to_depth_nchw", "nchw/space_to_depth.cl" },
-    { "upsample_layer_nchw", "nchw/upsample_layer.cl" },
-    { "winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl" },
-    { "winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
-    { "winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl" },
-    { "winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
-    { "winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl" },
-    { "winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
-    { "winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl" },
-    { "winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
-    { "winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
-    { "winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
-    { "winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl" },
-    { "winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
-    { "winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl" },
-    { "winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl" },
-    { "winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl" },
-    { "winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl" },
-    { "winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl" },
-    { "winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl" },
-    { "winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl" },
-    { "winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl" },
-    { "winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl" },
-    { "winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl" },
+    {"im2col1x1_stridex1_nchw", "nchw/im2col.cl"},
+    {"im2col3x3_nchw", "nchw/im2col.cl"},
+    {"im2col5x5_nchw", "nchw/im2col.cl"},
+    {"im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl"},
+    {"im2col_generic_nchw", "nchw/im2col.cl"},
+    {"im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl"},
+    {"normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl"},
+    {"normalization_layer_in_map_nchw", "nchw/normalization_layer.cl"},
+    {"normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl"},
+    {"normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl"},
+    {"pooling_layer_MxN_nchw", "nchw/pooling_layer.cl"},
+    {"pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl"},
+    {"prior_box_layer_nchw", "nchw/prior_box_layer.cl"},
+    {"reorg_layer_nchw", "nchw/reorg_layer.cl"},
+    {"scale_nearest_neighbour_nchw", "nchw/scale.cl"},
+    {"scale_bilinear_nchw", "nchw/scale.cl"},
+    {"space_to_batch_nchw", "nchw/space_to_batch.cl"},
+    {"space_to_batch_static_nchw", "nchw/space_to_batch.cl"},
+    {"space_to_depth_nchw", "nchw/space_to_depth.cl"},
+    {"upsample_layer_nchw", "nchw/upsample_layer.cl"},
+    {"winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl"},
+    {"winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl"},
+    {"winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl"},
+    {"winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl"},
 #endif /* ENABLE_NCHW_KERNELS */
 #ifdef ENABLE_NHWC_KERNELS
-    { "batch_to_space_nhwc", "nhwc/batch_to_space.cl" },
-    { "batch_to_space_static_nhwc", "nhwc/batch_to_space.cl" },
-    { "batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl" },
-    { "channel_shuffle_nhwc", "nhwc/channel_shuffle.cl" },
-    { "depth_to_space_nhwc", "nhwc/depth_to_space.cl" },
-    { "dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl" },
-    { "dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl" },
-    { "dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl" },
-    { "direct_convolution_nhwc", "nhwc/direct_convolution.cl" },
-    { "direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl" },
-    { "im2col3x3_nhwc", "nhwc/im2col.cl" },
-    { "im2col9x9_nhwc", "nhwc/im2col.cl" },
-    { "im2col_generic_nhwc", "nhwc/im2col.cl" },
-    { "indirect_convolution_nhwc", "nhwc/indirect_convolution.cl" },
-    { "indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl" },
-    { "normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl" },
-    { "normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl" },
-    { "normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl" },
-    { "normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl" },
-    { "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" },
-    { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" },
-    { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" },
-    { "pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl" },
-    { "pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl" },
-    { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" },
-    { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" },
-    { "scale_bilinear_nhwc", "nhwc/scale.cl" },
-    { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" },
-    { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" },
-    { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" },
-    { "transposed_convolution_nhwc", "nhwc/transposed_convolution.cl" },
-    { "upsample_layer_nhwc", "nhwc/upsample_layer.cl" },
-    { "winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl" },
-    { "winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl" },
-    { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
-    { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
-    { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
-    { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
-    { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
-    { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
-    { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
-    { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
-    { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
-    { "winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl" },
-    { "winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl" },
-    { "winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl" },
-    { "winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl" },
-    { "winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl" },
-    { "winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl" },
-    { "winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl" },
-    { "winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl" },
-    { "winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl" },
+    {"batch_to_space_nhwc", "nhwc/batch_to_space.cl"},
+    {"batch_to_space_static_nhwc", "nhwc/batch_to_space.cl"},
+    {"batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl"},
+    {"channel_shuffle_nhwc", "nhwc/channel_shuffle.cl"},
+    {"depth_to_space_nhwc", "nhwc/depth_to_space.cl"},
+    {"dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl"},
+    {"dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl"},
+    {"dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl"},
+    {"direct_convolution_nhwc", "nhwc/direct_convolution.cl"},
+    {"direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl"},
+    {"im2col3x3_nhwc", "nhwc/im2col.cl"},
+    {"im2col9x9_nhwc", "nhwc/im2col.cl"},
+    {"im2col_generic_nhwc", "nhwc/im2col.cl"},
+    {"indirect_convolution_nhwc", "nhwc/indirect_convolution.cl"},
+    {"indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl"},
+    {"normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl"},
+    {"normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl"},
+    {"normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl"},
+    {"normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl"},
+    {"pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl"},
+    {"pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl"},
+    {"pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl"},
+    {"pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl"},
+    {"pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl"},
+    {"reorg_layer_nhwc", "nhwc/reorg_layer.cl"},
+    {"scale_nearest_neighbour_nhwc", "nhwc/scale.cl"},
+    {"scale_bilinear_nhwc", "nhwc/scale.cl"},
+    {"space_to_batch_nhwc", "nhwc/space_to_batch.cl"},
+    {"space_to_batch_static_nhwc", "nhwc/space_to_batch.cl"},
+    {"space_to_depth_nhwc", "nhwc/space_to_depth.cl"},
+    {"transposed_convolution_nhwc", "nhwc/transposed_convolution.cl"},
+    {"upsample_layer_nhwc", "nhwc/upsample_layer.cl"},
+    {"winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl"},
+    {"winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"},
+    {"winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl"},
+    {"winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl"},
 #endif /* ENABLE_NHWC_KERNELS */
 };
 
-const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
-{
+const std::map<std::string, std::string> ClKernelLibrary::_program_source_map = {
 #ifdef EMBEDDED_KERNELS
     {
         "activation_float_helpers.h",
@@ -996,7 +985,7 @@
     // Find which program contains the kernel
     auto kernel_program_it = _kernel_program_map.find(kernel_name);
 
-    if(_kernel_program_map.end() == kernel_program_it)
+    if (_kernel_program_map.end() == kernel_program_it)
     {
         ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str());
     }
@@ -1022,14 +1011,14 @@
 #ifdef EMBEDDED_KERNELS
 #ifdef ARM_COMPUTE_COMPRESSED_KERNELS
     const auto inflatted_program_source_it = _decompressed_source_map.find(program_name);
-    if(inflatted_program_source_it != _decompressed_source_map.end())
+    if (inflatted_program_source_it != _decompressed_source_map.end())
     {
-        return ClProgramInfo{ inflatted_program_source_it->second, false };
+        return ClProgramInfo{inflatted_program_source_it->second, false};
     }
 #endif /* ARM_COMPUTE_COMPRESSED_KERNELS */
 
     const auto program_source_it = _program_source_map.find(program_name);
-    if(program_source_it == _program_source_map.end())
+    if (program_source_it == _program_source_map.end())
     {
         ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str());
     }
@@ -1042,7 +1031,7 @@
     program_source = std::move(decompressed_program_source);
 #endif /* ARM_COMPUTE_COMPRESSED_KERNELS */
 
-    return ClProgramInfo{ program_source, false };
+    return ClProgramInfo{program_source, false};
 #else  /* EMBEDDED_KERNELS */
     // Check for binary
     std::string source_name = _kernel_path + program_name;
@@ -1050,12 +1039,12 @@
     std::string program_source{};
     bool        is_binary = false;
 
-    if(std::ifstream(binary_name).is_open())
+    if (std::ifstream(binary_name).is_open())
     {
         program_source = read_file(binary_name, true);
         is_binary      = true;
     }
-    else if(std::ifstream(source_name).is_open())
+    else if (std::ifstream(source_name).is_open())
     {
         program_source = read_file(source_name, false);
     }
@@ -1064,7 +1053,7 @@
         ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str());
     }
 
-    return ClProgramInfo{ program_source, is_binary };
+    return ClProgramInfo{program_source, is_binary};
 #endif /* EMBEDDED_KERNELS */
 }
 } // namespace opencl
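
Note for reviewers: the hunks above are typical of the whole change. Control-flow
keywords gain a space before the parenthesis ("if (", "switch (", "while ("),
braced initializer lists lose their inner padding ("{AclCpu, AclGpuOcl}"), and
long lines are rewrapped. As a rough guide, here is a sketch of .clang-format
options that would reproduce this spacing; the values are inferred from the diff
itself and are not the repository's actual configuration file.

    # Sketch only: option values inferred from the hunks above.
    Language: Cpp
    ColumnLimit: 120                           # rewrapped lines fit ~120 columns
    BreakBeforeBraces: Allman                  # opening braces on their own line
    SpaceBeforeParens: ControlStatements       # 'if (', 'switch (', but 'foo(' for calls
    Cpp11BracedListStyle: true                 # '{AclCpu, AclGpuOcl}' without inner spaces
    PointerAlignment: Right                    # 'ITensorInfo *src'
    AlignConsecutiveDeclarations: Consecutive  # type/name columns stay aligned
    AlignConsecutiveAssignments: Consecutive   # 'is_binary      = true' style
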
diff --git a/src/gpu/cl/ClKernelLibrary.h b/src/gpu/cl/ClKernelLibrary.h
index 42bec95..cd1d689 100644
--- a/src/gpu/cl/ClKernelLibrary.h
+++ b/src/gpu/cl/ClKernelLibrary.h
@@ -52,8 +52,8 @@
     /** Structure to encapsulate program related information */
     struct ClProgramInfo
     {
-        std::string program{};          /**< Program raw string */
-        bool        is_binary{ false }; /**< Flag that indicates if is in binary format */
+        std::string program{};        /**< Program raw string */
+        bool        is_binary{false}; /**< Flag that indicates if is in binary format */
     };
 
 public:
@@ -84,10 +84,12 @@
     std::string program_name(const std::string &kernel_name) const;
 
 private:
-    std::string _kernel_path{};                                                 /**< Path to the kernels folder. */
-    mutable std::map<std::string, std::string>      _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */
-    static const std::map<std::string, std::string> _kernel_program_map;        /**< Map that associates kernel names with programs. */
-    static const std::map<std::string, std::string> _program_source_map;        /**< Contains sources for all programs.
+    std::string _kernel_path{}; /**< Path to the kernels folder. */
+    mutable std::map<std::string, std::string>
+        _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */
+    static const std::map<std::string, std::string>
+        _kernel_program_map; /**< Map that associates kernel names with programs. */
+    static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
                                                                                      Used for compile-time kernel inclusion. >*/
 };
 } // namespace opencl
diff --git a/src/gpu/cl/ClQueue.cpp b/src/gpu/cl/ClQueue.cpp
index 2123adc..0cb7af5 100644
--- a/src/gpu/cl/ClQueue.cpp
+++ b/src/gpu/cl/ClQueue.cpp
@@ -36,7 +36,7 @@
 {
 CLTunerMode map_tuner_mode(AclTuningMode mode)
 {
-    switch(mode)
+    switch (mode)
     {
         case AclRapid:
             return CLTunerMode::RAPID;
@@ -55,7 +55,7 @@
 
 std::unique_ptr<CLTuner> populate_tuner(const AclQueueOptions *options)
 {
-    if(options == nullptr || options->mode == AclTuningModeNone)
+    if (options == nullptr || options->mode == AclTuningModeNone)
     {
         return nullptr;
     }
@@ -68,8 +68,7 @@
 }
 } // namespace
 
-ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options)
-    : IQueue(ctx), _tuner(nullptr)
+ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx), _tuner(nullptr)
 {
     _tuner = populate_tuner(options);
 }
diff --git a/src/gpu/cl/ClQueue.h b/src/gpu/cl/ClQueue.h
index b16a0f4..09ffb06 100644
--- a/src/gpu/cl/ClQueue.h
+++ b/src/gpu/cl/ClQueue.h
@@ -24,10 +24,10 @@
 #ifndef SRC_GPU_CLQUEUE_H
 #define SRC_GPU_CLQUEUE_H
 
-#include "src/common/IQueue.h"
-
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "src/common/IQueue.h"
+
 #include <memory>
 
 namespace arm_compute
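
This header, like ClTensor.h and ClActivationKernel.cpp further down, has its
include block resorted: project-local src/ and support/ headers now follow the
public arm_compute/ ones, blocks are regrouped, and ordering ignores case
(utils/helpers/AdjustVecSize.h now precedes utils/StringUtils.h). A hedged
sketch of include-sorting options that would produce this grouping; the regexes
and priorities are guesses, not the shipped configuration:

    SortIncludes: CaseInsensitive   # 'helpers/...' sorts before 'StringUtils.h'
    IncludeBlocks: Regroup          # merge include blocks, then split by category
    IncludeCategories:              # priorities here are illustrative assumptions
      - Regex: '^"arm_compute/'     # public API headers first
        Priority: 1
      - Regex: '^"(src|support)/'   # internal headers next
        Priority: 2
      - Regex: '^<'                 # system/standard headers last
        Priority: 3
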
diff --git a/src/gpu/cl/ClTensor.cpp b/src/gpu/cl/ClTensor.cpp
index 0df0781..27422a4 100644
--- a/src/gpu/cl/ClTensor.cpp
+++ b/src/gpu/cl/ClTensor.cpp
@@ -31,8 +31,7 @@
 {
 namespace opencl
 {
-ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc)
-    : ITensorV2(ctx), _legacy_tensor()
+ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor()
 {
     ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::GpuOcl));
     _legacy_tensor = std::make_unique<CLTensor>();
@@ -43,7 +42,7 @@
 {
     ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr);
 
-    if(_legacy_tensor == nullptr)
+    if (_legacy_tensor == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:map]: Backing tensor does not exist!");
         return nullptr;
@@ -57,7 +56,7 @@
 {
     ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr);
 
-    if(_legacy_tensor == nullptr)
+    if (_legacy_tensor == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:unmap]: Backing tensor does not exist!");
         return StatusCode::RuntimeError;
diff --git a/src/gpu/cl/ClTensor.h b/src/gpu/cl/ClTensor.h
index 99d228c..70184cd 100644
--- a/src/gpu/cl/ClTensor.h
+++ b/src/gpu/cl/ClTensor.h
@@ -24,10 +24,10 @@
 #ifndef SRC_GPU_CLTENSOR_H
 #define SRC_GPU_CLTENSOR_H
 
-#include "src/common/ITensorV2.h"
-
 #include "arm_compute/runtime/CL/CLTensor.h"
 
+#include "src/common/ITensorV2.h"
+
 namespace arm_compute
 {
 namespace gpu
@@ -54,7 +54,7 @@
     void                 *map() override;
     StatusCode            unmap() override;
     arm_compute::ITensor *tensor() const override;
-    StatusCode import(void *handle, ImportMemoryType type) override;
+    StatusCode            import(void *handle, ImportMemoryType type) override;
 
 private:
     std::unique_ptr<CLTensor> _legacy_tensor;
@@ -63,4 +63,4 @@
 } // namespace gpu
 } // namespace arm_compute
 
-#endif /* SRC_GPU_CLTENSOR_H */
\ No newline at end of file
+#endif /* SRC_GPU_CLTENSOR_H */
diff --git a/src/gpu/cl/IClKernel.h b/src/gpu/cl/IClKernel.h
index 52ea3c9..4f07e9a 100644
--- a/src/gpu/cl/IClKernel.h
+++ b/src/gpu/cl/IClKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_ICL_KERNEL_H
 
 #include "arm_compute/core/ITensorInfo.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClActivationKernel.cpp b/src/gpu/cl/kernels/ClActivationKernel.cpp
index ab15437..a85296f 100644
--- a/src/gpu/cl/kernels/ClActivationKernel.cpp
+++ b/src/gpu/cl/kernels/ClActivationKernel.cpp
@@ -28,14 +28,14 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
-
 #include "support/StringSupport.h"
 
 #include <set>
@@ -51,36 +51,47 @@
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM16, DataType::F16, DataType::F32);
 
-    static std::set<ActivationLayerInfo::ActivationFunction> quantized_supported_activations =
-    {
-        ActivationLayerInfo::ActivationFunction::RELU,
-        ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
-        ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-        ActivationLayerInfo::ActivationFunction::LOGISTIC,
-        ActivationLayerInfo::ActivationFunction::TANH,
-        ActivationLayerInfo::ActivationFunction::HARD_SWISH,
+    static std::set<ActivationLayerInfo::ActivationFunction> quantized_supported_activations = {
+        ActivationLayerInfo::ActivationFunction::RELU,         ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+        ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, ActivationLayerInfo::ActivationFunction::LOGISTIC,
+        ActivationLayerInfo::ActivationFunction::TANH,         ActivationLayerInfo::ActivationFunction::HARD_SWISH,
         ActivationLayerInfo::ActivationFunction::LEAKY_RELU,
     };
-    const DataType                                data_type = src->data_type();
-    const QuantizationInfo                       &oq_info   = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
-    const ActivationLayerInfo::ActivationFunction f_act     = act_info.activation();
+    const DataType          data_type = src->data_type();
+    const QuantizationInfo &oq_info   = (dst != nullptr) ? dst->quantization_info() : src->quantization_info();
+    const ActivationLayerInfo::ActivationFunction f_act = act_info.activation();
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && (quantized_supported_activations.count(f_act) == 0),
-                                    "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) &&
+                                        (quantized_supported_activations.count(f_act) == 0),
+                                    "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and "
+                                    "lower/upper bounded relu are supported");
 
-    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128)));
-    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+                                (oq_info != QuantizationInfo(1.f / 128.f, 128)));
+    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+                                (oq_info != QuantizationInfo(1.f / 256.f, 0)));
 
-    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
-    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+                                (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+                                (oq_info != QuantizationInfo(1.f / 32768.f, 0)));
 
-    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0)));
-    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128)));
+    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::TANH) &&
+                                (oq_info != QuantizationInfo(1.f / 128.f, 0)));
+    ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED &&
+                                (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) &&
+                                (oq_info != QuantizationInfo(1.f / 256.f, -128)));
 
     // Checks performed when destination is configured
-    if((dst != nullptr) && (dst->total_size() != 0))
+    if ((dst != nullptr) && (dst->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
@@ -95,15 +106,18 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClActivationKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info)
+void ClActivationKernel::configure(const ClCompileContext &compile_context,
+                                   ITensorInfo            *src,
+                                   ITensorInfo            *dst,
+                                   ActivationLayerInfo     act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     _run_in_place = (dst == nullptr) || (dst == src);
 
-    if(dst != nullptr)
+    if (dst != nullptr)
     {
         // Destination auto initialization if not yet initialized
         auto_init_if_empty(*dst, *src->clone());
@@ -119,11 +133,10 @@
 
     const ActivationLayerInfo::ActivationFunction f_act        = act_info.activation();
     const bool                                    is_quantized = is_data_type_quantized(dt);
-    const bool                                    perform_activation_in_float =
-        (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC)
-        || (f_act == ActivationLayerInfo::ActivationFunction::TANH)
-        || (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH)
-        || (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU);
+    const bool perform_activation_in_float = (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) ||
+                                             (f_act == ActivationLayerInfo::ActivationFunction::TANH) ||
+                                             (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) ||
+                                             (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU);
 
     // Set build options
     CLBuildOptions build_opts;
@@ -132,22 +145,23 @@
     build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(f_act)));
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
 
     std::string kernel_name = std::string("activation_layer");
 
     // Set quantization info build options
-    if(is_quantized)
+    if (is_quantized)
     {
         const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
 
-        if(!perform_activation_in_float)
+        if (!perform_activation_in_float)
         {
             int a_const_int = 0;
             int b_const_int = 0;
 
             // Create quantized version of constants a, b if needed
-            switch(dt)
+            switch (dt)
             {
                 case DataType::QASYMM8:
                 {
@@ -180,22 +194,25 @@
         }
 
         // Quantized value of 0 corresponds to the offset o1
-        build_opts.add_option(("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0")));
+        build_opts.add_option(
+            ("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0")));
         build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale)));
-        build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset));
+        build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
+                                 "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset));
 
         // Set correct kernel name
         kernel_name += perform_activation_in_float ? std::string("_quant_f32") : std::string("_quant");
 
         // Set scale and offset of the source and destination if they have different quantization info
-        if(dst != nullptr)
+        if (dst != nullptr)
         {
             const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
 
-            if(iq_info != oq_info)
+            if (iq_info != oq_info)
             {
                 build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(oq_info.scale)));
-                build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset));
+                build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
+                                         "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset));
             }
         }
     }
@@ -235,8 +252,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
     ARM_COMPUTE_ERROR_ON(_run_in_place && src != dst);
 
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
@@ -246,13 +264,12 @@
     {
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, src, slice);
-        if(!_run_in_place)
+        if (!_run_in_place)
         {
             add_3D_tensor_argument(idx, dst, slice);
         }
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
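
Two rewrapping rules are visible in this file: multi-line conditions now break
after && and || rather than before them, and do-while loops keep 'while' on the
closing-brace line (see run_op() above). In .clang-format terms this plausibly
corresponds to the following options; again an inference, not the delivered
configuration:

    BreakBeforeBinaryOperators: None  # '&&' / '||' stay at the end of the line
    AlignAfterOpenBracket: Align      # continuations align to the open parenthesis
    # With BreakBeforeBraces: Allman, BraceWrapping.BeforeWhile stays false,
    # which joins '} while (...);' onto one line as seen above.
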
diff --git a/src/gpu/cl/kernels/ClActivationKernel.h b/src/gpu/cl/kernels/ClActivationKernel.h
index 82e35b6..ab7607b 100644
--- a/src/gpu/cl/kernels/ClActivationKernel.h
+++ b/src/gpu/cl/kernels/ClActivationKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_ACTIVATION_KERNEL_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -51,7 +52,10 @@
      * @param[out]     dst             Destination tensor info. Data type supported: same as @p src
      * @param[in]      act_info        Activation layer information.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info);
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   ActivationLayerInfo     act_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClActivationKernel::configure()
@@ -64,7 +68,7 @@
     void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
 
 private:
-    bool _run_in_place{ false };
+    bool _run_in_place{false};
 };
 } // namespace kernels
 } // namespace opencl
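
The configure()/validate() declarations in these kernel headers are reflowed to
one parameter per line with names column-aligned; when even that does not fit,
the return type moves to its own line (see ClBatchConcatenateKernel.h below).
Constructor initializer lists, conversely, are joined onto the signature line
when they fit, as in ClQueue.cpp and ClTensor.cpp above. Options that plausibly
drive this behavior, named here as assumptions:

    BinPackParameters: false           # one declaration parameter per line when wrapping
    BinPackArguments: true             # call arguments may still share lines
    PenaltyReturnTypeOnItsOwnLine: 60  # illustrative: allow 'void' on its own line over overflow
    PackConstructorInitializers: CurrentLine  # ': IQueue(ctx), _tuner(nullptr)' joins the signature
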
diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
index 3d8ecf1..a853f6b 100644
--- a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
+++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp
@@ -30,10 +30,10 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -66,12 +66,15 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst)
+void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context,
+                                         ITensorInfo            *src,
+                                         unsigned int            batch_offset,
+                                         ITensorInfo            *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst));
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     _batch_offset = batch_offset;
 
@@ -81,8 +84,9 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
-    if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
     {
         const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
         const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
@@ -136,8 +140,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window slice = window.first_slice_window_3D();
 
@@ -152,9 +157,8 @@
         add_3D_tensor_argument(idx, src, slice);
         add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
-} // namespace opencl
 } // namespace kernels
+} // namespace opencl
 } // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h
index f6b7c0e..549576b 100644
--- a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h
+++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h
@@ -53,7 +53,8 @@
      * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2.
      *
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst);
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClBatchConcatenateKernel::configure()
@@ -66,7 +67,7 @@
     void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
 
 private:
-    unsigned int _batch_offset{ 0 };
+    unsigned int _batch_offset{0};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp
index f621ad6..9ca3563 100644
--- a/src/gpu/cl/kernels/ClCastKernel.cpp
+++ b/src/gpu/cl/kernels/ClCastKernel.cpp
@@ -32,10 +32,10 @@
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -52,20 +52,17 @@
     ARM_COMPUTE_UNUSED(policy);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
     ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src,
-                                                         1,
-                                                         DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
-                                                         DataType::U16, DataType::U32, DataType::S32, DataType::F16,
-                                                         DataType::F32, DataType::S64, DataType::U64);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst,
-                                                         1,
-                                                         DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16,
-                                                         DataType::U16, DataType::U32, DataType::S32, DataType::F16,
-                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL,
+                                                         DataType::S16, DataType::U16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32, DataType::S64, DataType::U64);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::S8, DataType::QASYMM8,
+                                                         DataType::S16, DataType::U16, DataType::U32, DataType::S32,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different");
 
     // Validate in case of configured dst
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
     }
@@ -79,7 +76,10 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClCastKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
+void ClCastKernel::configure(const CLCompileContext &compile_context,
+                             const ITensorInfo      *src,
+                             ITensorInfo            *dst,
+                             ConvertPolicy           policy)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
@@ -88,7 +88,7 @@
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     // Get data sizes
     const size_t src_size = data_size_from_type(src->data_type());
@@ -100,12 +100,14 @@
     // Set build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
     build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
     // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
     build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE");
-    build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), "-DIS_DATA_TYPE_FLOAT");
+    build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()),
+                             "-DIS_DATA_TYPE_FLOAT");
     build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED");
 
     // Create kernel
@@ -148,8 +150,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
@@ -162,8 +165,7 @@
         add_3D_tensor_argument(idx, src, slice);
         add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClCastKernel.h b/src/gpu/cl/kernels/ClCastKernel.h
index a021b3c..07b0b61 100644
--- a/src/gpu/cl/kernels/ClCastKernel.h
+++ b/src/gpu/cl/kernels/ClCastKernel.h
@@ -64,7 +64,8 @@
      * @param[out] dst             The destination tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32.
      * @param[in]  policy          Conversion policy
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
+    void
+    configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClCastKernel::configure()
diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.cpp b/src/gpu/cl/kernels/ClCol2ImKernel.cpp
index 3316742..9972e07 100644
--- a/src/gpu/cl/kernels/ClCol2ImKernel.cpp
+++ b/src/gpu/cl/kernels/ClCol2ImKernel.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -47,29 +48,38 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+Status validate_arguments(const ITensorInfo *src,
+                          const ITensorInfo *dst,
+                          const Size2D      &convolved_dims,
+                          unsigned int       num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
 
     // Checks performed when output is configured
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW,
+                                        "Col2Im output's data layout must always be NCHW");
     }
 
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW));
+    auto_init_if_empty(*dst, src->clone()
+                                 ->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups))
+                                 .set_data_layout(DataLayout::NCHW));
 
     constexpr unsigned int num_elems_read_per_iteration = 8;
 
@@ -80,18 +90,22 @@
     AccessWindowHorizontal input_access(src, 0, num_elems_read_per_iteration);
     bool                   window_changed = update_window_and_padding(win, input_access);
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
 
-ClCol2ImKernel::ClCol2ImKernel()
-    : _convolved_dims()
+ClCol2ImKernel::ClCol2ImKernel() : _convolved_dims()
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+void ClCol2ImKernel::configure(const CLCompileContext &compile_context,
+                               ITensorInfo            *src,
+                               ITensorInfo            *dst,
+                               const Size2D           &convolved_dims,
+                               unsigned int            num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
@@ -132,11 +146,15 @@
     _config_id += support::cpp11::to_string(dst->dimension(1));
 }
 
-Status ClCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+Status ClCol2ImKernel::validate(const ITensorInfo *src,
+                                const ITensorInfo *dst,
+                                const Size2D      &convolved_dims,
+                                unsigned int       num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, convolved_dims, num_groups));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first);
     return Status{};
 }
 
@@ -168,8 +186,7 @@
         add_3D_tensor_argument(idx, src, slice);
         add_4D_tensor_argument(idx, dst, slice_out);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
+    } while (collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.h b/src/gpu/cl/kernels/ClCol2ImKernel.h
index e19b7c8..34194ab 100644
--- a/src/gpu/cl/kernels/ClCol2ImKernel.h
+++ b/src/gpu/cl/kernels/ClCol2ImKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_COL2IM_KERNEL_H
 
 #include "arm_compute/core/Size2D.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -68,14 +69,19 @@
      * @param[in]  convolved_dims  Output convolved dimensions.
      * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1);
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   const Size2D           &convolved_dims,
+                   unsigned int            num_groups = 1);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClCol2ImKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1);
+    static Status
+    validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
index 716dec1..85d3c39 100644
--- a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
+++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -45,17 +46,21 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape,
-                                                     DataLayout data_layout)
+void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context,
+                                                     const ITensorInfo      *src,
+                                                     ITensorInfo            *dst,
+                                                     const TensorShape      &original_src_shape,
+                                                     DataLayout              data_layout)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
     // Output tensor auto initialisation if not yet initialized
     auto_init_if_empty(*dst, *src->clone());
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
-    ARM_COMPUTE_ERROR_THROW_ON(ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout));
 
     const DataLayout src_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
 
@@ -85,8 +90,10 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape,
-                                                      DataLayout data_layout)
+Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src,
+                                                      const ITensorInfo *dst,
+                                                      const TensorShape &original_src_shape,
+                                                      DataLayout         data_layout)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
@@ -96,7 +103,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
 
     // Checks performed when dst is configured
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -110,8 +117,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
     unsigned int idx = 0;
diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
index 16000e8..0ddb545 100644
--- a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
+++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
@@ -55,14 +55,21 @@
      * @param[in]  original_src_shape Shape of the original src tensor (the one entering fully connected layer).
      * @param[in]  data_layout        The data layout the weights have been trained in.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   ITensorInfo            *dst,
+                   const TensorShape      &original_src_shape,
+                   DataLayout              data_layout);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClConvertFullyConnectedWeightsKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *dst,
+                           const TensorShape &original_src_shape,
+                           DataLayout         data_layout);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClCopyKernel.cpp b/src/gpu/cl/kernels/ClCopyKernel.cpp
index 4719448..c80ef66 100644
--- a/src/gpu/cl/kernels/ClCopyKernel.cpp
+++ b/src/gpu/cl/kernels/ClCopyKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -50,11 +51,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
 
     // Validate dst if initialized
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
-        if(dst_window == nullptr)
+        if (dst_window == nullptr)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape());
         }
@@ -74,12 +75,15 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window)
+void ClCopyKernel::configure(const CLCompileContext &compile_context,
+                             const ITensorInfo      *src,
+                             ITensorInfo            *dst,
+                             Window                 *dst_window)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, dst_window));
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     // Create kernel
     CLBuildOptions build_opts;
@@ -93,7 +97,7 @@
 
     const Window win_config = calculate_max_window(*src, Steps(vec_size_x));
 
-    if(dst_window != nullptr)
+    if (dst_window != nullptr)
     {
         _has_dst_window                = true;
         _dst_window                    = Window(*dst_window);
@@ -101,9 +105,11 @@
         const int  vec_size_x_leftover = width_x % vec_size_x;
         const bool multi_access_x      = width_x >= static_cast<int32_t>(vec_size_x);
 
-        if(multi_access_x)
+        if (multi_access_x)
         {
-            _dst_window.set(Window::DimX, Window::Dimension(dst_window->x().start(), ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x));
+            _dst_window.set(Window::DimX,
+                            Window::Dimension(dst_window->x().start(),
+                                              ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x));
         }
 
         build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
@@ -127,7 +133,8 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window)
+Status
+ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, dst_window));
 
@@ -139,12 +146,13 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window slice;
 
-    if(_has_dst_window)
+    if (_has_dst_window)
     {
         slice            = window.first_slice_window_3D();
         Window out_slice = _dst_window.first_slice_window_3D();
@@ -154,8 +162,7 @@
             add_3D_tensor_argument(idx, src, slice);
             add_3D_tensor_argument(idx, dst, out_slice);
             enqueue(queue, *this, slice, lws_hint());
-        }
-        while(window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice));
+        } while (window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice));
     }
     else
     {
@@ -167,8 +174,7 @@
             add_3D_tensor_argument(idx, src, slice);
             add_3D_tensor_argument(idx, dst, slice);
             enqueue(queue, *this, slice, lws_hint());
-        }
-        while(collapsed.slide_window_slice_3D(slice));
+        } while (collapsed.slide_window_slice_3D(slice));
     }
 }
 } // namespace kernels
diff --git a/src/gpu/cl/kernels/ClCopyKernel.h b/src/gpu/cl/kernels/ClCopyKernel.h
index 63fd806..f915bf6 100644
--- a/src/gpu/cl/kernels/ClCopyKernel.h
+++ b/src/gpu/cl/kernels/ClCopyKernel.h
@@ -47,7 +47,10 @@
      * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
      * @param[in]  dst_window      (Optional) Window to be used when copying into only part of a tensor. Default is nullptr.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   ITensorInfo            *dst,
+                   Window                 *dst_window = nullptr);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClCopyKernel::configure()
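
Note (illustrative only, not part of this patch): a minimal usage sketch of the ClCopyKernel interface shown above; the shapes are hypothetical and an initialised CL backend is assumed.

    // Hedged sketch: validate, then configure a whole-tensor copy.
    // Assumes the CL backend is initialised (e.g. CLScheduler::get().default_init()).
    #include "arm_compute/core/CL/CLKernelLibrary.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "src/gpu/cl/kernels/ClCopyKernel.h"

    using namespace arm_compute;

    void copy_example()
    {
        TensorInfo src_info(TensorShape(32U, 16U), 1, DataType::F32); // hypothetical shape
        TensorInfo dst_info(TensorShape(32U, 16U), 1, DataType::F32);

        // Static check first; dst_window stays nullptr for a full-tensor copy.
        if (opencl::kernels::ClCopyKernel::validate(&src_info, &dst_info, nullptr).error_code() == ErrorCode::OK)
        {
            opencl::kernels::ClCopyKernel copy;
            copy.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);
        }
    }
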
diff --git a/src/gpu/cl/kernels/ClCropKernel.cpp b/src/gpu/cl/kernels/ClCropKernel.cpp
index 87ad6b4..0c503e1 100644
--- a/src/gpu/cl/kernels/ClCropKernel.cpp
+++ b/src/gpu/cl/kernels/ClCropKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
@@ -46,8 +47,14 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClCropKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index,
-                             float extrapolation_value, Window *dst_window)
+void ClCropKernel::configure(const CLCompileContext &compile_context,
+                             const ITensorInfo      *src,
+                             ITensorInfo            *dst,
+                             Coordinates2D           start,
+                             Coordinates2D           end,
+                             uint32_t                batch_index,
+                             float                   extrapolation_value,
+                             Window                 *dst_window)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, start, end, batch_index, extrapolation_value, dst_window));
@@ -60,7 +67,7 @@
     // Create and update the window (if needed)
     Window win = calculate_max_window(*dst);
 
-    if(dst_window != nullptr)
+    if (dst_window != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *dst_window);
         win = *dst_window;
@@ -70,7 +77,7 @@
     const bool     multi_access_x = dst_width_x >= vec_size_x;
     const bool     remainder_x    = dst_width_x % vec_size_x > 0;
 
-    if(multi_access_x)
+    if (multi_access_x)
     {
         win.set(Window::DimX,
                 Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
@@ -81,13 +88,21 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-    build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
+    build_opts.add_option_if(multi_access_x && remainder_x,
+                             "-DLAST_ACCESSED_X=" +
+                                 support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
     build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED=");
     build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED=");
     _kernel = create_kernel(compile_context, "crop_tensor", build_opts.options());
 }
 
-Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
+Status ClCropKernel::validate(const ITensorInfo *src,
+                              const ITensorInfo *dst,
+                              Coordinates2D      start,
+                              Coordinates2D      end,
+                              uint32_t           batch_index,
+                              float              extrapolation_value,
+                              Window            *dst_window)
 {
     ARM_COMPUTE_UNUSED(extrapolation_value, dst_window);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
@@ -95,14 +110,15 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
     ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().num_dimensions() > 4);
     ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(start.x >= static_cast<int32_t>(src->dimension(1)) || start.y >= static_cast<int32_t>(src->dimension(2))
-                                || end.x >= static_cast<int32_t>(src->dimension(1)) || end.y >= static_cast<int32_t>(src->dimension(2)));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        start.x >= static_cast<int32_t>(src->dimension(1)) || start.y >= static_cast<int32_t>(src->dimension(2)) ||
+        end.x >= static_cast<int32_t>(src->dimension(1)) || end.y >= static_cast<int32_t>(src->dimension(2)));
     ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= src->dimension(3));
-    if(dst_window != nullptr)
+    if (dst_window != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(dst_window->x().step() != 1);
     }
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
@@ -116,12 +132,15 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window in_slice = Window();
     in_slice.use_tensor_dimensions(src->info()->tensor_shape());
-    in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), window.x().step()));
+    in_slice.set(Window::DimX,
+                 Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()),
+                                   window.x().step()));
     in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1));
 
     unsigned int idx = 0;
diff --git a/src/gpu/cl/kernels/ClCropKernel.h b/src/gpu/cl/kernels/ClCropKernel.h
index 2f166e1..5062626 100644
--- a/src/gpu/cl/kernels/ClCropKernel.h
+++ b/src/gpu/cl/kernels/ClCropKernel.h
@@ -53,16 +53,27 @@
      * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
      * @param[in]  dst_window          Output window to be used when the cropped image is copied into a tensor. Default is nullptr.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
-                   Window *dst_window = nullptr);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   ITensorInfo            *dst,
+                   Coordinates2D           start,
+                   Coordinates2D           end,
+                   uint32_t                batch_index,
+                   float                   extrapolation_value = 0,
+                   Window                 *dst_window          = nullptr);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClCropKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
-                           Window *dst_window = nullptr);
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *dst,
+                           Coordinates2D      start,
+                           Coordinates2D      end,
+                           uint32_t           batch_index,
+                           float              extrapolation_value = 0,
+                           Window            *dst_window          = nullptr);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
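
Note (illustrative only, not part of this patch): a hedged sketch of calling the reflowed ClCropKernel::validate(); the shapes and crop coordinates are made up, while F32 and NHWC mirror the checks in the .cpp above.

    #include "arm_compute/core/TensorInfo.h"
    #include "src/gpu/cl/kernels/ClCropKernel.h"

    using namespace arm_compute;

    Status crop_example()
    {
        TensorInfo src_info(TensorShape(3U, 64U, 64U, 2U), 1, DataType::F32); // [C, W, H, N]
        src_info.set_data_layout(DataLayout::NHWC);
        TensorInfo dst_info; // left uninitialised so the dst-side checks are skipped

        const Coordinates2D start{4, 4};
        const Coordinates2D end{19, 19};

        // extrapolation_value and dst_window take their defaults (0 and nullptr).
        return opencl::kernels::ClCropKernel::validate(&src_info, &dst_info, start, end, 0U);
    }
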
diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
index a05cd13..ec44d88 100644
--- a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
+++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
@@ -30,10 +30,10 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -48,7 +48,8 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
 
     ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX));
@@ -60,18 +61,20 @@
 }
 } // namespace
 
-ClDepthConcatenateKernel::ClDepthConcatenateKernel()
-    : _depth_offset(0)
+ClDepthConcatenateKernel::ClDepthConcatenateKernel() : _depth_offset(0)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst)
+void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context,
+                                         ITensorInfo            *src,
+                                         unsigned int            depth_offset,
+                                         ITensorInfo            *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst));
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     _depth_offset = depth_offset;
 
@@ -81,8 +84,9 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
-    if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
     {
         const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
         const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
@@ -122,8 +126,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window slice = window.first_slice_window_3D();
 
@@ -138,8 +143,7 @@
         add_3D_tensor_argument(idx, src, slice);
         add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h
index 4739677..539f010 100644
--- a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h
+++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h
@@ -53,7 +53,8 @@
      * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2.
      *
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst);
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClDepthConcatenateKernel::configure()
diff --git a/src/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/gpu/cl/kernels/ClDequantizeKernel.cpp
index 756cd56..53429ab 100644
--- a/src/gpu/cl/kernels/ClDequantizeKernel.cpp
+++ b/src/gpu/cl/kernels/ClDequantizeKernel.cpp
@@ -34,7 +34,6 @@
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -49,9 +48,11 @@
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8,
+                                                         DataType::QSYMM16);
 
-    if(dst->tensor_shape().total_size() > 0)
+    if (dst->tensor_shape().total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
@@ -74,7 +75,7 @@
     // Output tensor auto initialization if not yet initialized
     auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
 
@@ -87,7 +88,7 @@
 
     // Create kernel
     CLBuildOptions build_opts;
-    if(!is_quantized_per_channel)
+    if (!is_quantized_per_channel)
     {
         const UniformQuantizationInfo qinfo   = src->quantization_info().uniform();
         const int                     qoffset = is_data_type_quantized_asymmetric(src->data_type()) ? qinfo.offset : 0;
@@ -103,16 +104,18 @@
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
     build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type()));
     build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type()));
-    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(
+                                                                        std::max<int>(output_width_x - vec_size_x, 0)));
 
     // Create kernel name
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Configure kernel window
     Window win = calculate_max_window(*dst);
-    if(multi_access_x)
+    if (multi_access_x)
     {
-        win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
     }
     ICLKernel::configure_internal(win);
 
@@ -136,10 +139,11 @@
     const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type());
 
     // Collapse window
-    Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3);
+    Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4)
+                                                 : window.collapse_if_possible(ICLKernel::window(), 3);
     Window slice      = new_window.first_slice_window_3D();
 
-    if(is_quantized_per_channel)
+    if (is_quantized_per_channel)
     {
         unsigned int idx = num_arguments_per_3D_tensor() * 2; // Skip the input and output parameters
         _kernel.setArg(idx++, src->quantization().scale->cl_buffer());
@@ -151,8 +155,7 @@
         add_3D_tensor_argument(idx, src, slice);
         add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(new_window.slide_window_slice_3D(slice));
+    } while (new_window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
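
Note (illustrative only, not part of this patch): a hedged sketch of the dequantize validation path. The header is not in this hunk, so the validate(src, dst) signature is inferred from validate_arguments() above; the scale/offset values are made up.

    #include "arm_compute/core/TensorInfo.h"
    #include "src/gpu/cl/kernels/ClDequantizeKernel.h"

    using namespace arm_compute;

    Status dequantize_example()
    {
        TensorInfo src_info(TensorShape(16U, 4U), 1, DataType::QASYMM8, QuantizationInfo(0.5f, 10));
        TensorInfo dst_info(TensorShape(16U, 4U), 1, DataType::F32); // dst must be F16/F32
        return opencl::kernels::ClDequantizeKernel::validate(&src_info, &dst_info);
    }
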
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
index 7ad3984..7cf1958 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp
@@ -23,17 +23,18 @@
  */
 #include "src/gpu/cl/kernels/ClDirectConv2dKernel.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLUtils.h"
 #include "src/core/CL/CLValidate.h"
@@ -51,11 +52,17 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                          const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
+Status validate_arguments(const ITensorInfo                 *src,
+                          const ITensorInfo                 *weights,
+                          const ITensorInfo                 *biases,
+                          const ITensorInfo                 *dst,
+                          const PadStrideInfo               &conv_info,
+                          const ActivationLayerInfo         &act_info,
+                          const DirectConvComputeKernelInfo &desc)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
 
     const DataLayout data_layout = src->data_layout();
@@ -63,41 +70,56 @@
     const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
     const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx),
+                                    "Weights feature map dimension should match the respective src's one");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true, "Export to CLImage is not supported for the input tensor");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true, "Export to CLImage is not supported for the output tensor");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true,
+                                    "Export to CLImage is not supported for the input tensor");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true,
+                                    "Export to CLImage is not supported for the output tensor");
 
-    if(data_layout == DataLayout::NCHW)
+    if (data_layout == DataLayout::NCHW)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution.");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) && std::get<0>(conv_info.stride()) > 2,
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx),
+                                        "Weights should have same width and height");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3,
+                                        "Strides larger than 3 not supported for 1x1 convolution.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 ||
+                                         weights->dimension(width_idx) == 9) &&
+                                            std::get<0>(conv_info.stride()) > 2,
                                         "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution.");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled(), "Fused activation is not supported for NCHW layout");
 
-        if(is_data_type_quantized(src->data_type()))
+        if (is_data_type_quantized(src->data_type()))
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
-                                            "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 &&
+                    weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
+                "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types");
         }
         else
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5,
-                                            "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 &&
+                    weights->dimension(width_idx) != 5,
+                "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types");
         }
     }
 
-    if(data_layout == DataLayout::NHWC)
+    if (data_layout == DataLayout::NHWC)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()), "Fused activation in NHWC is only supported for floating point.");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16,
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()),
+                                        "Fused activation in NHWC is only supported for floating point.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8,
+                                        "M0 can only be greater than 0 and less than or equal to 8");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 &&
+                                            desc.n0 != 16,
                                         "N0 can only be: 1, 2, 3, 4, 8, and 16");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 &&
+                                            desc.k0 != 16,
                                         "K0 can only be: 1, 2, 3, 4, 8, and 16");
-        if(desc.export_weights_to_cl_image)
+        if (desc.export_weights_to_cl_image)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
                                             "K0 can only be: 4, 8, and 16");
@@ -106,9 +128,9 @@
         }
     }
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
-        if(is_data_type_quantized_asymmetric(src->data_type()))
+        if (is_data_type_quantized_asymmetric(src->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
@@ -118,20 +140,19 @@
         }
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
                                         "Biases size and number of dst feature maps should match");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
-                                        "Biases should be one dimensional");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional");
     }
 
     // Checks performed when dst is configured
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
-                                                           misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
     }
 
     const auto data_type = src->data_type();
-    if(is_data_type_quantized(data_type))
+    if (is_data_type_quantized(data_type))
     {
         const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
         const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
@@ -140,7 +161,8 @@
         float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
         int   output_multiplier = 0;
         int   output_shift      = 0;
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
     }
     return Status{};
 }
@@ -151,8 +173,14 @@
     _type = CLKernelType::DIRECT;
 }
 
-void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
-                                     const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
+void ClDirectConv2dKernel::configure(const CLCompileContext            &compile_context,
+                                     ITensorInfo                       *src,
+                                     ITensorInfo                       *weights,
+                                     ITensorInfo                       *biases,
+                                     ITensorInfo                       *dst,
+                                     const PadStrideInfo               &conv_info,
+                                     const ActivationLayerInfo         &act_info,
+                                     const DirectConvComputeKernelInfo &desc)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
 
@@ -178,14 +206,11 @@
     TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*dst, output_shape,
-                       1,
-                       src->data_type(),
-                       src->quantization_info());
+    auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
 
     // Configure kernel window
     Window win;
-    if(_data_layout == DataLayout::NHWC)
+    if (_data_layout == DataLayout::NHWC)
     {
         output_shape.collapse(2U, 1U);
         const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]);
@@ -194,7 +219,7 @@
         // Create window and update padding
         win = calculate_max_window(output_shape, Steps(n0, m0));
     }
-    else if(_data_layout == DataLayout::NCHW)
+    else if (_data_layout == DataLayout::NCHW)
     {
         _num_elems_processed_per_iteration = 1u;
         win                                = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration));
@@ -205,7 +230,7 @@
     std::stringstream kernel_name;
     CLBuildOptions    build_options;
 
-    if(_data_layout == DataLayout::NHWC)
+    if (_data_layout == DataLayout::NHWC)
     {
         kernel_name << "direct_convolution_nhwc";
 
@@ -221,22 +246,22 @@
         _export_output_to_cl_image  = desc.export_output_to_cl_image;
 
         // Update the padding for the weights tensor if we can export to cl_image
-        if(_export_weights_to_cl_image)
+        if (_export_weights_to_cl_image)
         {
             gemm::update_padding_for_cl_image(weights);
         }
 
-        if(_export_output_to_cl_image)
+        if (_export_output_to_cl_image)
         {
             gemm::update_padding_for_cl_image(dst);
         }
 
-        if(_export_input_to_cl_image)
+        if (_export_input_to_cl_image)
         {
             gemm::update_padding_for_cl_image(src);
         }
 
-        if(biases != nullptr)
+        if (biases != nullptr)
         {
             build_options.add_option(std::string("-DHAS_BIAS"));
             build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
@@ -246,9 +271,10 @@
         const auto act_function  = act_info.activation();
         const auto dst_data_type = dst->data_type();
 
-        if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
-           && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-           && (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
+        if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+            (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+             act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+            (dst_data_type == DataType::F32 || dst_data_type == DataType::F16))
         {
             // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
             // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
@@ -259,7 +285,8 @@
             build_options.add_option("-cl-fast-relaxed-math");
         }
 
-        build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER");
+        build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE",
+                                         "-DSRC_TENSOR_TYPE=BUFFER");
         build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
         build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(0)));
         build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(1)));
@@ -267,9 +294,11 @@
         build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(0)));
         build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(1)));
         build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(2)));
-        build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE", "-DDST_TENSOR_TYPE=BUFFER");
+        build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE",
+                                         "-DDST_TENSOR_TYPE=BUFFER");
         build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type));
-        build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER");
+        build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE",
+                                         "-DWEI_TENSOR_TYPE=BUFFER");
         build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx)));
         build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx)));
         build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type()));
@@ -284,7 +313,7 @@
         build_options.add_option_if((src->dimension(channel_idx) % k0) != 0, "-DLEFTOVER_LOOP");
         build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function)));
 
-        if(is_data_type_quantized(data_type))
+        if (is_data_type_quantized(data_type))
         {
             const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
             const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
@@ -314,11 +343,13 @@
             build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0));
             build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0));
             build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0));
-            build_options.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
-            build_options.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
+            build_options.add_option_if(act_info.enabled(),
+                                        "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
+            build_options.add_option_if(act_info.enabled(),
+                                        "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
         }
 
-        if(compile_context.get_ddk_version() >= 30)
+        if (compile_context.get_ddk_version() >= 30)
         {
             build_options.add_option("-fregister-allocation=64");
         }
@@ -340,13 +371,17 @@
         build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx)));
         build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
         build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
-        build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx))));
+        build_options.add_option(
+            std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx))));
         build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)));
         build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));
-        build_options.add_option(std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)));
-        build_options.add_option(std::string("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)));
+        build_options.add_option(
+            std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)));
+        build_options.add_option(
+            std::string("-DVEC_SIZE_LEFTOVER=" +
+                        support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)));
 
-        if(is_data_type_quantized(data_type))
+        if (is_data_type_quantized(data_type))
         {
             const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
             const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
@@ -405,8 +440,13 @@
     _config_id += lower_string(string_from_data_layout(_data_layout));
 }
 
-Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                      const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
+Status ClDirectConv2dKernel::validate(const ITensorInfo                 *src,
+                                      const ITensorInfo                 *weights,
+                                      const ITensorInfo                 *biases,
+                                      const ITensorInfo                 *dst,
+                                      const PadStrideInfo               &conv_info,
+                                      const ActivationLayerInfo         &act_info,
+                                      const DirectConvComputeKernelInfo &desc)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, desc));
     return Status{};
@@ -420,52 +460,55 @@
     // Get initial windows
     Window slice = window.first_slice_window_3D();
 
-    const auto src     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const auto biases  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
-    auto       dst     = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto weights =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto biases =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
-    if(_data_layout == DataLayout::NHWC)
+    if (_data_layout == DataLayout::NHWC)
     {
         cl::Image2D weights_cl_image;
         cl::Image2D output_cl_image;
         cl::Image2D input_cl_image;
 
-        if(_export_weights_to_cl_image)
+        if (_export_weights_to_cl_image)
         {
             // Export tensor to cl_image
             weights_cl_image = create_image2d_from_tensor(weights, CLImage2DType::ReadOnly);
         }
 
-        if(_export_output_to_cl_image)
+        if (_export_output_to_cl_image)
         {
             // Export tensor to cl_image
             output_cl_image = create_image2d_from_tensor(dst, CLImage2DType::WriteOnly);
         }
 
-        if(_export_input_to_cl_image)
+        if (_export_input_to_cl_image)
         {
             // Export tensor to cl_image
             input_cl_image = create_image2d_from_tensor(src, CLImage2DType::ReadOnly);
         }
 
         unsigned int idx = 0;
-        if(_export_input_to_cl_image)
+        if (_export_input_to_cl_image)
         {
             _kernel.setArg(idx++, input_cl_image);
         }
         add_4d_tensor_nhwc_argument(idx, src);
-        if(_export_output_to_cl_image)
+        if (_export_output_to_cl_image)
         {
             _kernel.setArg(idx++, output_cl_image);
         }
         add_4d_tensor_nhwc_argument(idx, dst);
-        if(_export_weights_to_cl_image)
+        if (_export_weights_to_cl_image)
         {
             _kernel.setArg(idx++, weights_cl_image);
         }
         add_4d_tensor_nhwc_argument(idx, weights);
-        if(biases != nullptr)
+        if (biases != nullptr)
         {
             add_1D_tensor_argument(idx, biases, slice);
         }
@@ -476,7 +519,7 @@
         unsigned int idx1 = 2 * num_arguments_per_3D_tensor();
         add_3D_tensor_argument(idx1, weights, slice);
 
-        if(biases != nullptr)
+        if (biases != nullptr)
         {
             Window slice_biases;
             slice_biases.use_tensor_dimensions(biases->info()->tensor_shape());
@@ -491,8 +534,7 @@
             add_3D_tensor_argument(idx, src, slice);
             add_3D_tensor_argument(idx, dst, slice);
             enqueue(queue, *this, slice, lws_hint());
-        }
-        while(window.slide_window_slice_3D(slice));
+        } while (window.slide_window_slice_3D(slice));
     }
 }
 } // namespace kernels
diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/gpu/cl/kernels/ClDirectConv2dKernel.h
index 7132762..c934c82 100644
--- a/src/gpu/cl/kernels/ClDirectConv2dKernel.h
+++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -68,16 +69,27 @@
      * @param[in]  act_info        Contains activation information described in @ref ActivationLayerInfo.
      * @param[in]  desc            Direct convolution descriptor used to build the NHWC direct convolution kernel. For NCHW, this parameter is ignored.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
-                   const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc);
+    void configure(const CLCompileContext            &compile_context,
+                   ITensorInfo                       *src,
+                   ITensorInfo                       *weights,
+                   ITensorInfo                       *biases,
+                   ITensorInfo                       *dst,
+                   const PadStrideInfo               &conv_info,
+                   const ActivationLayerInfo         &act_info,
+                   const DirectConvComputeKernelInfo &desc);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClDirectConv2dKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                           const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc);
+    static Status validate(const ITensorInfo                 *src,
+                           const ITensorInfo                 *weights,
+                           const ITensorInfo                 *biases,
+                           const ITensorInfo                 *dst,
+                           const PadStrideInfo               &conv_info,
+                           const ActivationLayerInfo         &act_info,
+                           const DirectConvComputeKernelInfo &desc);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -85,9 +97,9 @@
 public:
     DataLayout    _data_layout{};
     PadStrideInfo _conv_info{};
-    bool          _export_weights_to_cl_image{ false };
-    bool          _export_output_to_cl_image{ false };
-    bool          _export_input_to_cl_image{ false };
+    bool          _export_weights_to_cl_image{false};
+    bool          _export_output_to_cl_image{false};
+    bool          _export_input_to_cl_image{false};
 };
 } // namespace kernels
 } // namespace opencl
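
Note (illustrative only, not part of this patch): a hedged sketch of the widened ClDirectConv2dKernel::validate() signature for an NHWC F32 3x3 case. Shapes are hypothetical, and a default-constructed DirectConvComputeKernelInfo is assumed to yield m0 = n0 = k0 = 1, which satisfies the checks above.

    #include "arm_compute/core/TensorInfo.h"
    #include "src/gpu/cl/kernels/ClDirectConv2dKernel.h"

    using namespace arm_compute;

    Status direct_conv2d_example()
    {
        TensorInfo src(TensorShape(8U, 32U, 32U, 1U), 1, DataType::F32); // [C, W, H, N]
        src.set_data_layout(DataLayout::NHWC);
        TensorInfo weights(TensorShape(8U, 3U, 3U, 16U), 1, DataType::F32); // [IFM, kW, kH, OFM]
        weights.set_data_layout(DataLayout::NHWC);
        TensorInfo biases(TensorShape(16U), 1, DataType::F32);
        TensorInfo dst; // uninitialised: the dst-side checks are skipped

        const PadStrideInfo               conv_info(1, 1, 1, 1); // stride 1, pad 1 ('same' for 3x3)
        const DirectConvComputeKernelInfo desc{};
        return opencl::kernels::ClDirectConv2dKernel::validate(&src, &weights, &biases, &dst, conv_info,
                                                               ActivationLayerInfo(), desc);
    }
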
diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp
index 6191178..8002520 100644
--- a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp
+++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
@@ -40,7 +41,11 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info)
+Status validate_arguments(const ITensorInfo *src0,
+                          const ITensorInfo *src1,
+                          const ITensorInfo *src2,
+                          const ITensorInfo *dst,
+                          const Conv3dInfo  &conv3d_info)
 {
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
@@ -49,20 +54,25 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv3d_info.act_info.enabled(), "Fused activation not supported");
 
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
     ARM_COMPUTE_RETURN_ERROR_ON(conv3d_info.dilation != Size3D(1U, 1U, 1U));
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0), "Weights feature map dimension should match the respective src's one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0),
+                                    "Weights feature map dimension should match the respective src's one");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 5, "Weights can be at most 5 dimensional");
 
-    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) > (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right));
-    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) > (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom));
-    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) > (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back));
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) >
+                                (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right));
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) >
+                                (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom));
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) >
+                                (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back));
 
-    if(src2 != nullptr)
+    if (src2 != nullptr)
     {
-        if(is_data_type_quantized(src0->data_type()))
+        if (is_data_type_quantized(src0->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32);
         }
@@ -70,15 +80,18 @@
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
         }
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), "Biases size and number of dst feature maps should match");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0),
+                                        "Biases size and number of dst feature maps should match");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional");
     }
 
     // Checks performed when dst is configured
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src1->dimension(0), "Weights and dst OFMs should match");
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(),
+            misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
     }
 
@@ -91,8 +104,12 @@
     _type = CLKernelType::DIRECT;
 }
 
-void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst,
-                                     const Conv3dInfo &conv3d_info)
+void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context,
+                                     const ITensorInfo      *src0,
+                                     const ITensorInfo      *src1,
+                                     const ITensorInfo      *src2,
+                                     ITensorInfo            *dst,
+                                     const Conv3dInfo       &conv3d_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
 
@@ -149,13 +166,13 @@
     build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
     build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
 
-    if(src2 != nullptr)
+    if (src2 != nullptr)
     {
         build_options.add_option(std::string("-DHAS_BIAS"));
         build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(src2->data_type())));
     }
 
-    if(is_data_type_quantized(data_type))
+    if (is_data_type_quantized(data_type))
     {
         const UniformQuantizationInfo iqinfo = src0->quantization_info().uniform();
         const UniformQuantizationInfo wqinfo = src1->quantization_info().uniform();
@@ -218,7 +235,11 @@
     _config_id += support::cpp11::to_string(dst_channels);
 }
 
-Status ClDirectConv3dKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info)
+Status ClDirectConv3dKernel::validate(const ITensorInfo *src0,
+                                      const ITensorInfo *src1,
+                                      const ITensorInfo *src2,
+                                      const ITensorInfo *dst,
+                                      const Conv3dInfo  &conv3d_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv3d_info));
     return Status{};
@@ -229,21 +250,28 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
 
-    const auto src     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const auto biases  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
-    auto       dst     = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto weights =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto biases =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     // Get initial windows
     Window slice = window.first_slice_window_3D();
-    slice.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) * dst->info()->dimension(3), slice.y().step()), slice.y().step()));
+    slice.set(Window::DimY, Window::Dimension(0,
+                                              ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) *
+                                                                   dst->info()->dimension(3),
+                                                               slice.y().step()),
+                                              slice.y().step()));
     slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(4), 1));
 
     unsigned int idx = 0;
     add_4D_tensor_argument(idx, src, slice);
     add_4D_tensor_argument(idx, dst, slice);
     add_4D_tensor_argument(idx, weights, slice);
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         add_1D_tensor_argument(idx, biases, slice);
     }
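For readers trying to reproduce the layout in the hunks above: the rewraps fall out of a handful of clang-format options. As a rough sketch, the keys below are standard clang-format options, but the values are inferred from the formatted output and are assumptions, not the project's actual settings.

    ColumnLimit: 120                     # long signatures and calls wrap near column 120
    PointerAlignment: Right              # 'ITensorInfo *dst', not 'ITensorInfo* dst'
    AlignConsecutiveDeclarations: true   # pads types so parameter names line up in columns
    BinPackParameters: false             # a wrapped declaration gets one parameter per line
    SpaceBeforeParens: ControlStatements # 'if (', 'for (', 'switch (' gain a space; calls do not
    Cpp11BracedListStyle: true           # braced init lists lose inner padding, e.g. '{src1, src2, dst}'

Note that call arguments remain bin-packed (several per line) inside the wrapped macro invocations, so BinPackArguments presumably keeps its default of true; the one-per-line treatment is specific to parameter declarations. The breaks that later put a bare return type such as 'std::string' on a line of its own additionally suggest a low PenaltyReturnTypeOnItsOwnLine.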
diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.h b/src/gpu/cl/kernels/ClDirectConv3dKernel.h
index de4f0ce..cb7509d 100644
--- a/src/gpu/cl/kernels/ClDirectConv3dKernel.h
+++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.h
@@ -70,14 +70,23 @@
      * @param[out] dst             Destination tensor. 4 lower dimensions represent a single dst [OFM, width, height, depth], while the rest represent a batch of dsts.
      * @param[in]  conv3d_info     Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src0,
+                   const ITensorInfo      *src1,
+                   const ITensorInfo      *src2,
+                   ITensorInfo            *dst,
+                   const Conv3dInfo       &conv3d_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClDirectConv3dKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info);
+    static Status validate(const ITensorInfo *src0,
+                           const ITensorInfo *src1,
+                           const ITensorInfo *src2,
+                           const ITensorInfo *dst,
+                           const Conv3dInfo  &conv3d_info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/gpu/cl/kernels/ClElementwiseKernel.cpp
index 6beee57..cdb3527 100644
--- a/src/gpu/cl/kernels/ClElementwiseKernel.cpp
+++ b/src/gpu/cl/kernels/ClElementwiseKernel.cpp
@@ -23,18 +23,20 @@
  */
 #include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/common/utils/Validate.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
 #include "support/StringSupport.h"
+
 #include <map>
 
 namespace arm_compute
@@ -47,25 +49,20 @@
 {
 constexpr unsigned int vector_size_byte_opencl = 16;
 
-std::map<ArithmeticOperation, std::string> supported_arithmetic_ops =
-{
-    { ArithmeticOperation::ADD, "ADD" },
-    { ArithmeticOperation::SUB, "SUB" },
-    { ArithmeticOperation::DIV, "DIV" },
-    { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" },
-    { ArithmeticOperation::MIN, "MIN" },
-    { ArithmeticOperation::MAX, "MAX" },
-    { ArithmeticOperation::POWER, "POWER" },
-    { ArithmeticOperation::PRELU, "PRELU" },
+std::map<ArithmeticOperation, std::string> supported_arithmetic_ops = {
+    {ArithmeticOperation::ADD, "ADD"},     {ArithmeticOperation::SUB, "SUB"},
+    {ArithmeticOperation::DIV, "DIV"},     {ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF"},
+    {ArithmeticOperation::MIN, "MIN"},     {ArithmeticOperation::MAX, "MAX"},
+    {ArithmeticOperation::POWER, "POWER"}, {ArithmeticOperation::PRELU, "PRELU"},
 };
 
-std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops =
-{
-    { ArithmeticOperation::ADD, "ADD" },
-    { ArithmeticOperation::SUB, "SUB" },
+std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops = {
+    {ArithmeticOperation::ADD, "ADD"},
+    {ArithmeticOperation::SUB, "SUB"},
 };
 
-std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
+std::string
+generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
 {
     std::string config_id;
     // Set config_id for enabling LWS tuning
@@ -79,12 +76,18 @@
     return config_id;
 }
 
-Status validate_in_place_output_shape(const bool in_place, const bool src1_in_place, const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const TensorShape &out_shape)
+Status validate_in_place_output_shape(const bool         in_place,
+                                      const bool         src1_in_place,
+                                      const ITensorInfo &src1,
+                                      const ITensorInfo &src2,
+                                      const ITensorInfo &dst,
+                                      const TensorShape &out_shape)
 {
-    if(in_place)
+    if (in_place)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0),
-                                        "Wrong shape for dst, cannot do in_place calculation");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0),
+            "Wrong shape for dst, cannot do in_place calculation");
     }
     else
     {
@@ -94,7 +97,9 @@
     return Status{};
 }
 
-Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1,
+                                                          const ITensorInfo &src2,
+                                                          const ITensorInfo &dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1);
@@ -110,11 +115,12 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
     // Validate in case of configured dst
-    if(dst.total_size() > 0)
+    if (dst.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst);
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
     }
 
     return Status{};
@@ -136,25 +142,27 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
     // Validate in case of configured dst
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst);
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape));
     }
 
     return Status{};
 }
 
-Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+Status
+validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::S16, DataType::QSYMM16, DataType::F16,
-                                                         DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+                                                         DataType::F16, DataType::S32, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2);
 
-    if(is_data_type_quantized_symmetric(src1.data_type()))
+    if (is_data_type_quantized_symmetric(src1.data_type()))
     {
         const int32_t in1_offset = src1.quantization_info().uniform().offset;
         const int32_t in2_offset = src2.quantization_info().uniform().offset;
@@ -170,13 +178,15 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
     // Validate in case of configured dst
-    if(dst.total_size() > 0)
+    if (dst.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for dst");
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0),
+                                        "Wrong shape for dst");
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape));
 
-        if(is_data_type_quantized_symmetric(dst.data_type()))
+        if (is_data_type_quantized_symmetric(dst.data_type()))
         {
             const int32_t offset = dst.quantization_info().uniform().offset;
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(offset != 0, "For quantized symmetric, offset must be zero");
@@ -185,19 +195,26 @@
     return Status{};
 }
 
-CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const std::string &operation_string)
+CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1,
+                                                            const ITensorInfo &src2,
+                                                            const ITensorInfo &dst,
+                                                            const std::string &operation_string)
 {
     CLBuildOptions build_opts;
 
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
 
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1.data_type()));
-    build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_IN1=" +
+                          support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_IN2=" +
+                          support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration));
     build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DOP=" + operation_string);
-    if(is_data_type_quantized(src1.data_type()))
+    if (is_data_type_quantized(src1.data_type()))
     {
         const UniformQuantizationInfo iq1info = src1.quantization_info().uniform();
         const UniformQuantizationInfo iq2info = src2.quantization_info().uniform();
@@ -223,14 +240,17 @@
 
 std::pair<Status, Window> configure_window_arithmetic_common(ITensorInfo &dst)
 {
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
-    Window             win                               = calculate_max_window(dst, Steps(num_elems_processed_per_iteration));
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0));
+    Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration));
     return std::make_pair(Status{}, win);
 }
 
-std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+std::pair<Status, Window>
+validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
 {
-    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
+    const std::pair<TensorShape, ValidRegion> broadcast_pair =
+        ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
     const TensorShape &out_shape = broadcast_pair.first;
 
     auto_init_if_empty(dst, out_shape, 1, src1.data_type());
@@ -238,9 +258,11 @@
     return configure_window_arithmetic_common(dst);
 }
 
-std::pair<Status, Window> validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+std::pair<Status, Window>
+validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
 {
-    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
+    const std::pair<TensorShape, ValidRegion> broadcast_pair =
+        ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
     const TensorShape &out_shape = broadcast_pair.first;
 
     set_shape_if_empty(dst, out_shape);
@@ -249,9 +271,11 @@
     return configure_window_arithmetic_common(dst);
 }
 
-std::pair<Status, Window> validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+std::pair<Status, Window>
+validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
 {
-    const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
+    const std::pair<TensorShape, ValidRegion> broadcast_pair =
+        ITensorInfo::broadcast_shape_and_valid_region(src1, src2);
     const TensorShape &out_shape = broadcast_pair.first;
 
     auto_init_if_empty(dst, out_shape, 1, src1.data_type());
@@ -265,21 +289,24 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
+void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context,
+                                           ITensorInfo            *src1,
+                                           ITensorInfo            *src2,
+                                           ITensorInfo            *dst)
 {
     // Configure kernel window
     auto win_config = validate_and_configure_window(*src1, *src2, *dst);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
 
     std::string kernel_name = "elementwise_operation_" + name();
-    if(is_data_type_quantized(src1->data_type()))
+    if (is_data_type_quantized(src1->data_type()))
     {
         kernel_name += "_quantized";
     }
 
     // Set kernel build options
     CLBuildOptions build_opts = generate_build_options(*src1, *src2, *dst);
-    if(_act_info.enabled())
+    if (_act_info.enabled())
     {
         build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation())));
         build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(_act_info.a()));
@@ -299,9 +326,11 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    auto       dst   = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src_0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src_1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst);
 
@@ -311,17 +340,18 @@
 
     bool       can_collapse = true;
     const bool is_vector    = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1;
-    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
+    if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector)
     {
         can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+        for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
         {
             can_collapse = (in_shape1[d] == in_shape2[d]);
         }
     }
 
     bool   has_collapsed = false;
-    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+    Window collapsed =
+        can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
 
     const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
     const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -337,7 +367,7 @@
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, src_0, slice_src1);
         add_3D_tensor_argument(idx, src_1, slice_src2);
-        if(!in_place)
+        if (!in_place)
         {
             add_3D_tensor_argument(idx, dst, slice);
         }
@@ -345,13 +375,16 @@
         enqueue(queue, *this, slice, lws_hint());
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1));
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src2));
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 
 /** Logical binary */
 
-void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
+void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context,
+                                      LogicalOperation        op,
+                                      ITensorInfo            *src1,
+                                      ITensorInfo            *src2,
+                                      ITensorInfo            *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
     ARM_COMPUTE_ERROR_THROW_ON(ClLogicalBinaryKernel::validate(op, src1, src2, dst));
@@ -359,7 +392,10 @@
     configure_common(compile_context, src1, src2, dst);
 }
 
-Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
+Status ClLogicalBinaryKernel::validate(LogicalOperation   op,
+                                       const ITensorInfo *src1,
+                                       const ITensorInfo *src2,
+                                       const ITensorInfo *dst)
 {
     ARM_COMPUTE_UNUSED(op);
     ARM_COMPUTE_ASSERT(op != LogicalOperation::Unknown && op != LogicalOperation::Not);
@@ -369,14 +405,16 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
 
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone())
+            .first);
 
     return Status{};
 }
 
 std::string ClLogicalBinaryKernel::name()
 {
-    switch(_op)
+    switch (_op)
     {
         case LogicalOperation::And:
             return "AND";
@@ -390,30 +428,38 @@
     return "";
 }
 
-std::pair<Status, Window> ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+std::pair<Status, Window>
+ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
 {
     return validate_and_configure_window_for_logical_binary_operators(src1, src2, dst);
 }
 
-CLBuildOptions ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+CLBuildOptions
+ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
 {
     // The arithmetic utility functions can be shared
     return generate_build_options_with_arithmetic_rules(src1, src2, dst, name());
 }
 
-std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
+std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name,
+                                                          const ITensorInfo &src1,
+                                                          const ITensorInfo &dst)
 {
     return generate_id_for_tuning_common(kernel_name, src1, dst);
 }
 
 /** Arithmetic operations with saturation */
-void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output,
+void ClSaturatedArithmeticKernel::configure(const ClCompileContext    &compile_context,
+                                            ArithmeticOperation        op,
+                                            ITensorInfo               *input1,
+                                            ITensorInfo               *input2,
+                                            ITensorInfo               *output,
                                             const ConvertPolicy       &policy,
                                             const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(op, input1, input2, output, policy, act_info));
-    auto padding_info = get_padding_info({ input1, input2, output });
+    auto padding_info = get_padding_info({input1, input2, output});
 
     _policy   = policy;
     _op       = op;
@@ -422,24 +468,34 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy,
+Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation        op,
+                                             const ITensorInfo         *input1,
+                                             const ITensorInfo         *input2,
+                                             const ITensorInfo         *output,
+                                             const ConvertPolicy       &policy,
                                              const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(op, policy);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone())
+            .first);
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type()));
 
     return Status{};
 }
 
-std::pair<Status, Window> ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output)
+std::pair<Status, Window> ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1,
+                                                                                     ITensorInfo &input2,
+                                                                                     ITensorInfo &output)
 {
     return validate_and_configure_window_for_arithmetic_operators(input1, input2, output);
 }
 
-CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output)
+CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1,
+                                                                   const ITensorInfo &input2,
+                                                                   const ITensorInfo &output)
 {
     const bool has_float_out = is_data_type_float(output.data_type());
     auto       build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name());
@@ -447,7 +503,9 @@
     return build_options;
 }
 
-std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output)
+std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name,
+                                                                const ITensorInfo &input1,
+                                                                const ITensorInfo &output)
 {
     auto config_id = generate_id_for_tuning_common(kernel_name, input1, output);
     config_id += (_policy == ConvertPolicy::WRAP) ? "_wrap_" : "_saturate_";
@@ -461,12 +519,16 @@
 }
 
 /** Arithmetic operations */
-void ClArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
+void ClArithmeticKernel::configure(const ClCompileContext    &compile_context,
+                                   ArithmeticOperation        op,
+                                   ITensorInfo               *src1,
+                                   ITensorInfo               *src2,
+                                   ITensorInfo               *dst,
                                    const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
     ARM_COMPUTE_ERROR_THROW_ON(ClArithmeticKernel::validate(op, src1, src2, dst, act_info));
-    auto padding_info = get_padding_info({ src1, src2, dst });
+    auto padding_info = get_padding_info({src1, src2, dst});
 
     _op       = op;
     _act_info = act_info;
@@ -474,33 +536,42 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClArithmeticKernel::validate(ArithmeticOperation        op,
+                                    const ITensorInfo         *src1,
+                                    const ITensorInfo         *src2,
+                                    const ITensorInfo         *dst,
+                                    const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
-    if(op == ArithmeticOperation::DIV)
+    if (op == ArithmeticOperation::DIV)
     {
         // Partial integer support S32/F32/F16
         ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_divide_operation(src1, src2, dst));
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
     }
-    else if(op == ArithmeticOperation::POWER)
+    else if (op == ArithmeticOperation::POWER)
     {
         // Power operators don't support integer arithmetic
         ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_float_only_supported_rules(*src1, *src2, *dst));
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first);
     }
     else
     {
         ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst));
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()).first);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone())
+                .first);
     }
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
 
     return Status{};
 }
-std::pair<Status, Window> ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+std::pair<Status, Window>
+ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
 {
-    if(_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER)
+    if (_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER)
     {
         // Division and Power operators don't support integer arithmetic
         return validate_and_configure_window_for_division(src1, src2, dst);
@@ -511,11 +582,14 @@
     }
 }
 
-CLBuildOptions ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+CLBuildOptions
+ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
 {
     return generate_build_options_with_arithmetic_rules(src1, src2, dst, name());
 }
-std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
+std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name,
+                                                       const ITensorInfo &src1,
+                                                       const ITensorInfo &dst)
 {
     return generate_id_for_tuning_common(kernel_name, src1, dst);
 }
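The include reshuffles at the top of this file (ActivationFunctionUtils.h dropping below Utils.h, blank lines opening between the arm_compute, project-local, and system groups) are what clang-format's include regrouping produces under case-insensitive sorting. A plausible sketch, with hypothetical group regexes and priorities:

    SortIncludes: CaseInsensitive
    IncludeBlocks: Regroup
    IncludeCategories:
      - Regex: '^"arm_compute/'
        Priority: 1
      - Regex: '^"(src|support)/'
        Priority: 2
      - Regex: '^<'
        Priority: 3

The file's own header keeps its place at the top because clang-format pins a .cpp file's main header to priority 0 regardless of the category table.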
diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.h b/src/gpu/cl/kernels/ClElementwiseKernel.h
index ea3ddb2..73e5454 100644
--- a/src/gpu/cl/kernels/ClElementwiseKernel.h
+++ b/src/gpu/cl/kernels/ClElementwiseKernel.h
@@ -25,8 +25,9 @@
 #define ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
-#include "src/core/KernelTypes.h"
+
 #include "src/core/common/Macros.h"
+#include "src/core/KernelTypes.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
 
@@ -65,24 +66,28 @@
      *
      * @return a pair of Status and Window
      */
-    virtual std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0;
+    virtual std::pair<Status, Window>
+    validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0;
 
     /** Generate the build options for the specific kernel
      *
      * @return a CLBuildOptions struct
      */
-    virtual CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0;
+    virtual CLBuildOptions
+    generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0;
 
     /** Generate the identifier for tuning
      *
      * @return a string
      */
-    virtual std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0;
+    virtual std::string
+    generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0;
 
     /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff)
      *
      */
-    void configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
+    void
+    configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
 
     ActivationLayerInfo _act_info{};
 };
@@ -100,23 +105,31 @@
      * @param[in] src2            Second source tensor info. Data types supported: same as @p src1.
      * @param[in] dst             Destination tensor info. Data types supported: same as @p src1.
      */
-    void configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
+    void configure(const ClCompileContext &compile_context,
+                   LogicalOperation        op,
+                   ITensorInfo            *src1,
+                   ITensorInfo            *src2,
+                   ITensorInfo            *dst);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClLogicalBinaryKernel::configure()
      *
      * @return a status
      */
-    static Status validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
+    static Status
+    validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
 
 private:
     // Inherited methods overridden:
     std::string name() override;
-    std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
-    CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
-    std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
+    std::pair<Status, Window>
+    validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
+    CLBuildOptions
+    generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
+    std::string
+    generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
 
-    LogicalOperation _op{ LogicalOperation::Unknown };
+    LogicalOperation _op{LogicalOperation::Unknown};
 };
 
 /** Addition operation */
@@ -135,7 +148,12 @@
      * @param[in] policy          Policy to use to handle overflow.
      * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy,
+    void configure(const ClCompileContext    &compile_context,
+                   ArithmeticOperation        op,
+                   ITensorInfo               *input1,
+                   ITensorInfo               *input2,
+                   ITensorInfo               *output,
+                   const ConvertPolicy       &policy,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     /** Static function to check if given info will lead to a valid configuration
@@ -144,15 +162,23 @@
      *
      * @return a status
      */
-    static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy,
+    static Status validate(ArithmeticOperation        op,
+                           const ITensorInfo         *input1,
+                           const ITensorInfo         *input2,
+                           const ITensorInfo         *output,
+                           const ConvertPolicy       &policy,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
 protected:
     // Inherited methods overridden:
     std::string name() override;
-    std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
-    CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
-    std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override;
+    std::pair<Status, Window>
+    validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
+    CLBuildOptions
+    generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
+    std::string generate_id_for_tuning(const std::string &kernel_name,
+                                       const ITensorInfo &input1,
+                                       const ITensorInfo &output) override;
 
 private:
     ConvertPolicy       _policy{};
@@ -174,7 +200,11 @@
      * @param[in] dst             Destination tensor info. Data types supported: same as @p src1.
      * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
+    void configure(const ClCompileContext    &compile_context,
+                   ArithmeticOperation        op,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     /** Static function to check if given info will lead to a valid configuration
@@ -183,14 +213,21 @@
      *
      * @return a status
      */
-    static Status validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(ArithmeticOperation        op,
+                           const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
 protected:
     // Inherited methods overridden:
     std::string name() override;
-    std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
-    CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
-    std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
+    std::pair<Status, Window>
+    validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
+    CLBuildOptions
+    generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
+    std::string
+    generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
 
 private:
     ArithmeticOperation _op{};
diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
index 744a3a4..f7c198e 100644
--- a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
+++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
@@ -23,11 +23,12 @@
  */
 #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
 
-#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
@@ -46,17 +47,18 @@
 Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
-    if(op == ElementWiseUnary::LOGICAL_NOT)
+    if (op == ElementWiseUnary::LOGICAL_NOT)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8);
     }
-    else if(op == ElementWiseUnary::NEG)
+    else if (op == ElementWiseUnary::NEG)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32);
     }
-    else if(op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT.
+    else if (op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT.
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+                                                             DataType::QASYMM8_SIGNED);
     }
     else
     {
@@ -64,7 +66,7 @@
     }
 
     // Validate in case of configured dst
-    if(dst.total_size() > 0)
+    if (dst.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
@@ -80,19 +82,23 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op)
+void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context,
+                                         const ITensorInfo      *src,
+                                         ITensorInfo            *dst,
+                                         const ElementWiseUnary &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src, *dst, op));
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0));
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0));
 
-    std::string kernel_name    = "elementwise_unary";
-    const int   vec_size_x     = num_elems_processed_per_iteration;
-    const int   dst_width_x    = dst->dimension(0);
-    if(is_data_type_quantized(src->data_type()))
+    std::string kernel_name = "elementwise_unary";
+    const int   vec_size_x  = num_elems_processed_per_iteration;
+    const int   dst_width_x = dst->dimension(0);
+    if (is_data_type_quantized(src->data_type()))
     {
         kernel_name += "_quantized";
     }
@@ -101,7 +107,7 @@
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
     build_opts.add_option("-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
-    if(is_data_type_quantized(src->data_type()))
+    if (is_data_type_quantized(src->data_type()))
     {
         const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
         const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
@@ -110,7 +116,7 @@
         build_opts.add_option("-DSCALE_IN=" + float_to_string_with_full_precision(iqinfo.scale));
         build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
     }
-    switch(op)
+    switch (op)
     {
         case ElementWiseUnary::RSQRT:
             build_opts.add_option("-DOPERATION=rsqrt_op");
@@ -169,8 +175,9 @@
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice     = collapsed.first_slice_window_3D();
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     do
     {
@@ -178,8 +185,7 @@
         add_3D_tensor_argument(idx, src, slice);
         add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h
index 0f270f2..81721f8 100644
--- a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h
+++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h
@@ -47,7 +47,10 @@
      * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
      * @param[in]  op              Element wise unary operation to perform.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   ITensorInfo            *dst,
+                   const ElementWiseUnary &op);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClElementWiseUnaryKernel::configure()
diff --git a/src/gpu/cl/kernels/ClFillKernel.cpp b/src/gpu/cl/kernels/ClFillKernel.cpp
index a9345ee..96ad503 100644
--- a/src/gpu/cl/kernels/ClFillKernel.cpp
+++ b/src/gpu/cl/kernels/ClFillKernel.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -47,9 +48,10 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor,
-                             const PixelValue &constant_value,
-                             Window           *window)
+void ClFillKernel::configure(const CLCompileContext &compile_context,
+                             ITensorInfo            *tensor,
+                             const PixelValue       &constant_value,
+                             Window                 *window)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
     ARM_COMPUTE_ERROR_THROW_ON(validate(tensor, constant_value, window));
@@ -60,7 +62,7 @@
     // Create and update the window (if needed)
     _full_window = calculate_max_window(*tensor);
     Window win   = _full_window;
-    if(window != nullptr)
+    if (window != nullptr)
     {
         ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window);
         win = *window;
@@ -70,9 +72,10 @@
     const bool multi_access_x = output_width_x >= vec_size_x;
     const bool remainder_x    = output_width_x % vec_size_x > 0;
 
-    if(multi_access_x)
+    if (multi_access_x)
     {
-        win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
     }
     ICLKernel::configure_internal(win);
 
@@ -81,7 +84,9 @@
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
     build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
     build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
-    build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    build_opts.add_option_if(multi_access_x && remainder_x,
+                             "-DLAST_ACCESSED_X=" +
+                                 support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
     _kernel = create_kernel(compile_context, "memset", build_opts.options());
 }
 
@@ -89,7 +94,7 @@
 {
     ARM_COMPUTE_UNUSED(tensor);
     ARM_COMPUTE_UNUSED(constant_value);
-    if(window != nullptr)
+    if (window != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1);
     }
@@ -101,7 +106,8 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto tensor =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
 
     // Collapse all the batches on the third dimension
     Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ);
@@ -112,8 +118,7 @@
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, tensor, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClFillKernel.h b/src/gpu/cl/kernels/ClFillKernel.h
index f25cf92..5d69fbf 100644
--- a/src/gpu/cl/kernels/ClFillKernel.h
+++ b/src/gpu/cl/kernels/ClFillKernel.h
@@ -47,7 +47,10 @@
      * @param[in]     constant_value  The value used to fill the planes of the tensor
      * @param[in]     window          Window to be used in case setting only part of a tensor. Default is nullptr.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *tensor,
+                   const PixelValue       &constant_value,
+                   Window                 *window = nullptr);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClFillKernel::configure()
diff --git a/src/gpu/cl/kernels/ClFloorKernel.cpp b/src/gpu/cl/kernels/ClFloorKernel.cpp
index f9f8348..358e840 100644
--- a/src/gpu/cl/kernels/ClFloorKernel.cpp
+++ b/src/gpu/cl/kernels/ClFloorKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -52,7 +53,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
 
     // Validate in case of configured output
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -76,9 +77,9 @@
 
     // Validate
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
-    const unsigned int vec_size_x           = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0));
+    const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0));
     const int          vec_size_x_leftovers = src->dimension(0) % vec_size_x;
     CLBuildOptions     build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
@@ -105,8 +106,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
     Window slice     = collapsed.first_slice_window_3D();
@@ -117,8 +119,7 @@
         add_3D_tensor_argument(idx, src, slice);
         add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
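One recurring rewrite in the kernels above deserves a note: every do-while loop had its 'while' pulled up onto the closing brace, while all other braces keep their own line. That combination is consistent with the Allman preset, whose brace-wrapping expansion, if memory serves, leaves BeforeWhile false:

    BreakBeforeBraces: Allman
    # Allman puts every opening brace on its own line, but its BraceWrapping
    # expansion sets BeforeWhile to false, producing '} while (condition);'.

If the real configuration uses BreakBeforeBraces: Custom instead, the same effect needs an explicit 'BeforeWhile: false' entry in the BraceWrapping table.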
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp
index accafee..e0d925d 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp
@@ -29,14 +29,13 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -50,26 +49,35 @@
 {
 using ElementsProcessed = Steps;
 
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                          const GEMMReshapeInfo &gemm_info)
+Status validate_arguments(const ITensorInfo       *src0,
+                          const ITensorInfo       *src1,
+                          const ITensorInfo       *dst,
+                          const GEMMLHSMatrixInfo &lhs_info,
+                          const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMReshapeInfo   &gemm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    if(src0->data_type() == DataType::QASYMM8)
+    if (src0->data_type() == DataType::QASYMM8)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8,
+                                                             DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
     }
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
 
     const int m = gemm_info.m();
@@ -83,7 +91,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
     ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast<unsigned int>(n));
     ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast<unsigned int>(k));
-    if(gemm_info.reinterpret_input_as_3d())
+    if (gemm_info.reinterpret_input_as_3d())
     {
         ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
     }
@@ -92,9 +100,10 @@
         ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m));
     }
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+        const TensorInfo tensor_info_dst =
+            dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
     }
@@ -102,8 +111,13 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                                                        const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo       *src0,
+                                                        ITensorInfo             *src1,
+                                                        ITensorInfo             *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo   &gemm_info,
+                                                        ElementsProcessed       &num_elements_processed)
 {
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
@@ -115,17 +129,19 @@
 
     // In case both input and dst have to be reinterpreted as 3D tensors,
     // force reinterpret_dst_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_dst_as_3d)
+    if (reinterpret_input_as_3d == reinterpret_dst_as_3d)
     {
         reinterpret_dst_as_3d = false;
     }
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
+    auto_init_if_empty(*dst, src0->clone()
+                                 ->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))
+                                 .set_data_type(DataType::S32));
 
     TensorInfo tmp_info(*dst);
 
-    if(reinterpret_dst_as_3d)
+    if (reinterpret_dst_as_3d)
     {
         // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
         // the window needs to be constructed on the 2D collapsed version of the tensor
@@ -138,12 +154,12 @@
     num_elems_processed_per_iteration_x = rhs_info.n0;
     num_elems_processed_per_iteration_y = lhs_info.m0;
 
-    win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     // RHS matrix still needs padding on the X
-    AccessWindowStatic src1_access(src1, 0, 0,
-                                   ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x),
-                                   src1->dimension(1));
+    AccessWindowStatic src1_access(
+        src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1));
 
     window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop
 
@@ -153,7 +169,8 @@
     const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
     collapsed                                = win.collapse(win, dimension_to_collapse);
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, collapsed);
 }
 } // namespace
@@ -163,8 +180,13 @@
     _type = CLKernelType::GEMM;
 }
 
-void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst,
-                                                     const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext  &compile_context,
+                                                     const ITensorInfo       *src0,
+                                                     ITensorInfo             *src1,
+                                                     ITensorInfo             *dst,
+                                                     const GEMMLHSMatrixInfo &lhs_info,
+                                                     const GEMMRHSMatrixInfo &rhs_info,
+                                                     const GEMMReshapeInfo   &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
 
@@ -175,11 +197,11 @@
     _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
 
     // We still need padding on the X dimension for the RHS matrix
-    auto padding_info = get_padding_info({ src0, dst });
+    auto padding_info = get_padding_info({src0, dst});
 
     // In case both input and dst have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false.
-    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
     {
         _reinterpret_input_as_3d  = false;
         _reinterpret_output_as_3d = false;
@@ -192,7 +214,8 @@
     ElementsProcessed num_elements_processed{};
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+    auto win_config =
+        validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
@@ -212,8 +235,10 @@
     CLBuildOptions build_opts;
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
     build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                             "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                             "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
     build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
     build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1)));
@@ -258,19 +283,19 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
-                                                      const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo       *src0,
+                                                      const ITensorInfo       *src1,
+                                                      const ITensorInfo       *dst,
+                                                      const GEMMLHSMatrixInfo &lhs_info,
+                                                      const GEMMRHSMatrixInfo &rhs_info,
+                                                      const GEMMReshapeInfo   &gemm_info)
 {
     ElementsProcessed num_elements_processed{};
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
-                                                              src1->clone().get(),
-                                                              dst->clone().get(),
-                                                              lhs_info,
-                                                              rhs_info,
-                                                              gemm_info,
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+                                                              dst->clone().get(), lhs_info, rhs_info, gemm_info,
                                                               num_elements_processed)
-                                .first);
+                                    .first);
 
     return Status{};
 }
@@ -280,11 +305,13 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
-    if(src1->info()->num_dimensions() < 3)
+    if (src1->info()->num_dimensions() < 3)
     {
         // The stride_z for matrix B must be zero if we do not slice
         ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -296,7 +323,7 @@
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
-    if(_reinterpret_input_as_3d)
+    if (_reinterpret_input_as_3d)
     {
         // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
         const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
@@ -304,10 +331,10 @@
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
     }
 
-    if(_reinterpret_output_as_3d)
+    if (_reinterpret_output_as_3d)
     {
         // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+        const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
         const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
     }
@@ -317,7 +344,7 @@
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
+        if (!_slide_matrix_b)
         {
             slice_b = slice_matrix_b;
         }
@@ -330,8 +357,7 @@
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
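
Note: long signatures such as validate_arguments() and configure() above are rewrapped to one
parameter per line, with the pointer/reference sigil bound to the parameter name and the
declarations padded so the names line up. This matches BinPackParameters: false combined with an
align-consecutive-declarations setting; both are inferred from the output, since the configuration
file is not part of this delivery. A hypothetical declaration formatted under the same rules:

    struct TensorDesc; // placeholder types, not library classes
    struct MatInfo;
    struct Status;

    Status validate(const TensorDesc *src,
                    TensorDesc       *dst,
                    const MatInfo    &info);
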
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h
index 4b328e0..4f87096 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -55,25 +56,34 @@
      *                             rhs_info.k0: same as lhs_info.k0
      * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst,
-                   const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       *src0,
+                   ITensorInfo             *src1,
+                   ITensorInfo             *dst,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMReshapeInfo   &gemm_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst,
-                           const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
+    static Status validate(const ITensorInfo       *src0,
+                           const ITensorInfo       *src1,
+                           const ITensorInfo       *dst,
+                           const GEMMLHSMatrixInfo &lhs_info,
+                           const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMReshapeInfo   &gemm_info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    bool _slide_matrix_b{ true };
-    bool _reinterpret_input_as_3d{ false };
-    bool _reinterpret_output_as_3d{ false };
-    bool _use_dummy_work_items{ false };
+    bool _slide_matrix_b{true};
+    bool _reinterpret_input_as_3d{false};
+    bool _reinterpret_output_as_3d{false};
+    bool _use_dummy_work_items{false};
 };
 } // namespace kernels
 } // namespace opencl
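
Note: the { true } -> {true} member-initializer changes and the {src0, dst} call sites both follow
from dropping the inner padding of braced lists (Cpp11BracedListStyle: true, assumed). A minimal
hypothetical class showing the same rule:

    class KernelFlags // placeholder class, not part of the library
    {
        bool _slide{true};      // was: { true }
        bool _use_dummy{false}; // was: { false }
        int  _k{1};             // was: { 1 }
    };
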
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
index 15493f7..ddbc809 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
@@ -29,13 +29,12 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
 
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -51,45 +50,55 @@
 {
 using ElementsProcessed = Steps;
 
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst,
-                          const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+Status validate_arguments(const ITensorInfo       *src0,
+                          const ITensorInfo       *src1,
+                          const ITensorInfo       *dst,
+                          const GEMMLHSMatrixInfo &lhs_info,
+                          const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMReshapeInfo   &gemm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
     ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
 
     const int m = gemm_info.m();
     const int n = gemm_info.n();
     const int k = gemm_info.k();
 
-    TensorShape tensor_shape0{ src0->tensor_shape() };
+    TensorShape tensor_shape0{src0->tensor_shape()};
     tensor_shape0.set(0, k);
     tensor_shape0.set(1, m);
 
-    TensorShape tensor_shape1{ src1->tensor_shape() };
+    TensorShape tensor_shape1{src1->tensor_shape()};
     tensor_shape1.set(0, n);
     tensor_shape1.set(1, k);
 
     const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
     const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
 
-    const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
-    const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+    const TensorInfo tensor_info_reshaped0 =
+        src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
@@ -99,19 +108,24 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
-                                                        const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info,
-                                                        ElementsProcessed &num_elements_processed)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo       *src0,
+                                                        const ITensorInfo       *src1,
+                                                        ITensorInfo             *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo   &gemm_info,
+                                                        ElementsProcessed       &num_elements_processed)
 {
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
     bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
 
     TensorInfo tmp_info(*dst);
-    if(reinterpret_output_as_3d)
+    if (reinterpret_output_as_3d)
     {
         // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
         // the window needs to be constructed on the 2D collapsed version of the tensor
@@ -123,7 +137,8 @@
     // Configure kernel window
     num_elems_processed_per_iteration_x = rhs_info.n0;
     num_elems_processed_per_iteration_y = lhs_info.m0;
-    Window win                          = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    Window win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     // Collapse along the Z direction
     // This collapse needs to be here in order to tune the Z dimension of LWS
@@ -140,8 +155,13 @@
     _type = CLKernelType::GEMM;
 }
 
-void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
-                                                       const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext  &compile_context,
+                                                       const ITensorInfo       *src0,
+                                                       const ITensorInfo       *src1,
+                                                       ITensorInfo             *dst,
+                                                       const GEMMLHSMatrixInfo &lhs_info,
+                                                       const GEMMRHSMatrixInfo &rhs_info,
+                                                       const GEMMReshapeInfo   &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
@@ -154,11 +174,12 @@
     const unsigned int num_dimensionssrc0 = src0->num_dimensions();
     _slide_matrix_b                       = (src1->num_dimensions() >= num_dimensionssrc0);
 
-    auto              padding_info = get_padding_info({ src0, src1, dst });
+    auto              padding_info = get_padding_info({src0, src1, dst});
     ElementsProcessed num_elements_processed{};
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+    auto win_config =
+        validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
@@ -171,8 +192,10 @@
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+    build_opts.add_option_if(_reinterpret_output_as_3d,
+                             "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+    build_opts.add_option_if(_reinterpret_output_as_3d,
+                             "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
     build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
     build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
@@ -230,19 +253,19 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
-                                                        const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo       *src0,
+                                                        const ITensorInfo       *src1,
+                                                        const ITensorInfo       *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo   &gemm_info)
 {
     ElementsProcessed num_elements_processed{};
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
-                                                              src1->clone().get(),
-                                                              dst->clone().get(),
-                                                              lhs_info,
-                                                              rhs_info,
-                                                              gemm_info,
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+                                                              dst->clone().get(), lhs_info, rhs_info, gemm_info,
                                                               num_elements_processed)
-                                .first);
+                                    .first);
 
     return Status{};
 }
@@ -252,11 +275,13 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
-    if(src1->info()->num_dimensions() < 3)
+    if (src1->info()->num_dimensions() < 3)
     {
         // The stride_z for matrix B must be zero if we do not slice
         ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -268,7 +293,7 @@
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
-    if(_reinterpret_output_as_3d)
+    if (_reinterpret_output_as_3d)
     {
         // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
         const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 4;
@@ -281,7 +306,7 @@
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
+        if (!_slide_matrix_b)
         {
             slice_b = slice_matrix_b;
         }
@@ -295,8 +320,7 @@
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
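
Note: the include reshuffles in these files (e.g. Validate.h moving below StringUtils.h) are
consistent with case-insensitive include sorting (SortIncludes: CaseInsensitive, an inference).
Compared without regard to case, "misc" < "StringUtils" < "Validate", which yields exactly the
order the formatter produces:

    #include "arm_compute/core/utils/misc/ShapeCalculator.h"
    #include "arm_compute/core/utils/StringUtils.h"
    #include "arm_compute/core/Validate.h"

A case-sensitive sort would instead place Validate.h first ('V' < 'u' in ASCII), which is the old
order being replaced.
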
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h
index a16f500..d7b7859 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -64,25 +65,34 @@
      *
      * @note lhs_info.k0 must be equal to rhs_info.k0
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
-                   const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info);
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       *src0,
+                   const ITensorInfo       *src1,
+                   ITensorInfo             *dst,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMReshapeInfo   &gemm_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmLowpMatrixMultiplyReshapedKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info,
-                           const GEMMReshapeInfo &gemm_info);
+    static Status validate(const ITensorInfo       *src0,
+                           const ITensorInfo       *src1,
+                           const ITensorInfo       *dst,
+                           const GEMMLHSMatrixInfo &lhs_info,
+                           const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMReshapeInfo   &gemm_info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    bool         _slide_matrix_b{ true };
-    bool         _reinterpret_output_as_3d{ false };
-    unsigned int _k{ 1 };
-    bool         _use_dummy_work_items{ false };
+    bool         _slide_matrix_b{true};
+    bool         _reinterpret_output_as_3d{false};
+    unsigned int _k{1};
+    bool         _use_dummy_work_items{false};
 };
 } // namespace kernels
 } // namespace opencl
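
Note: taken together, the rewraps in this section point to a configuration roughly like
ColumnLimit: 120, SpaceBeforeParens: ControlStatements, BinPackParameters: false,
Cpp11BracedListStyle: true, and SortIncludes: CaseInsensitive. This is a reconstruction from the
formatted output only; the revised .clang-format file itself is not part of this delivery.
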
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp
index 5d552b8..2f1f3b8 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -29,14 +29,13 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
 
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -54,45 +53,57 @@
 {
 using ElementsProcessed = Steps;
 
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
-                          const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                          const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status validate_arguments(const ITensorInfo    *src0,
+                          const ITensorInfo    *src1,
+                          const ITensorInfo    *dst,
+                          const GEMMKernelInfo &gemm_info,
+                          const ITensorInfo    *vector_sum_col,
+                          const ITensorInfo    *vector_sum_row,
+                          const ITensorInfo    *bias,
+                          const ITensorInfo    *output_multipliers,
+                          const ITensorInfo    *output_shifts)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    if(src0->data_type() == DataType::QASYMM8)
+    if (src0->data_type() == DataType::QASYMM8)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8,
+                                                             DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL);
     }
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
 
     const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
     const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
     const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)),
+                                    "Only 2,3,4,8,16 are supported for k0");
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16),
+                                    "Only 2,3,4,8,16 are supported for n0");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
 
     const int m = gemm_info.m;
     const int n = gemm_info.n;
     const int k = gemm_info.k;
 
-    TensorShape tensor_shape1{ src1->tensor_shape() };
+    TensorShape tensor_shape1{src1->tensor_shape()};
     tensor_shape1.set(0, n);
     tensor_shape1.set(1, k);
 
-    const TensorInfo tensor_info1          = src1->clone()->set_tensor_shape(tensor_shape1);
-    const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+    const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
 
     ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
-    if(gemm_info.reinterpret_input_as_3d)
+    if (gemm_info.reinterpret_input_as_3d)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
     }
@@ -103,11 +114,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
 
     const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
-        if(output_stage.type == GEMMLowpOutputStageType::NONE)
+        if (output_stage.type == GEMMLowpOutputStageType::NONE)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
         }
@@ -117,39 +128,42 @@
         }
     }
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0));
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) ||
+                                        (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
                                     "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
 
     // Checks performed if the dst stage needs to be fused
-    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
     {
         // If a_offset == 0, vector_sum_col can be a nullptr
-        if(gemm_info.a_offset != 0)
+        if (gemm_info.a_offset != 0)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
             ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]);
         }
 
         // If b_offset == 0, vector_sum_row can be a nullptr
-        if(gemm_info.b_offset != 0)
+        if (gemm_info.b_offset != 0)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
 
             // Check if mm result is a 3D reinterpretation
-            const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
+            const bool reinterpret_as_3d =
+                expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
 
             // Validate input
-            ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2]));
+            ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+                                                                 (expected_dst_shape[1] * expected_dst_shape[2]));
             ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]);
 
-            if(expected_dst_shape.num_dimensions() > 1)
+            if (expected_dst_shape.num_dimensions() > 1)
             {
                 const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2;
 
@@ -161,30 +175,32 @@
                 ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx],
                                                 "vector_sum_row must have the same number of batches of dst tensor");
 
-                if(gemm_info.a_offset != 0)
+                if (gemm_info.a_offset != 0)
                 {
                     TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
                     vector_sum_col_shape.collapse_from(1);
 
-                    ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                                    "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+                    ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+                                                        vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                    "vector_sum_col tensor must have the same number of batches of "
+                                                    "vector_sum_row_shape or the number of batches must be set to 1");
                 }
             }
         }
 
-        if(dst->total_size() != 0)
+        if (dst->total_size() != 0)
         {
             ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
         }
         ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
 
-        if(output_multipliers != nullptr && output_shifts != nullptr)
+        if (output_multipliers != nullptr && output_shifts != nullptr)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
             ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
             ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-            if(output_stage.is_quantized_per_channel)
+            if (output_stage.is_quantized_per_channel)
             {
                 ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0));
                 ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0));
@@ -194,9 +210,16 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
-                                                        ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias,
-                                                        ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo    *src0,
+                                                        const ITensorInfo    *src1,
+                                                        ITensorInfo          *dst,
+                                                        const GEMMKernelInfo &gemm_info,
+                                                        ITensorInfo          *vector_sum_col,
+                                                        const ITensorInfo    *vector_sum_row,
+                                                        ITensorInfo          *bias,
+                                                        ITensorInfo          *output_multipliers,
+                                                        ITensorInfo          *output_shifts,
+                                                        ElementsProcessed    &num_elements_processed)
 {
     const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
 
@@ -211,16 +234,17 @@
 
     // In case both input and dst have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+    if (reinterpret_input_as_3d == reinterpret_output_as_3d)
     {
         reinterpret_output_as_3d = false;
     }
 
     // dst tensor auto initialization if not yet initialized
     const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
-    if(output_stage.type != GEMMLowpOutputStageType::NONE)
+    if (output_stage.type != GEMMLowpOutputStageType::NONE)
     {
-        auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
+        auto_init_if_empty(
+            *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
     }
     else
     {
@@ -229,7 +253,7 @@
 
     TensorInfo tmp_info(*dst);
 
-    if(reinterpret_output_as_3d)
+    if (reinterpret_output_as_3d)
     {
         // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
         // the window needs to be constructed on the 2D collapsed version of the tensor
@@ -242,12 +266,14 @@
     num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0;
     num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0;
 
-    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win_out =
+        calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
     {
-        if(gemm_info.a_offset != 0)
+        if (gemm_info.a_offset != 0)
         {
             AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
             window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access);
@@ -255,17 +281,19 @@
         // No access window needed for vector_sum_row
         ARM_COMPUTE_UNUSED(vector_sum_row);
 
-        if(bias != nullptr)
+        if (bias != nullptr)
         {
             AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
             window_changed = window_changed || update_window_and_padding(win_out, bias_access);
         }
 
-        if(output_multipliers != nullptr && output_stage.is_quantized_per_channel)
+        if (output_multipliers != nullptr && output_stage.is_quantized_per_channel)
         {
-            AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x);
+            AccessWindowHorizontal output_multipliers_access(output_multipliers, 0,
+                                                             num_elems_processed_per_iteration_x);
             AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
-            window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access);
+            window_changed =
+                window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access);
         }
     }
 
@@ -275,7 +303,8 @@
     const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
     collapsed                                = win.collapse(win, dimension_to_collapse);
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, collapsed);
 }
 } // namespace
@@ -285,15 +314,22 @@
     _type = CLKernelType::GEMM;
 }
 
-void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
-                                                              const GEMMKernelInfo &gemm_info,
-                                                              ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias,
-                                                              ITensorInfo *output_multipliers, ITensorInfo *output_shifts)
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context,
+                                                              const ITensorInfo      *src0,
+                                                              const ITensorInfo      *src1,
+                                                              ITensorInfo            *dst,
+                                                              const GEMMKernelInfo   &gemm_info,
+                                                              ITensorInfo            *vector_sum_col,
+                                                              const ITensorInfo      *vector_sum_row,
+                                                              ITensorInfo            *bias,
+                                                              ITensorInfo            *output_multipliers,
+                                                              ITensorInfo            *output_shifts)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+                                                  output_multipliers, output_shifts));
 
-    auto                          padding_info = get_padding_info({ src0, src1, dst, vector_sum_row });
+    auto                          padding_info = get_padding_info({src0, src1, dst, vector_sum_row});
     const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
     const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
     const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
@@ -307,7 +343,7 @@
 
     // In case both input and dst have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
     {
         _reinterpret_input_as_3d  = false;
         _reinterpret_output_as_3d = false;
@@ -320,7 +356,8 @@
     ElementsProcessed num_elements_processed{};
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed);
+    auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+                                                    output_multipliers, output_shifts, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
@@ -341,8 +378,10 @@
     CLBuildOptions build_opts;
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
     build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                             "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                             "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
     build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
     build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
@@ -361,12 +400,12 @@
     std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_");
     kernel_name += rhs_info.transpose ? "t" : "nt";
 
-    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
     {
         kernel_name += "_fused_output_stage_fixedpoint";
         _fuse_output_stage = true;
         // If a_offset == 0, vector_sum_col can be a nullptr
-        if(a_offset != 0 && vector_sum_col != nullptr)
+        if (a_offset != 0 && vector_sum_col != nullptr)
         {
             build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
             build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
@@ -377,9 +416,10 @@
         build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
         build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
         // In case of _is_quantized_per_channel, RESULT_MULTIPLIER and RESULT_SHIFT are not utilized, but they are passed as a part of T_QUANTIZE8 macro.
-        if(!_is_quantized_per_channel)
+        if (!_is_quantized_per_channel)
         {
-            build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
+            build_opts.add_option("-DRESULT_MULTIPLIER=" +
+                                  support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
             build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0]));
         }
         else
@@ -432,42 +472,56 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
-                                                               const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                                                               const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo    *src0,
+                                                               const ITensorInfo    *src1,
+                                                               const ITensorInfo    *dst,
+                                                               const GEMMKernelInfo &gemm_info,
+                                                               const ITensorInfo    *vector_sum_col,
+                                                               const ITensorInfo    *vector_sum_row,
+                                                               const ITensorInfo    *bias,
+                                                               const ITensorInfo    *output_multipliers,
+                                                               const ITensorInfo    *output_shifts)
 {
     ElementsProcessed num_elements_processed{};
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
-                                                              src1->clone().get(),
-                                                              dst->clone().get(),
-                                                              gemm_info,
-                                                              vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
-                                                              vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
-                                                              bias != nullptr ? bias->clone().get() : nullptr,
-                                                              output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
-                                                              output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
-                                                              num_elements_processed)
-                                .first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+                                                   output_multipliers, output_shifts));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info,
+                                      vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+                                      vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+                                      bias != nullptr ? bias->clone().get() : nullptr,
+                                      output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
+                                      output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
+                                      num_elements_processed)
+            .first);
 
     return Status{};
 }
 
-void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack      &tensors,
+                                                           const Window     &window,
+                                                           cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src0               = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src1               = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const auto bias               = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
-    const auto vector_sum_col     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
-    const auto vector_sum_row     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
-    const auto output_shifts      = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
-    const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
-    auto       dst                = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    const auto vector_sum_col =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+    const auto vector_sum_row =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+    const auto output_shifts =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
+    const auto output_multipliers =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
-    if(src1->info()->num_dimensions() < 3)
+    if (src1->info()->num_dimensions() < 3)
     {
         // The stride_z for matrix B must be zero if we do not slice
         ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -479,7 +533,7 @@
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
-    if(_reinterpret_input_as_3d)
+    if (_reinterpret_input_as_3d)
     {
         // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
         const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3;
@@ -487,10 +541,10 @@
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
     }
 
-    if(_reinterpret_output_as_3d)
+    if (_reinterpret_output_as_3d)
     {
         // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
-        const unsigned int idx0                  = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
+        const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0);
         const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom;
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
     }
@@ -515,7 +569,7 @@
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
+        if (!_slide_matrix_b)
         {
             slice_b = slice_matrix_b;
         }
@@ -527,19 +581,19 @@
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
-        if(_reinterpret_input_as_3d)
+        if (_reinterpret_input_as_3d)
         {
             // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
             idx++;
         }
 
-        if(_reinterpret_output_as_3d)
+        if (_reinterpret_output_as_3d)
         {
             // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
             idx++;
         }
 
-        if(_fuse_output_stage)
+        if (_fuse_output_stage)
         {
             add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col);
             add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row);
@@ -548,8 +602,7 @@
             add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice);
         }
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
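
Note: the `} while (...)` form above closes the per-slice dispatch loop in
run_op(): one enqueue per 3-D slice of the execution window. A minimal
standalone sketch of that iteration pattern, using toy stand-ins (ToyWindow,
printf-as-enqueue) rather than arm_compute's Window and enqueue():

    #include <cstdio>

    // Toy stand-in for arm_compute::Window: hands out one z slice at a
    // time and returns false once the last slice has been produced.
    struct ToyWindow
    {
        int z     = 0;
        int depth = 3;

        bool slide_window_slice_3D(ToyWindow &slice)
        {
            if (z + 1 >= depth)
            {
                return false; // no slices left: the do/while terminates
            }
            slice.z = ++z;
            return true;
        }
    };

    int main()
    {
        ToyWindow window;
        ToyWindow slice; // the first slice (z == 0) runs before any sliding
        do
        {
            std::printf("enqueue kernel on slice z=%d\n", slice.z);
        } while (window.slide_window_slice_3D(slice));
        return 0;
    }
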
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h
index a77604d..1d4696b 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -70,31 +71,44 @@
      * @param[in]  output_shifts      (Optional) Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
      *                                Supported data types: S32.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
-                   ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr,
-                   ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src0,
+                   const ITensorInfo      *src1,
+                   ITensorInfo            *dst,
+                   const GEMMKernelInfo   &gemm_info,
+                   ITensorInfo            *vector_sum_col     = nullptr,
+                   const ITensorInfo      *vector_sum_row     = nullptr,
+                   ITensorInfo            *bias               = nullptr,
+                   ITensorInfo            *output_multipliers = nullptr,
+                   ITensorInfo            *output_shifts      = nullptr);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
-                           const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr,
-                           const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+    static Status validate(const ITensorInfo    *src0,
+                           const ITensorInfo    *src1,
+                           const ITensorInfo    *dst,
+                           const GEMMKernelInfo &gemm_info,
+                           const ITensorInfo    *vector_sum_col     = nullptr,
+                           const ITensorInfo    *vector_sum_row     = nullptr,
+                           const ITensorInfo    *bias               = nullptr,
+                           const ITensorInfo    *output_multipliers = nullptr,
+                           const ITensorInfo    *output_shifts      = nullptr);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    bool _slide_matrix_b{ true };
-    bool _reinterpret_input_as_3d{ false };
-    bool _reinterpret_output_as_3d{ false };
-    bool _use_dummy_work_items{ false };
-    bool _is_quantized_per_channel{ false };
-    bool _fuse_output_stage{ false };
+    bool _slide_matrix_b{true};
+    bool _reinterpret_input_as_3d{false};
+    bool _reinterpret_output_as_3d{false};
+    bool _use_dummy_work_items{false};
+    bool _is_quantized_per_channel{false};
+    bool _fuse_output_stage{false};
 };
 } // namespace kernels
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
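
Note: the reflowed configure()/validate() declarations above lean on trailing
pointer parameters defaulted to nullptr for the optional tensors. A small
self-contained sketch of that calling convention (Info and configure() here
are hypothetical, not the kernel's API):

    #include <cstdio>

    struct Info { int dim0 = 0; }; // hypothetical stand-in for ITensorInfo

    // Mandatory arguments come first; optional tensors default to nullptr,
    // so callers pass only what they have (e.g. vector_sum_col is only
    // needed when a_offset != 0, mirroring the kernels in this change).
    void configure(const Info *src0, const Info *src1, Info *dst,
                   const Info *vector_sum_col = nullptr,
                   const Info *bias           = nullptr)
    {
        std::printf("has col sums: %d, has bias: %d\n",
                    vector_sum_col != nullptr, bias != nullptr);
        (void)src0; (void)src1; (void)dst;
    }

    int main()
    {
        Info a, b, d, sums;
        configure(&a, &b, &d);        // no optional tensors
        configure(&a, &b, &d, &sums); // column sums only
        return 0;
    }
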
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
index 792c71d..030c11d 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
@@ -23,16 +23,15 @@
  */
 #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
 
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 
 namespace arm_compute
@@ -47,39 +46,51 @@
 {
 using ElementsProcessed = Steps;
 
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
-                          const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                          const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status validate_arguments(const ITensorInfo    *src0,
+                          const ITensorInfo    *src1,
+                          const ITensorInfo    *dst,
+                          const GEMMKernelInfo &gemm_info,
+                          const ITensorInfo    *vector_sum_col,
+                          const ITensorInfo    *vector_sum_row,
+                          const ITensorInfo    *bias,
+                          const ITensorInfo    *output_multipliers,
+                          const ITensorInfo    *output_shifts)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+                                    "The extension cl_arm_matrix_multiply is not supported on the target platform");
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
 
     const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
     const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
     const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.k0 != 4 || lhs_info.k0 != 4, "Only 4 is supported as value for k0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4), "Only 1,2,4 are supported for m0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8), "Only 1,4,8 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4),
+                                    "Only 1,2,4 are supported for m0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8),
+                                    "Only 1,4,8 are supported for n0");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
 
     const int m = gemm_info.m;
     const int n = gemm_info.n;
     const int k = gemm_info.k;
 
-    TensorShape tensor_shape1{ src1->tensor_shape() };
+    TensorShape tensor_shape1{src1->tensor_shape()};
     tensor_shape1.set(0, n);
     tensor_shape1.set(1, k);
 
-    const TensorInfo tensor_info1          = src1->clone()->set_tensor_shape(tensor_shape1);
-    const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+    const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
 
     ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k));
-    if(gemm_info.reinterpret_input_as_3d)
+    if (gemm_info.reinterpret_input_as_3d)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m));
     }
@@ -90,11 +101,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
 
     const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
-        if(output_stage.type == GEMMLowpOutputStageType::NONE)
+        if (output_stage.type == GEMMLowpOutputStageType::NONE)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
         }
@@ -104,38 +115,41 @@
         }
     }
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0));
     }
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) ||
+                                        (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT),
                                     "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported");
 
     // Checks performed if the dst stage needs to be fused
-    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
     {
         // If a_offset == 0, vector_sum_col can be a nullptr
-        if(gemm_info.a_offset != 0)
+        if (gemm_info.a_offset != 0)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
             ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]);
         }
 
         // If b_offset == 0, vector_sum_row can be a nullptr
-        if(gemm_info.b_offset != 0)
+        if (gemm_info.b_offset != 0)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
 
             // Check if mm result is a 3D reinterpretation
-            const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
+            const bool reinterpret_as_3d =
+                expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x();
 
             // Validate input
-            ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2]));
+            ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+                                                                 (expected_dst_shape[1] * expected_dst_shape[2]));
             ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]);
 
-            if(expected_dst_shape.num_dimensions() > 1)
+            if (expected_dst_shape.num_dimensions() > 1)
             {
                 const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2;
 
@@ -147,30 +161,32 @@
                 ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx],
                                                 "vector_sum_row must have the same number of batches of dst tensor");
 
-                if(gemm_info.a_offset != 0)
+                if (gemm_info.a_offset != 0)
                 {
                     TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
                     vector_sum_col_shape.collapse_from(1);
 
-                    ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                                    "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+                    ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+                                                        vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                    "vector_sum_col tensor must have the same number of batches of "
+                                                    "vector_sum_row_shape or the number of batches must be set to 1");
                 }
             }
         }
 
-        if(dst->total_size() != 0)
+        if (dst->total_size() != 0)
         {
             ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
         }
         ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
 
-        if(output_multipliers != nullptr && output_shifts != nullptr)
+        if (output_multipliers != nullptr && output_shifts != nullptr)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32);
             ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
             ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-            if(output_stage.is_quantized_per_channel)
+            if (output_stage.is_quantized_per_channel)
             {
                 ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0));
                 ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0));
@@ -180,9 +196,16 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
-                                                        ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias,
-                                                        ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo    *src0,
+                                                        const ITensorInfo    *src1,
+                                                        ITensorInfo          *dst,
+                                                        const GEMMKernelInfo &gemm_info,
+                                                        ITensorInfo          *vector_sum_col,
+                                                        const ITensorInfo    *vector_sum_row,
+                                                        ITensorInfo          *bias,
+                                                        ITensorInfo          *output_multipliers,
+                                                        ITensorInfo          *output_shifts,
+                                                        ElementsProcessed    &num_elements_processed)
 {
     const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
 
@@ -200,9 +223,10 @@
     reinterpret_output_as_3d = false;
     // dst tensor auto initialization if not yet initialized
     const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info);
-    if(output_stage.type != GEMMLowpOutputStageType::NONE)
+    if (output_stage.type != GEMMLowpOutputStageType::NONE)
     {
-        auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
+        auto_init_if_empty(
+            *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type));
     }
     else
     {
@@ -211,7 +235,7 @@
 
     TensorInfo tmp_info(*dst);
 
-    if(reinterpret_output_as_3d)
+    if (reinterpret_output_as_3d)
     {
         // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
         // the window needs to be constructed on the 2D collapsed version of the tensor
@@ -224,11 +248,12 @@
     num_elems_processed_per_iteration_x = 1;
     num_elems_processed_per_iteration_y = 1;
 
-    win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
     {
-        if(gemm_info.a_offset != 0)
+        if (gemm_info.a_offset != 0)
         {
             AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x);
             window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access);
@@ -236,17 +261,19 @@
         // No access window needed for vector_sum_row
         ARM_COMPUTE_UNUSED(vector_sum_row);
 
-        if(bias != nullptr)
+        if (bias != nullptr)
         {
             AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x);
             window_changed = window_changed || update_window_and_padding(win, bias_access);
         }
 
-        if(output_multipliers != nullptr && output_stage.is_quantized_per_channel)
+        if (output_multipliers != nullptr && output_stage.is_quantized_per_channel)
         {
-            AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x);
+            AccessWindowHorizontal output_multipliers_access(output_multipliers, 0,
+                                                             num_elems_processed_per_iteration_x);
             AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x);
-            window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access);
+            window_changed =
+                window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access);
         }
     }
 
@@ -278,7 +305,8 @@
     collapsed.set(Window::DimX, x_dimension);
     collapsed.set(Window::DimY, y_dimension);
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, collapsed);
 }
 } // namespace
@@ -288,15 +316,22 @@
     _type = CLKernelType::GEMM;
 }
 
-void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
-                                                                  const GEMMKernelInfo &gemm_info,
-                                                                  ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias,
-                                                                  ITensorInfo *output_multipliers, ITensorInfo *output_shifts)
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context,
+                                                                  const ITensorInfo      *src0,
+                                                                  const ITensorInfo      *src1,
+                                                                  ITensorInfo            *dst,
+                                                                  const GEMMKernelInfo   &gemm_info,
+                                                                  ITensorInfo            *vector_sum_col,
+                                                                  const ITensorInfo      *vector_sum_row,
+                                                                  ITensorInfo            *bias,
+                                                                  ITensorInfo            *output_multipliers,
+                                                                  ITensorInfo            *output_shifts)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+                                                  output_multipliers, output_shifts));
 
-    auto                          padding_info = get_padding_info({ src0, src1, dst, vector_sum_row });
+    auto                          padding_info = get_padding_info({src0, src1, dst, vector_sum_row});
     const GEMMRHSMatrixInfo       rhs_info     = gemm_info.rhs_info;
     const GEMMLHSMatrixInfo       lhs_info     = gemm_info.lhs_info;
     const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage;
@@ -313,7 +348,8 @@
     ElementsProcessed num_elements_processed{};
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed);
+    auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+                                                    output_multipliers, output_shifts, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
@@ -334,18 +370,19 @@
     build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
     build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
     build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
-    build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option("-DACTIVATION_TYPE=" +
+                          lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
     build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
     build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
 
     std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_mmul");
 
-    if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+    if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
     {
         build_opts.add_option("-DFUSED_OUTPUT_STAGE_FIXED_POINT");
         _fuse_output_stage = true;
         // If a_offset == 0, vector_sum_col can be a nullptr
-        if(a_offset != 0 && vector_sum_col != nullptr)
+        if (a_offset != 0 && vector_sum_col != nullptr)
         {
             build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
             build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
@@ -396,42 +433,54 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
-                                                                   const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                                                                   const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo    *src0,
+                                                                   const ITensorInfo    *src1,
+                                                                   const ITensorInfo    *dst,
+                                                                   const GEMMKernelInfo &gemm_info,
+                                                                   const ITensorInfo    *vector_sum_col,
+                                                                   const ITensorInfo    *vector_sum_row,
+                                                                   const ITensorInfo    *bias,
+                                                                   const ITensorInfo    *output_multipliers,
+                                                                   const ITensorInfo    *output_shifts)
 {
     ElementsProcessed num_elements_processed{};
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
-                                                              src1->clone().get(),
-                                                              dst->clone().get(),
-                                                              gemm_info,
-                                                              vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
-                                                              vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
-                                                              bias != nullptr ? bias->clone().get() : nullptr,
-                                                              output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
-                                                              output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
-                                                              num_elements_processed)
-                                .first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias,
+                                                   output_multipliers, output_shifts));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info,
+                                      vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr,
+                                      vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr,
+                                      bias != nullptr ? bias->clone().get() : nullptr,
+                                      output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr,
+                                      output_shifts != nullptr ? output_shifts->clone().get() : nullptr,
+                                      num_elements_processed)
+            .first);
 
     return Status{};
 }
 
-void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack      &tensors,
+                                                               const Window     &window,
+                                                               cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src0           = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src1           = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const auto src2           = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
-    const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
-    const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
-    auto       dst            = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto src2 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    const auto vector_sum_col =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+    const auto vector_sum_row =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
 
-    if(src1->info()->num_dimensions() < 3)
+    if (src1->info()->num_dimensions() < 3)
     {
         // The stride_z for matrix B must be zero if we do not slice
         ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -449,7 +498,7 @@
         add_3d_tensor_nhw_argument(idx, src1);
 
         // Bias buffer (_add_bias == true)
-        if(src2 != nullptr)
+        if (src2 != nullptr)
         {
             add_3d_tensor_nhw_argument(idx, src2);
         }
@@ -461,21 +510,20 @@
         _kernel.setArg<cl_int>(idx++, _n);
         _kernel.setArg<cl_int>(idx++, _k);
 
-        if(_fuse_output_stage)
+        if (_fuse_output_stage)
         {
-            if(vector_sum_col != nullptr)
+            if (vector_sum_col != nullptr)
             {
                 add_3d_tensor_nhw_argument(idx, vector_sum_col);
             }
-            if(vector_sum_row != nullptr)
+            if (vector_sum_row != nullptr)
             {
                 add_3d_tensor_nhw_argument(idx, vector_sum_row);
             }
         }
 
         enqueue(queue, *this, slice, cl::NDRange(32, 2), false);
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
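
Note: the reflowed validate_arguments() above pins the MMUL variant to
k0 == 4, m0 in {1, 2, 4} and n0 in {1, 4, 8}. A standalone restatement of
just those constraints (a sketch for reference, not the library's
validation path):

    #include <cstdio>

    // Mirrors the three block-size checks in validate_arguments() above.
    bool mmul_block_sizes_valid(int m0, int n0, int k0)
    {
        const bool m0_ok = (m0 == 1 || m0 == 2 || m0 == 4);
        const bool n0_ok = (n0 == 1 || n0 == 4 || n0 == 8);
        const bool k0_ok = (k0 == 4);
        return m0_ok && n0_ok && k0_ok;
    }

    int main()
    {
        std::printf("%d\n", mmul_block_sizes_valid(2, 8, 4)); // 1: accepted
        std::printf("%d\n", mmul_block_sizes_valid(3, 8, 4)); // 0: m0 == 3 rejected
        return 0;
    }
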
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h
index 0ae549c..fc8b731 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/IClKernel.h"
 
@@ -65,29 +66,42 @@
      * @param[in]  output_multipliers (Optional) Output multipliers tensor. Supported data types: S32.
      * @param[in]  output_shifts      (Optional) Output shifts tensor. Supported data types: S32.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
-                   ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr,
-                   ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src0,
+                   const ITensorInfo      *src1,
+                   ITensorInfo            *dst,
+                   const GEMMKernelInfo   &gemm_info,
+                   ITensorInfo            *vector_sum_col     = nullptr,
+                   const ITensorInfo      *vector_sum_row     = nullptr,
+                   ITensorInfo            *bias               = nullptr,
+                   ITensorInfo            *output_multipliers = nullptr,
+                   ITensorInfo            *output_shifts      = nullptr);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info,
-                           const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr,
-                           const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr);
+    static Status validate(const ITensorInfo    *src0,
+                           const ITensorInfo    *src1,
+                           const ITensorInfo    *dst,
+                           const GEMMKernelInfo &gemm_info,
+                           const ITensorInfo    *vector_sum_col     = nullptr,
+                           const ITensorInfo    *vector_sum_row     = nullptr,
+                           const ITensorInfo    *bias               = nullptr,
+                           const ITensorInfo    *output_multipliers = nullptr,
+                           const ITensorInfo    *output_shifts      = nullptr);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    bool       _fuse_output_stage{ false };
-    signed int _m{ 1 };
-    signed int _n{ 1 };
-    signed int _k{ 1 };
+    bool       _fuse_output_stage{false};
+    signed int _m{1};
+    signed int _n{1};
+    signed int _k{1};
 };
 } // namespace kernels
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMULKERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H */
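
Note: both reshaped-only-RHS kernels in this change can fuse a
GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT stage, parameterised by
the RESULT_MULTIPLIER / RESULT_SHIFT / RESULT_OFFSET build options seen
earlier. For reference, a scalar sketch of the standard gemmlowp
fixed-point requantization those options describe (assuming a non-negative
shift; this is the textbook scheme, not a transcription of the CL kernel):

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>

    // Saturating rounding doubling high multiply: round(a * b / 2^31),
    // saturated on the single overflowing input pair.
    int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if (a == INT32_MIN && b == INT32_MIN)
        {
            return INT32_MAX;
        }
        const int64_t ab    = static_cast<int64_t>(a) * static_cast<int64_t>(b);
        const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
        return static_cast<int32_t>((ab + nudge) / (int64_t{1} << 31));
    }

    // Rounding arithmetic shift right by a non-negative exponent.
    int32_t rounding_divide_by_pot(int32_t x, int exponent)
    {
        const int32_t mask      = (int32_t{1} << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

    // acc is the int32 GEMM accumulator; multiplier, shift and offset play
    // the roles of RESULT_MULTIPLIER, RESULT_SHIFT and RESULT_OFFSET.
    uint8_t quantize_down_fixedpoint(int32_t acc, int32_t multiplier, int shift, int32_t offset)
    {
        int32_t v = rounding_doubling_high_mul(acc, multiplier);
        v         = rounding_divide_by_pot(v, shift) + offset;
        return static_cast<uint8_t>(std::clamp<int32_t>(v, 0, 255));
    }

    int main()
    {
        // ~0.5 as a Q0.31 multiplier, then >> 4: 4096 * 0.5 / 16 + 10 = 138.
        std::printf("%u\n", quantize_down_fixedpoint(4096, 1 << 30, 4, 10));
        return 0;
    }
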
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp
index 9ec0b51..d93dbde 100644
--- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp
@@ -28,11 +28,10 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
 
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -44,12 +43,16 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                          int32_t a_offset, int32_t b_offset)
+Status validate_arguments(const ITensorInfo *mm_result,
+                          const ITensorInfo *vector_sum_col,
+                          const ITensorInfo *vector_sum_row,
+                          const ITensorInfo *bias,
+                          int32_t            a_offset,
+                          int32_t            b_offset)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
@@ -57,26 +60,28 @@
     }
 
     // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
+    if (a_offset != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
         ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
     }
 
     // If b_offset == 0, vector_sum_row can be a nullptr
-    if(b_offset != 0)
+    if (b_offset != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
 
         // Check if input is a 3D reinterpretation
-        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+        const bool reinterpret_as_3d =
+            mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
 
         // Validate input
-        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+                                                             (mm_result->dimension(1) * mm_result->dimension(2)));
         ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
 
         TensorShape output_shape = mm_result->tensor_shape();
-        if(output_shape.num_dimensions() > 1)
+        if (output_shape.num_dimensions() > 1)
         {
             const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
 
@@ -87,13 +92,15 @@
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
                                             "mm_result tensor must have the same number of batches of output tensor");
 
-            if(a_offset != 0)
+            if (a_offset != 0)
             {
                 TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
                 vector_sum_col_shape.collapse_from(1);
 
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+                                                    vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches of "
+                                                "vector_sum_row_shape or the number of batches must be set to 1");
             }
         }
     }
@@ -108,29 +115,34 @@
 }
 
 void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context,
-                                                   const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                                                   int32_t k, int32_t a_offset, int32_t b_offset)
+                                                   const ITensorInfo      *mm_result,
+                                                   const ITensorInfo      *vector_sum_col,
+                                                   const ITensorInfo      *vector_sum_row,
+                                                   const ITensorInfo      *bias,
+                                                   int32_t                 k,
+                                                   int32_t                 a_offset,
+                                                   int32_t                 b_offset)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
 
-    auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias });
+    auto padding_info = get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias});
 
     // Check if input is a 3D reinterpretation
-    const bool reinterpret_as_3d = vector_sum_row != nullptr
-                                   && mm_result->num_dimensions() > 1
-                                   && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+    const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 &&
+                                   mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
 
     const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0));
 
     // Set the arguments to pass at compile time
     CLBuildOptions build_opts;
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
 
     // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
+    if (a_offset != 0)
     {
         build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
         build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
@@ -138,8 +150,10 @@
     // If b_offset == 0, vector_sum_row can be a nullptr
     build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
     build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
-    build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
-    build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
+    build_opts.add_option_if(reinterpret_as_3d,
+                             "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
+    build_opts.add_option_if(reinterpret_as_3d,
+                             "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
     build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
 
     std::string kernel_name("gemmlowp_offset_contribution");
@@ -165,10 +179,15 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                                                    int32_t a_offset, int32_t b_offset)
+Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result,
+                                                    const ITensorInfo *vector_sum_col,
+                                                    const ITensorInfo *vector_sum_row,
+                                                    const ITensorInfo *bias,
+                                                    int32_t            a_offset,
+                                                    int32_t            b_offset)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset));
     return Status{};
 }
 
@@ -177,10 +196,13 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window);
 
-    const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
-    const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
-    const auto bias           = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
-    const auto mm_result      = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST));
+    const auto vector_sum_col =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+    const auto vector_sum_row =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+    const auto bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    const auto mm_result = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST));
 
     Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
     Window slice     = collapsed.first_slice_window_3D();
@@ -209,8 +231,7 @@
         add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice);
 
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
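
The run_op above follows the library's usual slice-iteration pattern; an annotated sketch of the control flow:

    Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
    Window slice     = collapsed.first_slice_window_3D(); // first 3D chunk of the work
    do
    {
        unsigned int idx = 0;
        // tensor arguments are rebound against the current slice here,
        // then a single NDRange is enqueued for it
        enqueue(queue, *this, slice, lws_hint());
    } while (collapsed.slide_window_slice_3D(slice));     // advance until exhausted
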
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h
index 48926e2..2080a3a 100644
--- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h
@@ -67,15 +67,25 @@
      * @param[in]      b_offset        Offset to be added to each element of the matrix B.
      */
     void configure(const CLCompileContext &compile_context,
-                   const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                   int32_t k, int32_t a_offset, int32_t b_offset);
+                   const ITensorInfo      *mm_result,
+                   const ITensorInfo      *vector_sum_col,
+                   const ITensorInfo      *vector_sum_row,
+                   const ITensorInfo      *bias,
+                   int32_t                 k,
+                   int32_t                 a_offset,
+                   int32_t                 b_offset);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmLowpOffsetContributionKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset);
+    static Status validate(const ITensorInfo *mm_result,
+                           const ITensorInfo *vector_sum_col,
+                           const ITensorInfo *vector_sum_row,
+                           const ITensorInfo *bias,
+                           int32_t            a_offset,
+                           int32_t            b_offset);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
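
A hedged usage sketch of the two-step pattern these declarations imply (tensor-info names are hypothetical; validate() is expected to pass before configure() is called):

    ClGemmLowpOffsetContributionKernel kernel;
    const Status s = ClGemmLowpOffsetContributionKernel::validate(
        &mm_result_info, &sum_col_info, &sum_row_info, &bias_info, a_offset, b_offset);
    if (s.error_code() == ErrorCode::OK) // assumes Status exposes error_code()
    {
        kernel.configure(compile_context, &mm_result_info, &sum_col_info, &sum_row_info,
                         &bias_info, k, a_offset, b_offset);
        // at run time: kernel.run_op(tensors, kernel.window(), queue);
    }
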
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp
index c5fb54f..26f479f 100644
--- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp
@@ -34,7 +34,6 @@
 
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -46,12 +45,20 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst,
-                          int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status validate_arguments(const ITensorInfo             *mm_result,
+                          const ITensorInfo             *vector_sum_col,
+                          const ITensorInfo             *vector_sum_row,
+                          const ITensorInfo             *bias,
+                          const ITensorInfo             *dst,
+                          int32_t                        a_offset,
+                          int32_t                        b_offset,
+                          const GEMMLowpOutputStageInfo &output_stage,
+                          const ITensorInfo             *output_multipliers,
+                          const ITensorInfo             *output_shifts)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32);
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
@@ -62,33 +69,35 @@
     ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1);
-    if(output_stage.is_quantized_per_channel)
+    if (output_stage.is_quantized_per_channel)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0));
         ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0));
     }
 
     // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
+    if (a_offset != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32);
         ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0));
     }
 
     // If b_offset == 0, vector_sum_row can be a nullptr
-    if(b_offset != 0)
+    if (b_offset != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32);
 
         // Check if input is a 3D reinterpretation
-        const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+        const bool reinterpret_as_3d =
+            mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
 
         // Validate input
-        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2)));
+        ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) !=
+                                                             (mm_result->dimension(1) * mm_result->dimension(2)));
         ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1));
 
         TensorShape output_shape = mm_result->tensor_shape();
-        if(output_shape.num_dimensions() > 1)
+        if (output_shape.num_dimensions() > 1)
         {
             const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2;
 
@@ -99,20 +108,22 @@
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx],
                                             "mm_result tensor must have the same number of batches of output tensor");
 
-            if(a_offset != 0)
+            if (a_offset != 0)
             {
                 TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape();
                 vector_sum_col_shape.collapse_from(1);
 
-                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1],
-                                                "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1");
+                ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 &&
+                                                    vector_sum_col_shape[1] != vector_sum_row_shape[1],
+                                                "vector_sum_col tensor must have the same number of batches of "
+                                                "vector_sum_row_shape or the number of batches must be set to 1");
             }
         }
     }
 
     ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE);
     // Checks performed when output is configured
-    if((dst != nullptr) && (dst->total_size() != 0))
+    if ((dst != nullptr) && (dst->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type());
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
@@ -120,7 +131,8 @@
     }
 
     ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(),
+                                    "per channel quantization info is incorrect");
 
     return Status{};
 }
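
A concrete shape example for the reinterpret-as-3D branch above (values are hypothetical):

    // 3D case:    mm_result (W=32, H=24, D=4, B=2), vector_sum_row (96, 2)
    //   tensor_shape().y() == 24 != 96  -> reinterpret_as_3d = true
    //   required: vector_sum_row->dimension(0) == 24 * 4 == 96, batch index = 3
    // plain case: mm_result (32, 24, 2), vector_sum_row (24, 2)
    //   tensor_shape().y() == 24 == 24  -> reinterpret_as_3d = false
    //   required: vector_sum_row->dimension(0) == 24, batch index = 2
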
@@ -131,16 +143,26 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context,
-                                                              const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst,
-                                                              int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
-                                                              const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext        &compile_context,
+                                                              const ITensorInfo             *mm_result,
+                                                              const ITensorInfo             *vector_sum_col,
+                                                              const ITensorInfo             *vector_sum_row,
+                                                              const ITensorInfo             *bias,
+                                                              ITensorInfo                   *dst,
+                                                              int32_t                        k,
+                                                              int32_t                        a_offset,
+                                                              int32_t                        b_offset,
+                                                              const GEMMLowpOutputStageInfo &output_stage,
+                                                              const ITensorInfo             *output_multipliers,
+                                                              const ITensorInfo             *output_shifts)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset,
+                                                  b_offset, output_stage, output_multipliers, output_shifts));
 
-    auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts });
+    auto padding_info =
+        get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts});
 
     const int min = output_stage.gemmlowp_min_bound;
     const int max = output_stage.gemmlowp_max_bound;
@@ -148,9 +170,8 @@
     _is_quantized_per_channel = output_stage.is_quantized_per_channel;
 
     // Check if input is a 3D reinterpretation
-    const bool reinterpret_as_3d = vector_sum_row != nullptr
-                                   && mm_result->num_dimensions() > 1
-                                   && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
+    const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 &&
+                                   mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x();
 
     // Auto initialize the output
     auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type));
@@ -160,10 +181,11 @@
     // Set the arguments to pass at compile time
     CLBuildOptions build_opts;
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration));
 
     // If a_offset == 0, vector_sum_col can be a nullptr
-    if(a_offset != 0)
+    if (a_offset != 0)
     {
         build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset));
         build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES");
@@ -171,8 +193,10 @@
     // If b_offset == 0, vector_sum_row can be a nullptr
     build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset));
     build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k));
-    build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
-    build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
+    build_opts.add_option_if(reinterpret_as_3d,
+                             "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1)));
+    build_opts.add_option_if(reinterpret_as_3d,
+                             "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2)));
     build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
     build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset));
     build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0]));
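
RESULT_OFFSET, RESULT_MULTIPLIER and RESULT_SHIFT drive the quantize-down that follows the offset contribution. A rough scalar model, assuming the fixed-point output-stage path with the usual gemmlowp Q0.31 multiplier convention (saturating/rounding details omitted):

    #include <algorithm>
    #include <cstdint>

    int32_t quantize_down_sketch(int32_t acc, int32_t multiplier, int32_t shift,
                                 int32_t result_offset, int32_t lo, int32_t hi)
    {
        const int64_t prod = static_cast<int64_t>(acc) * multiplier; // multiplier is Q0.31
        const int32_t v    = static_cast<int32_t>(prod >> (31 + shift)) + result_offset;
        return std::min(std::max(v, lo), hi); // then narrowed to QASYMM8/QASYMM8_SIGNED
    }
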
@@ -210,26 +234,42 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias,
-                                                               const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
-                                                               const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts)
+Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo             *mm_result,
+                                                               const ITensorInfo             *vector_sum_col,
+                                                               const ITensorInfo             *vector_sum_row,
+                                                               const ITensorInfo             *bias,
+                                                               const ITensorInfo             *dst,
+                                                               int32_t                        a_offset,
+                                                               int32_t                        b_offset,
+                                                               const GEMMLowpOutputStageInfo &output_stage,
+                                                               const ITensorInfo             *output_multipliers,
+                                                               const ITensorInfo             *output_shifts)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset,
+                                                   b_offset, output_stage, output_multipliers, output_shifts));
     return Status{};
 }
 
-void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack      &tensors,
+                                                           const Window     &window,
+                                                           cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto mm_result          = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    const auto bias               = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
-    const auto vector_sum_col     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
-    const auto vector_sum_row     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
-    const auto output_shifts      = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
-    const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
-    auto       dst                = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto mm_result =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    const auto vector_sum_col =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM));
+    const auto vector_sum_row =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM));
+    const auto output_shifts =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS));
+    const auto output_multipliers =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice     = collapsed.first_slice_window_3D();
@@ -260,8 +300,7 @@
         add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice);
         add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h
index cee0447..97ee9bc 100644
--- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h
@@ -66,23 +66,40 @@
      * @param[in]  output_shifts      Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
      *                                Supported data types: S32
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst,
-                   int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage,
-                   const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
+    void configure(const CLCompileContext        &compile_context,
+                   const ITensorInfo             *mm_result,
+                   const ITensorInfo             *vector_sum_col,
+                   const ITensorInfo             *vector_sum_row,
+                   const ITensorInfo             *bias,
+                   ITensorInfo                   *dst,
+                   int32_t                        k,
+                   int32_t                        a_offset,
+                   int32_t                        b_offset,
+                   const GEMMLowpOutputStageInfo &output_stage,
+                   const ITensorInfo             *output_multipliers,
+                   const ITensorInfo             *output_shifts);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset,
-                           int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts);
+    static Status validate(const ITensorInfo             *mm_result,
+                           const ITensorInfo             *vector_sum_col,
+                           const ITensorInfo             *vector_sum_row,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *dst,
+                           int32_t                        a_offset,
+                           int32_t                        b_offset,
+                           const GEMMLowpOutputStageInfo &output_stage,
+                           const ITensorInfo             *output_multipliers,
+                           const ITensorInfo             *output_shifts);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    bool _is_quantized_per_channel{ false };
+    bool _is_quantized_per_channel{false};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
index 3975438..7b7beab 100644
--- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp
@@ -27,15 +27,14 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
 
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -47,20 +46,23 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info)
+Status validate_arguments(const ITensorInfo             *src,
+                          const ITensorInfo             *bias,
+                          const ITensorInfo             *dst,
+                          const GEMMLowpOutputStageInfo *info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
 
     // Check biases if exist
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
     }
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type");
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -75,7 +77,9 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo             *src,
+                                                                    const ITensorInfo             *bias,
+                                                                    const ITensorInfo             *dst,
                                                                     const GEMMLowpOutputStageInfo *info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -84,14 +88,17 @@
     return Status{};
 }
 
-void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext        &compile_context,
+                                                                   const ITensorInfo             *src,
+                                                                   const ITensorInfo             *bias,
+                                                                   ITensorInfo                   *dst,
                                                                    const GEMMLowpOutputStageInfo *info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
 
-    auto padding_info = get_padding_info({ src, bias, dst });
+    auto padding_info = get_padding_info({src, bias, dst});
 
     // dst auto initialization if not yet initialized
     auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type));
@@ -103,19 +110,26 @@
     auto           max = info->gemmlowp_max_bound;
     CLBuildOptions build_opts;
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset));
     build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier));
     build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift));
     build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
-    build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
-                             "-DMIN_BOUND=" + support::cpp11::to_string(min));
-    build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max),
-                             "-DMAX_BOUND=" + support::cpp11::to_string(max));
+    build_opts.add_option_if(
+        (min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) &&
+            (min != max),
+        "-DMIN_BOUND=" + support::cpp11::to_string(min));
+    build_opts.add_option_if(
+        (max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) &&
+            (min != max),
+        "-DMAX_BOUND=" + support::cpp11::to_string(max));
     build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
 
     // Create kernel
-    const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint";
+    const std::string kernel_name = (info->output_data_type == DataType::QSYMM16)
+                                        ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16"
+                                        : "gemmlowp_output_stage_quantize_down_fixedpoint";
 
     // A macro guard to compile ONLY the kernel of interest
     build_opts.add_option("-D" + upper_string(kernel_name));
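
For context, get_min_max_values_from_quantized_data_type returns the representable range of the output type, so MIN_BOUND/MAX_BOUND are only defined when the requested clamp is tighter than the type itself:

    // QASYMM8        -> [0, 255]
    // QASYMM8_SIGNED -> [-128, 127]
    // QSYMM16        -> [-32768, 32767]
    // e.g. min == 0 for QASYMM8 emits no -DMIN_BOUND: clamping at 0 would be a no-op.
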
@@ -129,14 +143,18 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack      &tensors,
+                                                                const Window     &window,
+                                                                cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
-    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     // Create src window
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
@@ -144,7 +162,7 @@
 
     // Setup bias slice
     unsigned int idx1 = num_arguments_per_3D_tensor();
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         Window biases_slice(slice);
         biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -158,8 +176,7 @@
         add_3D_tensor_argument(idx, src, slice);
         add_3D_tensor_argument(idx1, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h
index 69b5fc5..71c9f4b 100644
--- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h
@@ -60,14 +60,21 @@
      * @param[out] dst             Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16.
      * @param[in]  info            Output stage info. Used to pass the quantized output data type
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info);
+    void configure(const CLCompileContext        &compile_context,
+                   const ITensorInfo             *src,
+                   const ITensorInfo             *bias,
+                   ITensorInfo                   *dst,
+                   const GEMMLowpOutputStageInfo *info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info);
+    static Status validate(const ITensorInfo             *src,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *dst,
+                           const GEMMLowpOutputStageInfo *info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp
index f379698..52ebd32 100644
--- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp
@@ -27,15 +27,14 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
 
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -47,23 +46,31 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info)
+Status validate_arguments(const ITensorInfo             *src,
+                          const ITensorInfo             *bias,
+                          const ITensorInfo             *dst,
+                          const GEMMLowpOutputStageInfo *info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED));
-    ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)));
-    ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))
-                                || info->gemmlowp_min_bound > info->gemmlowp_max_bound);
+    ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) &&
+                                (info->output_data_type != DataType::QASYMM8_SIGNED));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        info->gemmlowp_max_bound >
+        std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        info->gemmlowp_min_bound <
+            std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) ||
+        info->gemmlowp_min_bound > info->gemmlowp_max_bound);
 
     // Check biases if exist
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
     }
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type");
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
@@ -78,7 +85,9 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst,
+Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo             *src,
+                                                               const ITensorInfo             *bias,
+                                                               const ITensorInfo             *dst,
                                                                const GEMMLowpOutputStageInfo *info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
@@ -87,14 +96,17 @@
     return Status{};
 }
 
-void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext        &compile_context,
+                                                              const ITensorInfo             *src,
+                                                              const ITensorInfo             *bias,
+                                                              ITensorInfo                   *dst,
                                                               const GEMMLowpOutputStageInfo *info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info));
 
-    auto padding_info = get_padding_info({ src, bias, dst });
+    auto padding_info = get_padding_info({src, bias, dst});
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type));
@@ -107,7 +119,8 @@
     // Set the arguments to pass at compile time
     CLBuildOptions build_opts;
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier));
     build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset));
     build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
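
The float variant replaces the integer multiplier/shift pair with a single real multiplier. A minimal scalar sketch of the defines above (clamping and narrowing to the output type omitted):

    #include <cmath>
    #include <cstdint>

    int32_t scale_by_float_sketch(int32_t acc, float real_multiplier, int32_t output_offset)
    {
        // acc already includes the bias when ADD_BIAS is defined
        return static_cast<int32_t>(std::lround(acc * real_multiplier)) + output_offset;
    }
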
@@ -130,14 +143,18 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack      &tensors,
+                                                           const Window     &window,
+                                                           cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
-    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     // Create input window
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
@@ -145,7 +162,7 @@
 
     // Setup bias slice
     unsigned int idx1 = num_arguments_per_3D_tensor();
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         Window biases_slice(slice);
         biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -159,8 +176,7 @@
         add_3D_tensor_argument(idx, src, slice);
         add_3D_tensor_argument(idx1, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h
index 8eda24d..057c667 100644
--- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h
@@ -62,14 +62,21 @@
      * @param[out] dst             Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
      * @param[in]  info            Output stage info. Used to pass the quantized output data type
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info);
+    void configure(const CLCompileContext        &compile_context,
+                   const ITensorInfo             *src,
+                   const ITensorInfo             *bias,
+                   ITensorInfo                   *dst,
+                   const GEMMLowpOutputStageInfo *info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info);
+    static Status validate(const ITensorInfo             *src,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *dst,
+                           const GEMMLowpOutputStageInfo *info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp
index 5d54db2..31434ce 100644
--- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp
@@ -26,15 +26,14 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
 
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -46,25 +45,34 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+Status validate_arguments(const ITensorInfo             *src,
+                          const ITensorInfo             *bias,
+                          const ITensorInfo             *dst,
+                          const GEMMLowpOutputStageInfo *output_stage)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED));
-    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
-    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
-                                || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
+    ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) &&
+                                (output_stage->output_data_type != DataType::QASYMM8_SIGNED));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        output_stage->gemmlowp_max_bound >
+        std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        output_stage->gemmlowp_min_bound <
+            std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) ||
+        output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
 
     // Check biases if exist
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
     }
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, "Mismatching output data type");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type,
+                                        "Mismatching output data type");
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
     }
 
@@ -77,7 +85,10 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo             *src,
+                                                        const ITensorInfo             *bias,
+                                                        const ITensorInfo             *dst,
+                                                        const GEMMLowpOutputStageInfo *output_stage)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage));
@@ -85,14 +96,17 @@
     return Status{};
 }
 
-void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext        &compile_context,
+                                                       const ITensorInfo             *src,
+                                                       const ITensorInfo             *bias,
+                                                       ITensorInfo                   *dst,
                                                        const GEMMLowpOutputStageInfo *output_stage)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage));
 
-    auto padding_info = get_padding_info({ src, bias, dst });
+    auto padding_info = get_padding_info({src, bias, dst});
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type));
@@ -104,13 +118,18 @@
     auto           max = output_stage->gemmlowp_max_bound;
     CLBuildOptions build_opts;
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset));
     build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier));
     build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift));
-    build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
+    build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(
+                                        output_stage->output_data_type))) &&
+                                 (min != max),
                              "-DMIN_BOUND=" + support::cpp11::to_string(min));
-    build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max),
+    build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(
+                                        output_stage->output_data_type))) &&
+                                 (min != max),
                              "-DMAX_BOUND=" + support::cpp11::to_string(max));
     build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
     build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
@@ -135,15 +154,17 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
-    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    const auto bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice     = collapsed.first_slice_window_3D();
 
     unsigned int idx1 = num_arguments_per_3D_tensor();
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         Window biases_slice(slice);
         biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1));
@@ -157,8 +178,7 @@
         add_3D_tensor_argument(idx, src, slice);
         add_3D_tensor_argument(idx1, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
index 84c5060..e639080 100644
--- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
@@ -62,14 +62,21 @@
      * @param[out] dst             Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
      * @param[in]  output_stage    GEMMLowp output stage metadata.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage);
+    void configure(const CLCompileContext        &compile_context,
+                   const ITensorInfo             *src,
+                   const ITensorInfo             *bias,
+                   ITensorInfo                   *dst,
+                   const GEMMLowpOutputStageInfo *output_stage);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage);
+    static Status validate(const ITensorInfo             *src,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *dst,
+                           const GEMMLowpOutputStageInfo *output_stage);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -77,4 +84,4 @@
 } // namespace kernels
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp
index ea88b48..ee4a191 100644
--- a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp
@@ -32,7 +32,6 @@
 
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -47,12 +46,15 @@
 Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8);
 
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->dimension(0) != src->dimension(1),
+            "Output vector must have length equal to the number of rows of the input matrix");
     }
     return Status{};
 }
@@ -60,12 +62,15 @@
 Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
 
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->dimension(0) != src->dimension(0),
+            "Output vector must have length equal to the number of columns of the input matrix");
     }
     return Status{};
 }
@@ -76,7 +81,10 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
+void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext            &compile_context,
+                                                 const ITensorInfo                 *mtx_a,
+                                                 ITensorInfo                       *vector_sum_row,
+                                                 const GEMMLowpReductionKernelInfo &info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row);
@@ -85,7 +93,7 @@
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32);
 
-    auto padding_info = get_padding_info({ mtx_a, vector_sum_row });
+    auto padding_info = get_padding_info({mtx_a, vector_sum_row});
 
     // Set the arguments to pass at compile time
     CLBuildOptions build_opts;
@@ -120,7 +128,9 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
+Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo                 *mtx_a,
+                                                  const ITensorInfo                 *vector_sum_row,
+                                                  const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row));
@@ -133,8 +143,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY);
     Window slice_in  = collapsed.first_slice_window_2D();
@@ -151,11 +162,13 @@
         add_3D_tensor_argument(idx, src, slice_in);
         add_2D_tensor_argument(idx, dst, slice_out);
         enqueue(queue, *this, slice_out, lws_hint());
-    }
-    while(collapsed.slide_window_slice_2D(slice_out));
+    } while (collapsed.slide_window_slice_2D(slice_out));
 }
 
-void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
+void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext            &compile_context,
+                                                 const ITensorInfo                 *mtx_b,
+                                                 ITensorInfo                       *vector_sum_col,
+                                                 const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
@@ -163,14 +176,15 @@
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, DataType::S32);
 
-    auto padding_info = get_padding_info({ mtx_b, vector_sum_col });
+    auto padding_info = get_padding_info({mtx_b, vector_sum_col});
 
     const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0));
 
     // Set the arguments to pass at compile time
     CLBuildOptions build_opts;
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0)));
     build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1)));
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type()));
@@ -192,7 +206,9 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
+Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo                 *mtx_b,
+                                                  const ITensorInfo                 *vector_sum_col,
+                                                  const GEMMLowpReductionKernelInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col));
@@ -205,8 +221,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY);
 
@@ -222,8 +239,7 @@
         add_3D_tensor_argument(idx, src, slice_in);
         add_2D_tensor_argument(idx, dst, slice_out);
         enqueue(queue, *this, slice_out, lws_hint());
-    }
-    while(collapsed.slide_window_slice_2D(slice_out));
+    } while (collapsed.slide_window_slice_2D(slice_out));
 }
 } // namespace kernels
 } // namespace opencl
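
Aside (illustration only, not part of this patch): the shape contract enforced by validate_arguments_matrix_a_reduction / validate_arguments_matrix_b_reduction above, spelled out for a GEMM with A of size MxK and B of size KxN. A minimal, self-contained C++ sketch; all names are hypothetical:

    #include <cassert>
    #include <cstddef>

    int main()
    {
        const std::size_t M = 4, K = 8, N = 16;

        // ITensorInfo::dimension(0) is the innermost (width) dimension, so
        // A (M rows x K cols) has dimensions {K, M} and B (K x N) has {N, K}.
        const std::size_t a_dim[2] = {K, M};
        const std::size_t b_dim[2] = {N, K};

        // vector_sum_row holds one S32 sum per row of A    -> length a_dim[1] == M
        // vector_sum_col holds one S32 sum per column of B -> length b_dim[0] == N
        assert(a_dim[1] == M);
        assert(b_dim[0] == N);
        return 0;
    }
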
diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h
index 7119b5f..c81543e 100644
--- a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h
+++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -52,7 +53,10 @@
      *                             - scalar       Scalar value to multiply each reduced column/row by.
      *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
      */
-    virtual void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const GEMMLowpReductionKernelInfo &info) = 0;
+    virtual void configure(const CLCompileContext            &compile_context,
+                           const ITensorInfo                 *input,
+                           ITensorInfo                       *output,
+                           const GEMMLowpReductionKernelInfo &info) = 0;
 };
 
 /** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
@@ -74,14 +78,18 @@
      *                             - scalar       Scalar value to multiply each reduced column/row by.
      *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
+    void configure(const CLCompileContext            &compile_context,
+                   const ITensorInfo                 *mtx_a,
+                   ITensorInfo                       *vector_sum_row,
+                   const GEMMLowpReductionKernelInfo &info) override;
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmLowpMatrixAReductionKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
+    static Status
+    validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -106,14 +114,18 @@
      *                             - scalar       Scalar value to multiply each reduced column/row by.
      *                             - mul_byscalar True if each reduced column/row must be multiplied by a scalar value.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
+    void configure(const CLCompileContext            &compile_context,
+                   const ITensorInfo                 *mtx_b,
+                   ITensorInfo                       *vector_sum_col,
+                   const GEMMLowpReductionKernelInfo &info) override;
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmLowpMatrixBReductionKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
+    static Status
+    validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
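
Usage sketch (illustration only, not part of this patch) for the reflowed interface above: validate() is queried first, then configure() is called with the same arguments. The *_info variables and compile_context are hypothetical, and arm_compute::Status is assumed to convert to true on success:

    using namespace arm_compute;

    GEMMLowpReductionKernelInfo info{}; // k / scalar / mul_byscalar as documented above

    const Status st =
        opencl::kernels::ClGemmLowpMatrixAReductionKernel::validate(&mtx_a_info, &vector_sum_row_info, info);
    if (bool(st))
    {
        opencl::kernels::ClGemmLowpMatrixAReductionKernel kernel;
        kernel.configure(compile_context, &mtx_a_info, &vector_sum_row_info, info);
    }
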
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
index b8997df..fd23aa9 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
@@ -29,10 +29,11 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLUtils.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -51,7 +52,13 @@
 {
 using ElementsProcessed = Steps;
 
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+Status validate_arguments(const ITensorInfo       *src0,
+                          const ITensorInfo       *src1,
+                          const ITensorInfo       *src2,
+                          const ITensorInfo       *dst,
+                          float                    alpha,
+                          float                    beta,
+                          const GEMMLHSMatrixInfo &lhs_info,
                           const GEMMRHSMatrixInfo &rhs_info,
                           const GEMMKernelInfo    &gemm_info)
 {
@@ -59,15 +66,20 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F32, DataType::F16);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
     ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr)
-                                    && (!gemm_info.broadcast_bias),
-                                    "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) &&
+            (!gemm_info.broadcast_bias),
+        "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native");
 
@@ -82,7 +94,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
     ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != n);
     ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != k);
-    if(gemm_info.reinterpret_input_as_3d)
+    if (gemm_info.reinterpret_input_as_3d)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
     }
@@ -91,15 +103,16 @@
         ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m);
     }
 
-    if(src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+    if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
     {
         const unsigned int src2_dim0 = src2->dimension(0);
         const unsigned int src2_dim1 = src2->dimension(1);
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
-        if(gemm_info.broadcast_bias)
+        if (gemm_info.broadcast_bias)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+                                            "Incorrect dimension of bias matrix which is to be broadcasted");
         }
         else
         {
@@ -107,9 +120,10 @@
         }
     }
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+        const TensorInfo tensor_info_dst =
+            dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
     }
@@ -117,9 +131,14 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo             *src0,
+                                                        ITensorInfo             *src1,
+                                                        ITensorInfo             *src2,
+                                                        ITensorInfo             *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
                                                         const GEMMRHSMatrixInfo &rhs_info,
-                                                        const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
+                                                        const GEMMKernelInfo    &gemm_info,
+                                                        ElementsProcessed       &num_elements_processed)
 {
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
@@ -132,17 +151,18 @@
 
     // In case both input and dst have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+    if (reinterpret_input_as_3d == reinterpret_output_as_3d)
     {
         reinterpret_output_as_3d = false;
     }
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
 
     TensorInfo tmp_info(*dst);
 
-    if(reinterpret_output_as_3d)
+    if (reinterpret_output_as_3d)
     {
         // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
         // the window needs to be constructed on the 2D collapsed version of the tensor
@@ -155,34 +175,34 @@
     num_elems_processed_per_iteration_x = rhs_info.n0;
     num_elems_processed_per_iteration_y = lhs_info.m0;
 
-    win     = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
-    win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    win_out =
+        calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-    AccessWindowStatic src0_access(src0, 0, 0,
-                                   src0->dimension(0),
-                                   src0->dimension(1));
-    AccessWindowStatic src1_access(src1, 0, 0,
-                                   ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x),
-                                   src1->dimension(1));
-    AccessWindowStatic dst_access(dst, 0, 0,
-                                  dst->dimension(0),
-                                  dst->dimension(1));
+    AccessWindowStatic src0_access(src0, 0, 0, src0->dimension(0), src0->dimension(1));
+    AccessWindowStatic src1_access(
+        src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1));
+    AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1));
 
-    if(src2 != nullptr)
+    if (src2 != nullptr)
     {
         const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
 
-        AccessWindowStatic src2_access(src2, 0, 0,
-                                       ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
+        AccessWindowStatic src2_access(src2, 0, 0, ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
                                        src2->dimension(1));
 
-        window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, dst_access);                          // window used to update the padding requirements of dst tensor
+        window_changed = update_window_and_padding(win, src0_access, src1_access,
+                                                   src2_access) || // window used by the execute_window_loop
+                         update_window_and_padding(
+                             win_out, dst_access); // window used to update the padding requirements of dst tensor
     }
     else
     {
-        window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
-                         update_window_and_padding(win_out, dst_access);             // window used to update the padding requirements of dst tensor
+        window_changed =
+            update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop
+            update_window_and_padding(win_out,
+                                      dst_access); // window used to update the padding requirements of dst tensor
     }
 
     // Collapse along the Z direction
@@ -191,7 +211,8 @@
     const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u);
     collapsed                                = win.collapse(win, dimension_to_collapse);
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, collapsed);
 }
 } // namespace
@@ -201,19 +222,26 @@
     _type = CLKernelType::GEMM;
 }
 
-void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha,
+void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext  &compile_context,
+                                                 ITensorInfo             *src0,
+                                                 ITensorInfo             *src1,
+                                                 ITensorInfo             *src2,
+                                                 ITensorInfo             *dst,
+                                                 float                    alpha,
                                                  float                    beta,
                                                  const GEMMLHSMatrixInfo &lhs_info,
-                                                 const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+                                                 const GEMMRHSMatrixInfo &rhs_info,
+                                                 const GEMMKernelInfo    &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
 
-    auto padding_info         = get_padding_info({ src0, dst });
+    auto padding_info         = get_padding_info({src0, dst});
     _reinterpret_input_as_3d  = gemm_info.reinterpret_input_as_3d;
     _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
     _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
@@ -221,7 +249,7 @@
 
     // In case both input and dst have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if(_reinterpret_input_as_3d == _reinterpret_output_as_3d)
+    if (_reinterpret_input_as_3d == _reinterpret_output_as_3d)
     {
         _reinterpret_input_as_3d  = false;
         _reinterpret_output_as_3d = false;
@@ -234,7 +262,8 @@
     ElementsProcessed num_elements_processed{};
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+    auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info,
+                                                    rhs_info, gemm_info, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     IClKernel::configure_internal(win_config.second);
 
@@ -260,14 +289,17 @@
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
-    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
+    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+                             "-DALPHA=" + float_to_string_with_full_precision(alpha));
     build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
     build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
     build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
     build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
     build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
-    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                             "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
+    build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                             "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
     build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
     build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
@@ -275,9 +307,13 @@
     build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
     build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
     build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DACTIVATION_TYPE=" +
+                                 lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
 
     std::string kernel_name("gemm_mm_native");
 
@@ -314,21 +350,23 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
+Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo       *src0,
+                                                  const ITensorInfo       *src1,
+                                                  const ITensorInfo       *src2,
+                                                  const ITensorInfo       *dst,
+                                                  float                    alpha,
+                                                  float                    beta,
                                                   const GEMMLHSMatrixInfo &lhs_info,
-                                                  const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+                                                  const GEMMRHSMatrixInfo &rhs_info,
+                                                  const GEMMKernelInfo    &gemm_info)
 {
     ElementsProcessed num_elements_processed{};
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
-                                                              src1->clone().get(),
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
                                                               src2 != nullptr ? src2->clone().get() : nullptr,
-                                                              dst->clone().get(),
-                                                              lhs_info,
-                                                              rhs_info,
-                                                              gemm_info,
+                                                              dst->clone().get(), lhs_info, rhs_info, gemm_info,
                                                               num_elements_processed)
-                                .first);
+                                    .first);
 
     return Status{};
 }
@@ -338,15 +376,18 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
-    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto src2 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
 
-    if(src1->info()->num_dimensions() < 3)
+    if (src1->info()->num_dimensions() < 3)
     {
         // The stride_z for matrix B must be zero if we do not slice
         ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -358,11 +399,11 @@
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
-    if(_reinterpret_input_as_3d)
+    if (_reinterpret_input_as_3d)
     {
         // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor
         unsigned int idx0;
-        if(_add_bias)
+        if (_add_bias)
         {
             idx0 = 4 * num_arguments_per_2D_tensor() + 7;
         }
@@ -374,11 +415,11 @@
         _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad));
     }
 
-    if(_reinterpret_output_as_3d)
+    if (_reinterpret_output_as_3d)
     {
         // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
         unsigned int idx0;
-        if(_add_bias)
+        if (_add_bias)
         {
             idx0 = 4 * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0);
         }
@@ -395,7 +436,7 @@
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
+        if (!_slide_matrix_b)
         {
             slice_b = slice_matrix_b;
         }
@@ -403,7 +444,7 @@
         unsigned int idx = 0;
         add_2D_tensor_argument(idx, src0, slice);
         add_2D_tensor_argument(idx, src1, slice_b);
-        if(_add_bias)
+        if (_add_bias)
         {
             add_2D_tensor_argument(idx, src2, slice);
         }
@@ -411,7 +452,7 @@
 
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
-        if(_add_bias)
+        if (_add_bias)
         {
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
         }
@@ -423,8 +464,7 @@
         _kernel.setArg<cl_int>(idx++, _k);
 
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
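
Standalone illustration (not part of this patch) of the block-size guard reformatted in validate_arguments above: the bit trick rejects any value that is neither a power of two nor 3, and a separate check caps it at 16 (m0 has its own range check, 1..8 for this kernel):

    #include <iostream>

    // Mirrors ((v & (v - 1)) && v != 3) combined with the v > 16 check.
    static bool block_size_accepted(unsigned int v)
    {
        return !((v & (v - 1)) && v != 3) && v <= 16;
    }

    int main()
    {
        // Accepts 1, 2, 3, 4, 8 and 16. Note that 1 also passes the bit trick;
        // the documented set in the error message is 2,3,4,8,16.
        for (unsigned int v = 1; v <= 17; ++v)
        {
            std::cout << v << (block_size_accepted(v) ? ": accepted" : ": rejected") << "\n";
        }
        return 0;
    }
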
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
index 80f8355..da6c9a5 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h
@@ -25,6 +25,7 @@
 #define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -58,7 +59,13 @@
      *                             rhs_info.k0: same as lhs_info.k0
      * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
+    void configure(const ClCompileContext  &compile_context,
+                   ITensorInfo             *src0,
+                   ITensorInfo             *src1,
+                   ITensorInfo             *src2,
+                   ITensorInfo             *dst,
+                   float                    alpha,
+                   float                    beta,
                    const GEMMLHSMatrixInfo &lhs_info,
                    const GEMMRHSMatrixInfo &rhs_info,
                    const GEMMKernelInfo    &gemm_info);
@@ -68,7 +75,13 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+    static Status validate(const ITensorInfo       *src0,
+                           const ITensorInfo       *src1,
+                           const ITensorInfo       *src2,
+                           const ITensorInfo       *dst,
+                           float                    alpha,
+                           float                    beta,
+                           const GEMMLHSMatrixInfo &lhs_info,
                            const GEMMRHSMatrixInfo &rhs_info,
                            const GEMMKernelInfo    &gemm_info);
 
@@ -76,14 +89,14 @@
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    bool       _slide_matrix_b{ true };
-    bool       _reinterpret_input_as_3d{ false };
-    bool       _reinterpret_output_as_3d{ false };
-    bool       _use_dummy_work_items{ false };
-    bool       _add_bias{ false };
-    signed int _m{ 1 };
-    signed int _n{ 1 };
-    signed int _k{ 1 };
+    bool       _slide_matrix_b{true};
+    bool       _reinterpret_input_as_3d{false};
+    bool       _reinterpret_output_as_3d{false};
+    bool       _use_dummy_work_items{false};
+    bool       _add_bias{false};
+    signed int _m{1};
+    signed int _n{1};
+    signed int _k{1};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
index d72d29e..4fe6bdd 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp
@@ -29,10 +29,11 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLUtils.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -52,7 +53,13 @@
 {
 using ElementsProcessed = Steps;
 
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+Status validate_arguments(const ITensorInfo       *src0,
+                          const ITensorInfo       *src1,
+                          const ITensorInfo       *src2,
+                          const ITensorInfo       *dst,
+                          float                    alpha,
+                          float                    beta,
+                          const GEMMLHSMatrixInfo &lhs_info,
                           const GEMMRHSMatrixInfo &rhs_info,
                           const GEMMKernelInfo    &gemm_info)
 {
@@ -61,42 +68,50 @@
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr)
-                                    && (!gemm_info.broadcast_bias),
-                                    "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3),
+                                    "Only 2,3,4,8,16 are supported for m0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) &&
+            (!gemm_info.broadcast_bias),
+        "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32),
+                                    "Mixed precision only supported for F16 data type");
     ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
 
     const unsigned int m = gemm_info.m;
     const unsigned int n = gemm_info.n;
     const unsigned int k = gemm_info.k;
 
-    TensorShape tensor_shape0{ src0->tensor_shape() };
+    TensorShape tensor_shape0{src0->tensor_shape()};
     tensor_shape0.set(0, k);
     tensor_shape0.set(1, m);
 
-    TensorShape tensor_shape1{ src1->tensor_shape() };
+    TensorShape tensor_shape1{src1->tensor_shape()};
     tensor_shape1.set(0, n);
     tensor_shape1.set(1, k);
 
-    if(src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+    if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
     {
         const unsigned int src2_dim0 = src2->dimension(0);
         const unsigned int src2_dim1 = src2->dimension(1);
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
-        if(gemm_info.broadcast_bias)
+        if (gemm_info.broadcast_bias)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+                                            "Incorrect dimension of bias matrix which is to be broadcasted");
         }
         else
         {
@@ -107,15 +122,18 @@
     const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
     const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
 
-    const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info));
-    const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+    const TensorInfo tensor_info_reshaped0 =
+        src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+        const TensorInfo tensor_info_dst =
+            dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
     }
@@ -123,9 +141,14 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo             *src0,
+                                                        ITensorInfo             *src1,
+                                                        ITensorInfo             *src2,
+                                                        ITensorInfo             *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
                                                         const GEMMRHSMatrixInfo &rhs_info,
-                                                        const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
+                                                        const GEMMKernelInfo    &gemm_info,
+                                                        ElementsProcessed       &num_elements_processed)
 {
     ARM_COMPUTE_UNUSED(src0, src1, src2);
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
@@ -134,7 +157,7 @@
 
     TensorInfo tmp_info(*dst);
 
-    if(reinterpret_output_as_3d)
+    if (reinterpret_output_as_3d)
     {
         // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
         // the window needs to be constructed on the 2D collapsed version of the tensor
@@ -147,7 +170,8 @@
     num_elems_processed_per_iteration_x = rhs_info.n0;
     num_elems_processed_per_iteration_y = lhs_info.m0;
 
-    Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    Window win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     // Collapse along the Z direction
     // This collapse needs to be here in order to tune the Z dimension of LWS
@@ -164,18 +188,26 @@
     _type = CLKernelType::GEMM;
 }
 
-void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context,
-                                                   const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
-                                                   const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext  &compile_context,
+                                                   const ITensorInfo       *src0,
+                                                   const ITensorInfo       *src1,
+                                                   const ITensorInfo       *src2,
+                                                   ITensorInfo             *dst,
+                                                   float                    alpha,
+                                                   float                    beta,
+                                                   const GEMMLHSMatrixInfo &lhs_info,
+                                                   const GEMMRHSMatrixInfo &rhs_info,
+                                                   const GEMMKernelInfo    &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
 
-    auto padding_info         = get_padding_info({ src0, src1, src2, dst });
+    auto padding_info         = get_padding_info({src0, src1, src2, dst});
     _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
     _use_dummy_work_items     = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device());
     _add_bias                 = src2 != nullptr;
@@ -188,14 +220,9 @@
     ElementsProcessed num_elements_processed{};
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(src0->clone().get(),
-                                                    src1->clone().get(),
-                                                    (src2 != nullptr) ? src2->clone().get() : nullptr,
-                                                    dst->clone().get(),
-                                                    lhs_info,
-                                                    rhs_info,
-                                                    gemm_info,
-                                                    num_elements_processed);
+    auto win_config = validate_and_configure_window(
+        src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(),
+        lhs_info, rhs_info, gemm_info, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
@@ -213,12 +240,15 @@
 
     // Create build options
     CLBuildOptions build_opts;
-    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
+    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+                             "-DALPHA=" + float_to_string_with_full_precision(alpha));
     build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
     build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
     build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+    build_opts.add_option_if(_reinterpret_output_as_3d,
+                             "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+    build_opts.add_option_if(_reinterpret_output_as_3d,
+                             "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
     build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
     build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
@@ -229,7 +259,9 @@
     build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
     build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
-    build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type)));
+    build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision
+                                                            ? get_cl_type_from_data_type(DataType::F32)
+                                                            : get_cl_type_from_data_type(data_type)));
     build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
     build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
     build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
@@ -237,9 +269,13 @@
     build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
     build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
     build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DACTIVATION_TYPE=" +
+                                 lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
 
     std::string kernel_name("gemm_mm_reshaped_");
     kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_";
@@ -287,9 +323,15 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
+Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo       *src0,
+                                                    const ITensorInfo       *src1,
+                                                    const ITensorInfo       *src2,
+                                                    const ITensorInfo       *dst,
+                                                    float                    alpha,
+                                                    float                    beta,
                                                     const GEMMLHSMatrixInfo &lhs_info,
-                                                    const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+                                                    const GEMMRHSMatrixInfo &rhs_info,
+                                                    const GEMMKernelInfo    &gemm_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
     return Status{};
@@ -300,15 +342,18 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
-    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto src2 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
 
-    if(src1->info()->num_dimensions() < 3)
+    if (src1->info()->num_dimensions() < 3)
     {
         // The stride_z for matrix B must be zero if we do not slice
         ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -324,12 +369,14 @@
 
     cl::Image2D src1_image2d;
 
-    if(_export_to_cl_image)
+    if (_export_to_cl_image)
     {
-        const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));
+        const TensorShape shape2d(src1->info()->dimension(0) / 4,
+                                  src1->info()->dimension(1) * src1->info()->dimension(2));
         const size_t      image_row_pitch = src1->info()->strides_in_bytes()[1];
 
-        src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+        src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d,
+                                                  src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
     }
 
     do
@@ -337,7 +384,7 @@
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A has more than 2
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
+        if (!_slide_matrix_b)
         {
             slice_b = slice_matrix_b;
         }
@@ -348,7 +395,7 @@
         add_2D_tensor_argument(idx, src0, slice);
 
         // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
-        if(_export_to_cl_image)
+        if (_export_to_cl_image)
         {
             _kernel.setArg(idx++, src1_image2d);
         }
@@ -370,7 +417,7 @@
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
 
         // Bias stride_z (if _add_bias == true)
-        if(_add_bias)
+        if (_add_bias)
         {
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2]));
         }
@@ -379,7 +426,7 @@
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
 
         // Cross-plane padding (if _reinterpret_output_as_3d = true)
-        if(_reinterpret_output_as_3d)
+        if (_reinterpret_output_as_3d)
         {
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad));
         }
@@ -393,8 +440,7 @@
 
         // Dispatch kernel
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
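
The hunks above are representative of the mechanical rewrites in this patch: a space after control-flow keywords (`if (`, `while (`), no inner padding in braced initializer lists (`{src0, src1, src2, dst}`), long parameter lists broken one per line with the type and `*name` columns aligned, and `do { ... } while` loops closed on a single `} while (...)` line. The revised configuration that produced this is not visible in the diff itself, so the fragment below is only a reconstruction inferred from the formatted output; individual values, the column limit in particular, are guesses:

    # Reconstructed sketch: not the delivered .clang-format.
    Language:                     Cpp
    ColumnLimit:                  120           # wrapped lines in these hunks end near column 120
    SpaceBeforeParens:            ControlStatements
    Cpp11BracedListStyle:         true          # {false}, {1}, {src0, src1, src2, dst}
    PointerAlignment:             Right         # const ITensorInfo *src0
    BinPackParameters:            false         # one parameter per line once a signature overflows
    AlignAfterOpenBracket:        Align
    AlignConsecutiveDeclarations: Consecutive   # pads between the type and *name columns
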
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
index 8d25412..30928c4 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h
@@ -24,12 +24,12 @@
 #ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
 #define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H
 
+#include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
 
-#include "arm_compute/core/KernelDescriptors.h"
-
 namespace arm_compute
 {
 namespace opencl
@@ -83,16 +83,29 @@
      *
      * @note lhs_info.k0 must be equal to rhs_info.k0
      */
-    void configure(const ClCompileContext &compile_context,
-                   const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
-                   const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info);
+    void configure(const ClCompileContext  &compile_context,
+                   const ITensorInfo       *src0,
+                   const ITensorInfo       *src1,
+                   const ITensorInfo       *src2,
+                   ITensorInfo             *dst,
+                   float                    alpha,
+                   float                    beta,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMKernelInfo    &gemm_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmMatrixMultiplyReshapedKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+    static Status validate(const ITensorInfo       *src0,
+                           const ITensorInfo       *src1,
+                           const ITensorInfo       *src2,
+                           const ITensorInfo       *dst,
+                           float                    alpha,
+                           float                    beta,
+                           const GEMMLHSMatrixInfo &lhs_info,
                            const GEMMRHSMatrixInfo &rhs_info,
                            const GEMMKernelInfo    &gemm_info);
 
@@ -100,14 +113,14 @@
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    bool       _slide_matrix_b{ true };
-    bool       _reinterpret_output_as_3d{ false };
-    bool       _use_dummy_work_items{ false };
-    bool       _add_bias{ false };
-    bool       _export_to_cl_image{ false };
-    signed int _m{ 1 };
-    signed int _n{ 1 };
-    signed int _k{ 1 };
+    bool       _slide_matrix_b{true};
+    bool       _reinterpret_output_as_3d{false};
+    bool       _use_dummy_work_items{false};
+    bool       _add_bias{false};
+    bool       _export_to_cl_image{false};
+    signed int _m{1};
+    signed int _n{1};
+    signed int _k{1};
 };
 } // namespace kernels
 } // namespace opencl
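
The header above also shows the include-handling rule at work: `arm_compute/core/KernelDescriptors.h` moves ahead of the `src/...` headers into its own blank-line-separated block, and the .cpp hunks that follow show the ordering is case-insensitive (`misc/ShapeCalculator.h` before `StringUtils.h`). A plausible, equally reconstructed include section of the same hypothetical config:

    # Hypothetical include rules inferred from the reordered #include blocks.
    SortIncludes:  CaseInsensitive
    IncludeBlocks: Regroup
    IncludeCategories:
      - Regex:    '^"arm_compute/'
        Priority: 1
      - Regex:    '^"src/'
        Priority: 2
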
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
index b34c17c..1b19f1e 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
@@ -25,8 +25,9 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLUtils.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -46,24 +47,36 @@
 {
 using ElementsProcessed = Steps;
 
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
-                          const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+Status validate_arguments(const ITensorInfo       *src0,
+                          const ITensorInfo       *src1,
+                          const ITensorInfo       *src2,
+                          const ITensorInfo       *dst,
+                          float                    alpha,
+                          float                    beta,
+                          const GEMMLHSMatrixInfo &lhs_info,
+                          const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMKernelInfo    &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0");
     ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
     ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr)
-                                    && (!gemm_info.broadcast_bias),
-                                    "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) &&
+            (!gemm_info.broadcast_bias),
+        "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
     ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
 
@@ -71,19 +84,20 @@
     const unsigned int n = gemm_info.n;
     const unsigned int k = gemm_info.k;
 
-    TensorShape tensor_shape1{ src1->tensor_shape() };
+    TensorShape tensor_shape1{src1->tensor_shape()};
     tensor_shape1.set(0, n);
     tensor_shape1.set(1, k);
 
-    if(src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+    if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
     {
         const unsigned int src2_dim0 = src2->dimension(0);
         const unsigned int src2_dim1 = src2->dimension(1);
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src0);
-        if(gemm_info.broadcast_bias)
+        if (gemm_info.broadcast_bias)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+                                            "Incorrect dimension of bias matrix which is to be broadcasted");
         }
         else
         {
@@ -93,10 +107,11 @@
 
     const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
 
-    const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
 
     ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
-    if(gemm_info.reinterpret_input_as_3d)
+    if (gemm_info.reinterpret_input_as_3d)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
     }
@@ -106,9 +121,10 @@
     }
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+        const TensorInfo tensor_info_dst =
+            dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
     }
@@ -116,8 +132,14 @@
     return Status{};
 }
 
-Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
-                                     const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed)
+Window validate_and_configure_window(ITensorInfo             *src0,
+                                     ITensorInfo             *src1,
+                                     ITensorInfo             *src2,
+                                     ITensorInfo             *dst,
+                                     const GEMMLHSMatrixInfo &lhs_info,
+                                     const GEMMRHSMatrixInfo &rhs_info,
+                                     const GEMMKernelInfo    &gemm_info,
+                                     ElementsProcessed       &num_elements_processed)
 {
     ARM_COMPUTE_UNUSED(src0, src1, src2);
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
@@ -128,14 +150,14 @@
     // In case both input and dst have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
     // This approach should only be used when the input/dst tensors have padding in the y direction
-    if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y)
+    if ((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y)
     {
         reinterpret_output_as_3d = false;
     }
 
     TensorInfo tmp_info(*dst);
 
-    if(reinterpret_output_as_3d)
+    if (reinterpret_output_as_3d)
     {
         // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
         // the window needs to be constructed on the 2D collapsed version of the tensor
@@ -148,7 +170,8 @@
     num_elems_processed_per_iteration_x = rhs_info.n0;
     num_elems_processed_per_iteration_y = lhs_info.m0;
 
-    Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    Window win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     // Collapse along the Z direction
     // This collapse needs to be here in order to tune the Z dimension of LWS
@@ -164,14 +187,22 @@
     _type = CLKernelType::GEMM;
 }
 
-void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context,
-                                                          const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
-                                                          const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext  &compile_context,
+                                                          const ITensorInfo       *src0,
+                                                          const ITensorInfo       *src1,
+                                                          const ITensorInfo       *src2,
+                                                          ITensorInfo             *dst,
+                                                          float                    alpha,
+                                                          float                    beta,
+                                                          const GEMMLHSMatrixInfo &lhs_info,
+                                                          const GEMMRHSMatrixInfo &rhs_info,
+                                                          const GEMMKernelInfo    &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
 
@@ -182,11 +213,11 @@
     _export_to_cl_image       = rhs_info.export_to_cl_image;
     _has_pad_y                = gemm_info.has_pad_y;
 
-    auto padding_info = get_padding_info({ src0, src1, src2, dst });
+    auto padding_info = get_padding_info({src0, src1, src2, dst});
 
     // In case both input and dst have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
-    if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
+    if ((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
     {
         _reinterpret_input_as_3d  = false;
         _reinterpret_output_as_3d = false;
@@ -199,8 +230,9 @@
     ElementsProcessed num_elements_processed{};
 
     // Configure kernel window
-    Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(), lhs_info, rhs_info, gemm_info,
-                                               num_elements_processed);
+    Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(),
+                                               (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(),
+                                               lhs_info, rhs_info, gemm_info, num_elements_processed);
     ICLKernel::configure_internal(win);
 
     // If _reinterpret_input_as_3d = reinterpret_output_as_3d = true,
@@ -225,7 +257,8 @@
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
-    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
+    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+                             "-DALPHA=" + float_to_string_with_full_precision(alpha));
     build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
     build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
     build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
@@ -240,17 +273,23 @@
     build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
     build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
     build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
-    if(_has_pad_y)
+    if (_has_pad_y)
     {
         build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
         build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-        build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
-        build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
+        build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                                 "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d));
+        build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d,
+                                 "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d));
     }
 
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
-    build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DACTIVATION_TYPE=" +
+                                 lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
+    build_opts.add_option_if(gemm_info.activation_info.enabled(),
+                             "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
 
     std::string kernel_name("gemm_mm_reshaped_only_rhs_");
     kernel_name += rhs_info.transpose ? "t" : "nt";
@@ -294,28 +333,39 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
+Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo       *src0,
+                                                           const ITensorInfo       *src1,
+                                                           const ITensorInfo       *src2,
+                                                           const ITensorInfo       *dst,
+                                                           float                    alpha,
+                                                           float                    beta,
                                                            const GEMMLHSMatrixInfo &lhs_info,
-                                                           const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+                                                           const GEMMRHSMatrixInfo &rhs_info,
+                                                           const GEMMKernelInfo    &gemm_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
     return Status{};
 }
 
-void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack      &tensors,
+                                                       const Window     &window,
+                                                       cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
-    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto src2 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
 
-    if(src1->info()->num_dimensions() < 3)
+    if (src1->info()->num_dimensions() < 3)
     {
         // The stride_z for matrix B must be zero if we do not slice
         ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -341,12 +391,14 @@
 
     cl::Image2D src1_image2d;
 
-    if(_export_to_cl_image)
+    if (_export_to_cl_image)
     {
-        const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));
+        const TensorShape shape2d(src1->info()->dimension(0) / 4,
+                                  src1->info()->dimension(1) * src1->info()->dimension(2));
         const size_t      image_row_pitch = src1->info()->strides_in_bytes()[1];
 
-        src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+        src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d,
+                                                  src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
     }
 
     do
@@ -354,7 +406,7 @@
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A has more than 2
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
+        if (!_slide_matrix_b)
         {
             slice_b = slice_matrix_b;
         }
@@ -365,7 +417,7 @@
         add_2D_tensor_argument(idx, src0, slice);
 
         // RHS buffer or RHS OpenCL image (_export_to_cl_image == true)
-        if(_export_to_cl_image)
+        if (_export_to_cl_image)
         {
             _kernel.setArg(idx++, src1_image2d);
         }
@@ -387,22 +439,23 @@
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[rhs_idx_batch_size]));
 
         // Bias stride_z (if _add_bias == true)
-        if(_add_bias)
+        if (_add_bias)
         {
-            _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size]));
+            _kernel.setArg<cl_uint>(idx++,
+                                    static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size]));
         }
 
         // dst stride_z
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size]));
 
         // Cross-plane padding (if _reinterpret_input_as_3d = true)
-        if(_reinterpret_input_as_3d && _has_pad_y)
+        if (_reinterpret_input_as_3d && _has_pad_y)
         {
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_lhs));
         }
 
         // Cross-plane padding (if reinterpret_output_as_3d = true)
-        if(_reinterpret_output_as_3d && _has_pad_y)
+        if (_reinterpret_output_as_3d && _has_pad_y)
         {
             _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out));
         }
@@ -413,8 +466,7 @@
         _kernel.setArg<cl_int>(idx++, _k);
 
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
index 471160c..e8fd78d 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -24,12 +24,12 @@
 #ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
 #define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H
 
+#include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
 
-#include "arm_compute/core/KernelDescriptors.h"
-
 namespace arm_compute
 {
 namespace opencl
@@ -74,32 +74,46 @@
      *                             rhs_info.transpose: true,false
      * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
      */
-    void configure(const ClCompileContext &compile_context,
-                   const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
-                   const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info);
+    void configure(const ClCompileContext  &compile_context,
+                   const ITensorInfo       *src0,
+                   const ITensorInfo       *src1,
+                   const ITensorInfo       *src2,
+                   ITensorInfo             *dst,
+                   float                    alpha,
+                   float                    beta,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   const GEMMRHSMatrixInfo &rhs_info,
+                   const GEMMKernelInfo    &gemm_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
-                           const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info);
+    static Status validate(const ITensorInfo       *src0,
+                           const ITensorInfo       *src1,
+                           const ITensorInfo       *src2,
+                           const ITensorInfo       *dst,
+                           float                    alpha,
+                           float                    beta,
+                           const GEMMLHSMatrixInfo &lhs_info,
+                           const GEMMRHSMatrixInfo &rhs_info,
+                           const GEMMKernelInfo    &gemm_info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    bool       _slide_matrix_b{ true };
-    bool       _reinterpret_input_as_3d{ false };
-    bool       _reinterpret_output_as_3d{ false };
-    bool       _use_dummy_work_items{ false };
-    bool       _add_bias{ false };
-    bool       _export_to_cl_image{ false };
-    bool       _has_pad_y{ false };
-    signed int _m{ 1 };
-    signed int _n{ 1 };
-    signed int _k{ 1 };
+    bool       _slide_matrix_b{true};
+    bool       _reinterpret_input_as_3d{false};
+    bool       _reinterpret_output_as_3d{false};
+    bool       _use_dummy_work_items{false};
+    bool       _add_bias{false};
+    bool       _export_to_cl_image{false};
+    bool       _has_pad_y{false};
+    signed int _m{1};
+    signed int _n{1};
+    signed int _k{1};
 };
 } // namespace kernels
 } // namespace opencl
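
Since every hunk in these files is formatter output rather than hand editing, the result can be re-verified mechanically. Assuming the configuration lives at the repository root as `.clang-format` and the matching 14.x binary is installed as `clang-format-14` (both assumptions, not facts recorded in this change), a dry run over a touched file should come back clean:

    # Sketch only: reports, rather than rewrites, any remaining violations.
    clang-format-14 --dry-run --Werror --style=file \
        src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp
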
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
index 734f8f9..9a2a489 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp
@@ -23,16 +23,17 @@
  */
 #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/CL/CLUtils.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -56,23 +57,36 @@
 constexpr int mmul_n0 = 4;
 constexpr int mmul_k0 = 4;
 
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+Status validate_arguments(const ITensorInfo       *src0,
+                          const ITensorInfo       *src1,
+                          const ITensorInfo       *src2,
+                          const ITensorInfo       *dst,
+                          float                    alpha,
+                          float                    beta,
+                          const GEMMLHSMatrixInfo &lhs_info,
                           const GEMMRHSMatrixInfo &rhs_info,
                           const GEMMKernelInfo    &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+                                    "The extension cl_arm_matrix_multiply is not supported on the target platform");
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1, "Only values greater than 0 are supported for m0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 && rhs_info.n0 != 8 && rhs_info.n0 != 16, "Only 1,2,3,4,8, and 16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 &&
+                                        rhs_info.n0 != 8 && rhs_info.n0 != 16,
+                                    "Only 1,2,3,4,8, and 16 are supported for n0");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 != 1 || lhs_info.k0 != 1), "Only 1 is supported for k0");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.h0 != 4), "Only 4 is supported for h0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true, "Only true is supported for interleave with mmul extension enabled");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false, "Only false is supported for transpose with mmul extension enabled");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true,
+                                    "Only true is supported for interleave with mmul extension enabled");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false,
+                                    "Only false is supported for transpose with mmul extension enabled");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported");
     ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info));
 
@@ -87,7 +101,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k);
 
     // Validate the reinterpreted-as-3D case
-    if(gemm_info.depth_output_gemm3d != 0)
+    if (gemm_info.depth_output_gemm3d != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m);
     }
@@ -97,9 +111,9 @@
     }
 
     // Validate the gemm-batched case
-    if(src1->num_dimensions() > 2)
+    if (src1->num_dimensions() > 2)
     {
-        if(gemm_info.depth_output_gemm3d != 0)
+        if (gemm_info.depth_output_gemm3d != 0)
         {
             ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(3) != src1->dimension(2));
         }
@@ -109,15 +123,16 @@
         }
     }
 
-    if(src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
+    if (src2 != nullptr && !(helpers::float_ops::is_zero(beta)))
     {
         const unsigned int src2_dim0 = src2->dimension(0);
         const unsigned int src2_dim1 = src2->dimension(1);
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1);
-        if(gemm_info.broadcast_bias)
+        if (gemm_info.broadcast_bias)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n),
+                                            "Incorrect dimension of bias matrix which is to be broadcasted");
         }
         else
         {
@@ -125,18 +140,20 @@
         }
     }
 
-    TensorShape tensor_shape1{ src1->tensor_shape() };
+    TensorShape tensor_shape1{src1->tensor_shape()};
     tensor_shape1.set(0, n);
     tensor_shape1.set(1, k);
 
-    const TensorInfo tensor_info1          = src1->clone()->set_tensor_shape(tensor_shape1);
-    const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+    const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info));
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
+        const TensorInfo tensor_info_dst =
+            dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst);
     }
@@ -144,7 +161,11 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo             *src0,
+                                                        ITensorInfo             *src1,
+                                                        ITensorInfo             *src2,
+                                                        ITensorInfo             *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
                                                         const GEMMRHSMatrixInfo &rhs_info,
                                                         const GEMMKernelInfo    &gemm_info)
 {
@@ -152,11 +173,12 @@
     bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0;
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
 
     TensorInfo tmp_info(*dst);
 
-    if(reinterpret_output_as_3d)
+    if (reinterpret_output_as_3d)
     {
         // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
         // the window needs to be constructed on the 2D collapsed version of the tensor
@@ -204,19 +226,26 @@
     _type = CLKernelType::GEMM;
 }
 
-void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha,
+void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext  &compile_context,
+                                                              ITensorInfo             *src0,
+                                                              ITensorInfo             *src1,
+                                                              ITensorInfo             *src2,
+                                                              ITensorInfo             *dst,
+                                                              float                    alpha,
                                                               float                    beta,
                                                               const GEMMLHSMatrixInfo &lhs_info,
-                                                              const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+                                                              const GEMMRHSMatrixInfo &rhs_info,
+                                                              const GEMMKernelInfo    &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)));
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
 
-    auto padding_info   = get_padding_info({ src0, src1, src2, dst });
+    auto padding_info   = get_padding_info({src0, src1, src2, dst});
     _add_bias           = src2 != nullptr;
     _export_to_cl_image = rhs_info.export_to_cl_image;
 
@@ -236,7 +265,8 @@
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
-    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
+    build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)),
+                             "-DALPHA=" + float_to_string_with_full_precision(alpha));
     build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
     build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
     build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
@@ -249,7 +279,8 @@
     build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0));
     build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0));
     build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0));
-    build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
+    build_opts.add_option("-DACTIVATION_TYPE=" +
+                          lower_string(string_from_activation_func(gemm_info.activation_info.activation())));
     build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a()));
     build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b()));
 
@@ -283,37 +314,44 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
+Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo       *src0,
+                                                               const ITensorInfo       *src1,
+                                                               const ITensorInfo       *src2,
+                                                               const ITensorInfo       *dst,
+                                                               float                    alpha,
+                                                               float                    beta,
                                                                const GEMMLHSMatrixInfo &lhs_info,
-                                                               const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info)
+                                                               const GEMMRHSMatrixInfo &rhs_info,
+                                                               const GEMMKernelInfo    &gemm_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
-                                                              src1->clone().get(),
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
                                                               src2 != nullptr ? src2->clone().get() : nullptr,
-                                                              dst->clone().get(),
-                                                              lhs_info,
-                                                              rhs_info,
-                                                              gemm_info)
-                                .first);
+                                                              dst->clone().get(), lhs_info, rhs_info, gemm_info)
+                                    .first);
 
     return Status{};
 }
 
-void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack      &tensors,
+                                                           const Window     &window,
+                                                           cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
-    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto src2 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr);
 
-    if(src1->info()->num_dimensions() < 3)
+    if (src1->info()->num_dimensions() < 3)
     {
         // The stride_z for matrix B must be zero if we do not slice
         ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -321,12 +359,14 @@
 
     cl::Image2D src1_image2d;
 
-    if(_export_to_cl_image)
+    if (_export_to_cl_image)
     {
-        const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2));
+        const TensorShape shape2d(src1->info()->dimension(0) / 4,
+                                  src1->info()->dimension(1) * src1->info()->dimension(2));
         const size_t      image_row_pitch = src1->info()->strides_in_bytes()[1];
 
-        src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+        src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d,
+                                                  src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
     }
 
     Window slice = window.first_slice_window_3D();
@@ -336,14 +376,14 @@
         unsigned int idx = 0;
 
         add_3d_tensor_nhw_argument(idx, src0);
-        if(_export_to_cl_image)
+        if (_export_to_cl_image)
         {
             _kernel.setArg(idx++, src1_image2d);
         }
         add_3d_tensor_nhw_argument(idx, src1);
 
         // Bias buffer (_add_bias == true)
-        if(_add_bias)
+        if (_add_bias)
         {
             add_3d_tensor_nhw_argument(idx, src2);
         }
@@ -358,8 +398,7 @@
         // LWS_x should be a multiple of 16 at least. (32, 2) has been chosen to have more work-items on a single core
         // LWS also enforces the order of execution of the work-items, which improves cache utilization
         enqueue(queue, *this, slice, cl::NDRange(32, 2), false);
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
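
One further rule is visible in this file's validate_arguments hunk: when the long `rhs_info.n0` condition wraps, the `&&` stays at the end of the broken line rather than leading the continuation. In clang-format terms that plausibly corresponds to:

    BreakBeforeBinaryOperators: None    # assumption; matches the operators-at-line-end style above
    AlignOperands:              Align
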
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h
index 59612fc..86d3012 100644
--- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -59,7 +60,13 @@
      *                             rhs_info.transpose: false
      * @param[in]  gemm_info       GEMM information used to retrieve the original dimensions of the input matrices
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
+    void configure(const ClCompileContext  &compile_context,
+                   ITensorInfo             *src0,
+                   ITensorInfo             *src1,
+                   ITensorInfo             *src2,
+                   ITensorInfo             *dst,
+                   float                    alpha,
+                   float                    beta,
                    const GEMMLHSMatrixInfo &lhs_info,
                    const GEMMRHSMatrixInfo &rhs_info,
                    const GEMMKernelInfo    &gemm_info);
@@ -69,7 +76,13 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info,
+    static Status validate(const ITensorInfo       *src0,
+                           const ITensorInfo       *src1,
+                           const ITensorInfo       *src2,
+                           const ITensorInfo       *dst,
+                           float                    alpha,
+                           float                    beta,
+                           const GEMMLHSMatrixInfo &lhs_info,
                            const GEMMRHSMatrixInfo &rhs_info,
                            const GEMMKernelInfo    &gemm_info);
 
@@ -77,11 +90,11 @@
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    bool       _add_bias{ false };
-    bool       _export_to_cl_image{ false };
-    signed int _m{ 1 };
-    signed int _n{ 1 };
-    signed int _k{ 1 };
+    bool       _add_bias{false};
+    bool       _export_to_cl_image{false};
+    signed int _m{1};
+    signed int _n{1};
+    signed int _k{1};
 };
 } // namespace kernels
 } // namespace opencl
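
A practical footnote on bulk runs like this one: clang-format honours guard comments, so deliberately aligned regions need not churn on every future pass. The off/on directives below are real; the snippet itself is a made-up illustration:

    // clang-format off
    static const int lut[4] = { 1,  2,
                                4,  8 };   // deliberate alignment, preserved across runs
    // clang-format on
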
diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
index bf4b664..eea2a16 100644
--- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -46,13 +47,17 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+Status validate_arguments(const ITensorInfo       *src,
+                          const ITensorInfo       *dst,
+                          const GEMMLHSMatrixInfo &lhs_info,
+                          bool                     reinterpret_input_as_3d)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
     ARM_COMPUTE_RETURN_ERROR_ON((lhs_info.m0 > 4 && lhs_info.m0 < 8) && lhs_info.transpose);
@@ -60,10 +65,11 @@
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
     ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
-                                                           misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(),
+            misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
     }
@@ -71,14 +77,15 @@
     return Status{};
 }
 
-Window configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+Window
+configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
 {
     const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0;
     const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0;
 
     TensorInfo tmp_info(*src);
 
-    if(reinterpret_input_as_3d)
+    if (reinterpret_input_as_3d)
     {
         // Since the src tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave,
         // the window needs to be constructed on the 2D collapsed version of the tensor
@@ -88,10 +95,12 @@
     }
 
     // dst auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)));
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(
+                                 *src, lhs_info, reinterpret_input_as_3d)));
 
     // Configure window
-    Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    Window win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     // Collapse along the Z direction
     // This collapse needs to be here in order to tune the Z dimension of LWS
@@ -106,14 +115,18 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext  &compile_context,
+                                             ITensorInfo             *src,
+                                             ITensorInfo             *dst,
+                                             const GEMMLHSMatrixInfo &lhs_info,
+                                             bool                     reinterpret_input_as_3d)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d));
 
-    auto padding_info = get_padding_info({ src });
+    auto padding_info = get_padding_info({src});
 
     const unsigned int src_w      = src->dimension(0);
     const unsigned int m          = reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1);
@@ -168,7 +181,10 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo       *src,
+                                              const ITensorInfo       *dst,
+                                              const GEMMLHSMatrixInfo &lhs_info,
+                                              bool                     reinterpret_input_as_3d)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d));
     return Status{};
@@ -179,8 +195,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
@@ -192,8 +209,7 @@
         add_3d_tensor_nhw_argument(idx, src);
         add_3d_tensor_nhw_argument(idx, dst);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
index db88e0d..8e84e8a 100644
--- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
+++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h
@@ -57,14 +57,21 @@
      *                                   lhs_info.interleave: true, false
      * @param[in]  reinterpret_src_as_3d (Optional) True if the src has to be reinterpreted as a 3D tensor
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d = false);
+    void configure(const ClCompileContext  &compile_context,
+                   ITensorInfo             *src,
+                   ITensorInfo             *dst,
+                   const GEMMLHSMatrixInfo &lhs_info,
+                   bool                     reinterpret_src_as_3d = false);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmReshapeLhsMatrixKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d);
+    static Status validate(const ITensorInfo       *src,
+                           const ITensorInfo       *dst,
+                           const GEMMLHSMatrixInfo &lhs_info,
+                           bool                     reinterpret_src_as_3d);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -72,4 +79,4 @@
 } // namespace kernels
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */
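
The k0 validation in the kernel above relies on the standard bit trick that a power of two has exactly one set bit, so v & (v - 1) clears it to zero. A minimal standalone sketch of the accepted set named in the error message (hypothetical helpers, not library code):

    #include <cassert>
    #include <initializer_list>

    // A power of two has a single set bit, so clearing the lowest set bit
    // with v & (v - 1) yields zero.
    static bool is_power_of_two(unsigned v)
    {
        return v != 0 && (v & (v - 1)) == 0;
    }

    // Mirrors the intent of "Only 2,3,4,8,16 are supported for k0":
    // powers of two in [2, 16], plus the special case 3.
    static bool is_valid_k0(unsigned k0)
    {
        return (is_power_of_two(k0) || k0 == 3) && k0 >= 2 && k0 <= 16;
    }

    int main()
    {
        for (unsigned k0 : {2u, 3u, 4u, 8u, 16u})
            assert(is_valid_k0(k0));
        for (unsigned k0 : {0u, 1u, 5u, 6u, 9u, 32u})
            assert(!is_valid_k0(k0));
    }
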
diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
index b3a0388..b9ce387 100644
--- a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp
@@ -31,6 +31,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -52,8 +53,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0);
     ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)),
+                                    "Only 1,2,3,4,8,16 are supported for k0");
     ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16);
     ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16);
     ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose));
@@ -61,15 +64,17 @@
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
     ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
 
-    if(rhs_info.export_to_cl_image)
+    if (rhs_info.export_to_cl_image)
     {
-        const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, src->data_type());
+        const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1,
+                                              src->data_type());
         ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info));
     }
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
     }
@@ -77,23 +82,27 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
 {
     const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0;
     const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0;
     bool               window_changed                      = false;
 
     // dst auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)));
+    auto_init_if_empty(
+        *dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)));
 
     // Configure window
-    Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    Window win =
+        calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
-    AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
+    AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x,
+                                     num_elems_processed_per_iteration_y);
 
     window_changed = update_window_and_padding(win, src_access);
 
-    if(rhs_info.export_to_cl_image)
+    if (rhs_info.export_to_cl_image)
     {
         gemm::update_padding_for_cl_image(dst);
     }
@@ -102,7 +111,8 @@
     // This collapse needs to be here in order to tune the Z dimension of LWS
     Window collapsed = win.collapse(win, Window::DimZ);
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, collapsed);
 }
 } // namespace
@@ -112,7 +122,10 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
+void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext  &compile_context,
+                                             ITensorInfo             *src,
+                                             ITensorInfo             *dst,
+                                             const GEMMRHSMatrixInfo &rhs_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
@@ -143,7 +156,9 @@
     _kernel.setArg<cl_int>(idx++, rhs_info.h0);
 }
 
-Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info)
+Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo       *src,
+                                              const ITensorInfo       *dst,
+                                              const GEMMRHSMatrixInfo &rhs_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, rhs_info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), rhs_info).first);
@@ -156,8 +171,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
@@ -169,9 +185,8 @@
         add_3d_tensor_nhw_argument(idx, src);
         add_3d_tensor_nhw_argument(idx, dst);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
index 31eaa46..7203d57 100644
--- a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
+++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
@@ -66,7 +66,10 @@
      *                             rhs_info.transpose: true, false
      *                             rhs_info.interleave: true, false
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info);
+    void configure(const ClCompileContext  &compile_context,
+                   ITensorInfo             *src,
+                   ITensorInfo             *dst,
+                   const GEMMRHSMatrixInfo &rhs_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClGemmReshapeRhsMatrixKernel::configure()
@@ -81,4 +84,4 @@
 } // namespace kernels
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */
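
Both reshape-kernel headers above end with a "\ No newline at end of file" marker followed by the same line re-emitted: the only change is the previously missing trailing newline. A small hypothetical checker (not part of the library) for the property being fixed:

    #include <fstream>

    // Returns true when the file's last byte is the trailing newline that
    // POSIX requires for a complete final line.
    bool ends_with_newline(const char *path)
    {
        std::ifstream f(path, std::ios::binary);
        if (!f)
            return false;
        f.seekg(-1, std::ios::end); // fails (and so returns false) on empty files
        return f.get() == '\n';
    }
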
diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
index 719201d..2e1cefc 100644
--- a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
+++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp
@@ -30,10 +30,10 @@
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -52,7 +52,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY));
 
     ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0));
-    for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
+    for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
     }
@@ -62,8 +62,7 @@
 }
 } // namespace
 
-ClHeightConcatenateKernel::ClHeightConcatenateKernel()
-    : _height_offset(0)
+ClHeightConcatenateKernel::ClHeightConcatenateKernel() : _height_offset(0)
 {
     _type = CLKernelType::ELEMENTWISE;
 }
@@ -74,12 +73,15 @@
     return Status{};
 }
 
-void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst)
+void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context,
+                                          ITensorInfo            *src,
+                                          unsigned int            height_offset,
+                                          ITensorInfo            *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst));
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     _height_offset = height_offset;
 
@@ -90,9 +92,10 @@
     build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
 
-    if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+    if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
     {
         const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
         const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
@@ -125,8 +128,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     unsigned int idx = 0;
     add_4D_tensor_argument(idx, src, window);
diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h
index d3c077f..5a391a1 100644
--- a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h
+++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h
@@ -50,7 +50,8 @@
      * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
      *
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst);
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClHeightConcatenateKernel::configure()
@@ -64,7 +65,7 @@
 
 private:
     unsigned int _height_offset;
-    int32_t      _depth{ 0 };
+    int32_t      _depth{0};
 };
 } // namespace kernels
 } // namespace opencl
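
The -DVEC_SIZE and -DVEC_SIZE_LEFTOVER build options set in the concatenate kernel above pass the OpenCL kernel its vector width and the size of the partial vector at the end of a row. A scalar C++ model of that split (an assumed simplification of the device code):

    #include <cstddef>

    // Processes `width` elements in chunks of `vec_size`; the final
    // `width % vec_size` elements form the leftover (partial) vector.
    void process_row(const float *src, float *dst, std::size_t width, std::size_t vec_size)
    {
        const std::size_t leftover = width % vec_size; // -DVEC_SIZE_LEFTOVER
        const std::size_t full     = width - leftover;
        for (std::size_t x = 0; x < full; x += vec_size)
            for (std::size_t i = 0; i < vec_size; ++i) // stands in for a vload/vstore
                dst[x + i] = src[x + i];
        for (std::size_t x = full; x < width; ++x) // leftover tail
            dst[x] = src[x];
    }
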
diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.cpp b/src/gpu/cl/kernels/ClIm2ColKernel.cpp
index e890847..ef7a528 100644
--- a/src/gpu/cl/kernels/ClIm2ColKernel.cpp
+++ b/src/gpu/cl/kernels/ClIm2ColKernel.cpp
@@ -29,9 +29,10 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -60,13 +61,19 @@
     bool                  is_padding_required_nchw{};
 };
 
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
-                          unsigned int num_groups)
+Status validate_arguments(const ITensorInfo   *src,
+                          const ITensorInfo   *dst,
+                          const Size2D        &kernel_dims,
+                          const PadStrideInfo &conv_info,
+                          bool                 has_bias,
+                          const Size2D        &dilation,
+                          unsigned int         num_groups)
 {
     const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL);
 
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && has_bias);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
     ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1));
@@ -82,9 +89,10 @@
     const unsigned     total_height = src->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom();
     ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height));
 
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
-        const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
+        const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(
+            compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
@@ -93,13 +101,21 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
-                                                        unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo         *src,
+                                                        ITensorInfo         *dst,
+                                                        const Size2D        &kernel_dims,
+                                                        const PadStrideInfo &conv_info,
+                                                        bool                 has_bias,
+                                                        const Size2D        &dilation,
+                                                        unsigned int         num_elems_processed_per_iteration,
+                                                        bool                 is_padding_required_nchw,
+                                                        unsigned int         num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
     // Output tensor auto initialization if not yet initialized
-    TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups);
+    TensorShape expected_output_shape =
+        compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups);
 
     auto_init_if_empty(*dst, src->clone()->set_tensor_shape(expected_output_shape));
 
@@ -113,22 +129,22 @@
     bool   window_changed = false;
     Window win;
 
-    if(data_layout == DataLayout::NHWC)
+    if (data_layout == DataLayout::NHWC)
     {
         win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
     }
     else
     {
-        if(is_padding_required_nchw)
+        if (is_padding_required_nchw)
         {
-            const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left());
-            win = calculate_max_window(*src,
-                                       Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
-            AccessWindowStatic input_access(src,
-                                            -border.left,
-                                            -border.top,
-                                            ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration),
-                                            input_height + border.bottom);
+            const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(),
+                                    conv_info.pad_left());
+            win = calculate_max_window(
+                *src, Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second));
+            AccessWindowStatic input_access(
+                src, -border.left, -border.top,
+                ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration),
+                input_height + border.bottom);
             window_changed = window_changed || update_window_and_padding(win, input_access);
         }
         else
@@ -142,11 +158,17 @@
     // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension
     win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start());
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 
-Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups)
+Im2ColConfiguration configure_opencl_kernel(const ITensorInfo   *src,
+                                            const Size2D        &kernel_dims,
+                                            const PadStrideInfo &conv_info,
+                                            bool                 has_bias,
+                                            const Size2D        &dilation,
+                                            unsigned int         num_groups)
 {
     const DataLayout   data_layout   = src->data_layout();
     const DataType     data_type     = src->data_type();
@@ -157,7 +179,8 @@
     const unsigned int input_height  = src->dimension(height_idx);
     const unsigned int input_channel = src->dimension(channel_idx);
 
-    const std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
+    const std::pair<unsigned int, unsigned int> convolved_dims =
+        scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
 
     // Im2Col configuration
     std::string                   kernel_name = "im2col_generic_";
@@ -184,21 +207,22 @@
     build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x()));
     build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y()));
     build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
-    build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0");
+    build_opts.add_option_if_else(is_data_type_quantized(data_type),
+                                  "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0");
     build_opts.add_option_if(has_bias, "-DHAS_BIAS");
 
-    if(data_layout == DataLayout::NHWC)
+    if (data_layout == DataLayout::NHWC)
     {
         num_elems_processed_per_iteration = std::min(2U, input_channel);
         is_padding_required_nchw          = false;
 
         // Only the 3x3 and 9x9 cases are optimized for NHWC
-        if(kernel_dims == Size2D(3U, 3U))
+        if (kernel_dims == Size2D(3U, 3U))
         {
             kernel_name = "im2col3x3_";
             build_opts.add_option("-DIM2COL_3X3");
         }
-        else if(kernel_dims == Size2D(9U, 9U))
+        else if (kernel_dims == Size2D(9U, 9U))
         {
             kernel_name = "im2col9x9_";
             build_opts.add_option("-DIM2COL_9X9");
@@ -219,17 +243,17 @@
     }
     else
     {
-        if(dilation == Size2D(1U, 1U))
+        if (dilation == Size2D(1U, 1U))
         {
             const bool squared_im2col = kernel_dims.width == kernel_dims.height;
-            if(squared_im2col)
+            if (squared_im2col)
             {
                 // Check if we can run an optimized im2col for NCHW
-                switch(kernel_dims.width)
+                switch (kernel_dims.width)
                 {
                     case 1:
                         // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false
-                        if(conv_info.stride().first == 1 && !conv_info.has_padding())
+                        if (conv_info.stride().first == 1 && !conv_info.has_padding())
                         {
                             kernel_name                       = "im2col1x1_stridex1_";
                             num_elems_processed_per_iteration = 4;
@@ -248,7 +272,7 @@
                         break;
                     case 11:
                         // Optimized im2col11x11 if pad_x = pad_y = 0
-                        if(!conv_info.has_padding())
+                        if (!conv_info.has_padding())
                         {
                             kernel_name                       = "im2col11x11_padx0_pady0_";
                             num_elems_processed_per_iteration = 1;
@@ -262,7 +286,7 @@
                         break;
                 }
             }
-            else if(kernel_dims.width > 1 && !conv_info.has_padding())
+            else if (kernel_dims.width > 1 && !conv_info.has_padding())
             {
                 kernel_name                       = "im2col_generic_padx0_pady0_";
                 num_elems_processed_per_iteration = 1;
@@ -297,19 +321,29 @@
 } // namespace
 
 ClIm2ColKernel::ClIm2ColKernel()
-    : _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups()
+    : _data_layout(DataLayout::UNKNOWN),
+      _convolved_dims(),
+      _num_elems_processed_per_iteration(1),
+      _kernel_dims(),
+      _conv_info(),
+      _num_groups()
 {
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
-                               const Size2D &dilation,
-                               unsigned int  num_groups)
+void ClIm2ColKernel::configure(const ClCompileContext &compile_context,
+                               ITensorInfo            *src,
+                               ITensorInfo            *dst,
+                               const Size2D           &kernel_dims,
+                               const PadStrideInfo    &conv_info,
+                               bool                    has_bias,
+                               const Size2D           &dilation,
+                               unsigned int            num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups));
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
     _data_layout      = src->data_layout();
 
     const unsigned int width_idx    = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
@@ -320,19 +354,22 @@
     // Select and configure the optimal OpenCL kernel to run.
     // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration
     // and the padding requirement flag
-    Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups);
+    Im2ColConfiguration im2col_config =
+        configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups);
 
     // Create kernel
     _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options);
 
-    _convolved_dims                    = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
+    _convolved_dims =
+        scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation);
     _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration;
     _kernel_dims                       = kernel_dims; // Only needed by the Tuner
     _conv_info                         = conv_info;   // Only needed by the Tuner
     _num_groups                        = num_groups;
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
+    auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation,
+                                                    im2col_config.num_elems_processed_per_iteration,
                                                     im2col_config.is_padding_required_nchw, num_groups);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     IClKernel::configure_internal(win_config.second);
@@ -353,14 +390,22 @@
     ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info));
 }
 
-Status ClIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation,
-                                unsigned int num_groups)
+Status ClIm2ColKernel::validate(const ITensorInfo   *src,
+                                const ITensorInfo   *dst,
+                                const Size2D        &kernel_dims,
+                                const PadStrideInfo &conv_info,
+                                bool                 has_bias,
+                                const Size2D        &dilation,
+                                unsigned int         num_groups)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups));
-    Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration,
+    Im2ColConfiguration im2col_config =
+        configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims,
+                                                              conv_info, has_bias, dilation,
+                                                              im2col_config.num_elems_processed_per_iteration,
                                                               im2col_config.is_padding_required_nchw, num_groups)
-                                .first);
+                                    .first);
     return Status{};
 }
 
@@ -388,7 +433,7 @@
     Window slice_in  = first_slice_3d;
     Window slice_out = window_output.first_slice_window_2D();
 
-    if(_data_layout == DataLayout::NHWC)
+    if (_data_layout == DataLayout::NHWC)
     {
         const Window tmp_win     = window.collapse_if_possible(ICLKernel::window(), 3);
         const int    num_batches = tmp_win[3].end();
@@ -398,7 +443,10 @@
     }
     else
     {
-        slice.set(0, Window::Dimension(0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration));
+        slice.set(0,
+                  Window::Dimension(
+                      0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)),
+                      _num_elems_processed_per_iteration));
         slice.set(1, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1));
         // Note: In case of NCHW the 3rd dimension is already set collapsing the input window
     }
@@ -414,14 +462,16 @@
     slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
     slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
 
-    unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
+    unsigned int idx = num_arguments_per_3D_tensor() +
+                       (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor());
     _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src->info()->strides_in_bytes()[3]));
-    _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)]));
+    _kernel.setArg<cl_uint>(idx++,
+                            static_cast<unsigned int>(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)]));
     do
     {
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, src, slice_in);
-        if(_num_groups == 1)
+        if (_num_groups == 1)
         {
             add_2D_tensor_argument(idx, dst, slice_out);
         }
@@ -430,8 +480,8 @@
             add_3D_tensor_argument(idx, dst, slice_out);
         }
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in));
+    } while (window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) &&
+             window_collapsed.slide_window_slice_3D(slice_in));
 }
 } // namespace kernels
 } // namespace opencl
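
ClIm2ColKernel derives _convolved_dims through scaled_dimensions(); the sketch below shows the standard output-size formula such a helper computes (a simplification: the real function also honours the rounding policy carried in conv_info):

    #include <cstdio>

    // out = floor((in + pad_begin + pad_end - dilated_kernel) / stride) + 1,
    // where the dilated kernel extent is dilation * (k - 1) + 1.
    static unsigned conv_out_dim(unsigned in, unsigned k, unsigned pad_begin,
                                 unsigned pad_end, unsigned stride, unsigned dilation)
    {
        const unsigned eff_k = dilation * (k - 1) + 1;
        return (in + pad_begin + pad_end - eff_k) / stride + 1;
    }

    int main()
    {
        // 224x224 input, 3x3 kernel, pad 1, stride 1, dilation 1 -> 224
        std::printf("%u\n", conv_out_dim(224, 3, 1, 1, 1, 1));
    }
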
diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.h b/src/gpu/cl/kernels/ClIm2ColKernel.h
index a637ad2..c8cd5b3 100644
--- a/src/gpu/cl/kernels/ClIm2ColKernel.h
+++ b/src/gpu/cl/kernels/ClIm2ColKernel.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Size2D.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -77,28 +78,38 @@
      * @param[in]  dilation        (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias,
-                   const Size2D &dilation   = Size2D(1U, 1U),
-                   unsigned int  num_groups = 1);
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   const Size2D           &kernel_dims,
+                   const PadStrideInfo    &conv_info,
+                   bool                    has_bias,
+                   const Size2D           &dilation   = Size2D(1U, 1U),
+                   unsigned int            num_groups = 1);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClIm2ColKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U),
-                           unsigned int num_groups = 1);
+    static Status validate(const ITensorInfo   *input,
+                           const ITensorInfo   *output,
+                           const Size2D        &kernel_dims,
+                           const PadStrideInfo &conv_info,
+                           bool                 has_bias,
+                           const Size2D        &dilation   = Size2D(1U, 1U),
+                           unsigned int         num_groups = 1);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 public:
-    DataLayout _data_layout;
+    DataLayout                            _data_layout;
     std::pair<unsigned int, unsigned int> _convolved_dims;
-    unsigned int  _num_elems_processed_per_iteration;
-    Size2D        _kernel_dims;
-    PadStrideInfo _conv_info;
-    unsigned int  _num_groups;
+    unsigned int                          _num_elems_processed_per_iteration;
+    Size2D                                _kernel_dims;
+    PadStrideInfo                         _conv_info;
+    unsigned int                          _num_groups;
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp
index d291fad..8c493d0 100644
--- a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp
+++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -43,26 +44,29 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst,
-                          const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc)
+Status validate_arguments(const ITensorInfo                 *src,
+                          const ITensorInfo                 *weights,
+                          const ITensorInfo                 *dst,
+                          const PadStrideInfo               &conv_info,
+                          const DirectConvComputeKernelInfo &desc)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0), "Weights feature map dimension should match the respective src's one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0),
+                                    "Weights feature map dimension should match the respective src's one");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8,
+                                    "M0 can only be greater than 0 and less than or equal to 8");
 
     // Checks performed when dst is configured
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
-                                                           misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(),
-                                                                                                                 src->data_layout(),
-                                                                                                                 weights->tensor_shape(),
-                                                                                                                 conv_info,
-                                                                                                                 desc));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(),
+            misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(),
+                                                                  weights->tensor_shape(), conv_info, desc));
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32);
     }
 
@@ -75,8 +79,12 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst,
-                                                            const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc)
+void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext            &compile_context,
+                                                            ITensorInfo                       *src,
+                                                            ITensorInfo                       *weights,
+                                                            ITensorInfo                       *dst,
+                                                            const PadStrideInfo               &conv_info,
+                                                            const DirectConvComputeKernelInfo &desc)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info, desc));
@@ -85,11 +93,8 @@
     constexpr unsigned int height_idx = 2;
 
     // Get dst shape
-    TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(),
-                                                                                     src->data_layout(),
-                                                                                     weights->tensor_shape(),
-                                                                                     conv_info,
-                                                                                     desc);
+    TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape(
+        src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc);
 
     TensorShape output_conv_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
 
@@ -136,14 +141,19 @@
     // Since this kernel should be called only once, we do not need to set the config_id for tuning
 }
 
-Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst,
-                                                             const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc)
+Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo                 *src,
+                                                             const ITensorInfo                 *weights,
+                                                             const ITensorInfo                 *dst,
+                                                             const PadStrideInfo               &conv_info,
+                                                             const DirectConvComputeKernelInfo &desc)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info, desc));
     return Status{};
 }
 
-void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack      &tensors,
+                                                         const Window     &window,
+                                                         cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h
index ff7f4be..b565609 100644
--- a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h
+++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h
@@ -60,16 +60,23 @@
      * @param[in]  conv_info       Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in]  desc            Direct convolution descriptor used to build the NHWC direct/indirect convolution kernel.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst,
-                   const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc);
+    void configure(const CLCompileContext            &compile_context,
+                   ITensorInfo                       *src,
+                   ITensorInfo                       *weights,
+                   ITensorInfo                       *dst,
+                   const PadStrideInfo               &conv_info,
+                   const DirectConvComputeKernelInfo &desc);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClIndirectConv2dAddressPrecalculationKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst,
-                           const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc);
+    static Status validate(const ITensorInfo                 *src,
+                           const ITensorInfo                 *weights,
+                           const ITensorInfo                 *dst,
+                           const PadStrideInfo               &conv_info,
+                           const DirectConvComputeKernelInfo &desc);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
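
The precalculation kernel above fills the indirect buffer whose shape compute_indirect_buffer_shape() describes: one source offset per (output position, kernel tap) pair, so the convolution itself becomes a gather through the table. A 1-D illustration under assumed semantics (not the library's actual buffer layout):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // For each output x and kernel tap k, record the input coordinate the
    // convolution will read; out-of-range values mark padding.
    std::vector<int32_t> build_offsets(int out_w, int k_w, int stride, int pad)
    {
        std::vector<int32_t> offs(static_cast<std::size_t>(out_w) * k_w);
        for (int x = 0; x < out_w; ++x)
            for (int k = 0; k < k_w; ++k)
                offs[static_cast<std::size_t>(x) * k_w + k] = x * stride + k - pad;
        return offs;
    }
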
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp
index a337eb5..3510b69 100644
--- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp
@@ -23,13 +23,14 @@
  */
 #include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLUtils.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -46,8 +47,14 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *indirect_buffer, const ITensorInfo *dst,
-                          const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
+Status validate_arguments(const ITensorInfo                 *src,
+                          const ITensorInfo                 *weights,
+                          const ITensorInfo                 *biases,
+                          const ITensorInfo                 *indirect_buffer,
+                          const ITensorInfo                 *dst,
+                          const PadStrideInfo               &conv_info,
+                          const ActivationLayerInfo         &act_info,
+                          const DirectConvComputeKernelInfo &desc)
 {
     ARM_COMPUTE_UNUSED(act_info);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
@@ -55,37 +62,38 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indirect_buffer, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(indirect_buffer->tensor_shape(),
-                                                       misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(),
-                                                                                                             src->data_layout(),
-                                                                                                             weights->tensor_shape(),
-                                                                                                             conv_info,
-                                                                                                             desc));
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+        indirect_buffer->tensor_shape(),
+        misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(),
+                                                              weights->tensor_shape(), conv_info, desc));
 
     constexpr int channel_idx = 0;
     constexpr int batch_idx   = 3;
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx),
+                                    "Weights feature map dimension should match the respective src's one");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8,
+                                    "M0 can only be greater than 0 and less than or equal to 8");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16,
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 &&
+                                        desc.n0 != 16,
                                     "N0 can only be: 1, 2, 3, 4, 8, and 16");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 &&
+                                        desc.k0 != 16,
                                     "K0 can only be: 1, 2, 3, 4, 8, and 16");
 
-    if(desc.export_weights_to_cl_image)
+    if (desc.export_weights_to_cl_image)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16,
-                                        "K0 can only be: 4, 8, and 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, "K0 can only be: 4, 8, and 16");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(weights),
                                         "Export to CLImage is not supported for this weight configuration");
     }
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
-        if(is_data_type_quantized_asymmetric(src->data_type()))
+        if (is_data_type_quantized_asymmetric(src->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
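
The S32 requirement on biases in the quantized branch above follows from how QASYMM8 convolution accumulates: products of zero-point-corrected 8-bit values are summed in int32, the bias is added while the value is still int32, and only then is the result requantized. A toy sketch of that ordering, with made-up scale and offset values rather than the library's requantization helpers:

    #include <cstdint>

    // Toy QASYMM8 dot product: accumulate in int32, add the S32 bias, then
    // requantize to uint8. Scale/offset handling is deliberately simplistic.
    static uint8_t toy_quantized_output(const uint8_t *src, const uint8_t *wei, int len,
                                        int32_t bias, int32_t src_offset, int32_t wei_offset,
                                        float result_scale, int32_t dst_offset)
    {
        int32_t acc = 0;
        for (int i = 0; i < len; ++i)
        {
            acc += (static_cast<int32_t>(src[i]) - src_offset) *
                   (static_cast<int32_t>(wei[i]) - wei_offset);
        }
        acc += bias; // bias joins while still int32, hence DataType::S32
        const int32_t q = static_cast<int32_t>(acc * result_scale) + dst_offset;
        return static_cast<uint8_t>(q < 0 ? 0 : (q > 255 ? 255 : q)); // clamp to uint8
    }
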
@@ -95,15 +103,14 @@
         }
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(channel_idx) != weights->dimension(batch_idx),
                                         "Biases size and number of dst feature maps should match");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
-                                        "Biases should be one dimensional");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional");
     }
 
     // Checks performed when dst is configured
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(),
-                                                           misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
     }
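
compute_deep_convolution_shape, checked against dst above, reduces per spatial axis to the usual convolution extent formula. A standalone sketch of just that arithmetic, leaving out the NHWC layout bookkeeping the real shape calculator performs:

    #include <cstdio>

    // Output extent along one axis: floor((in + pads - kernel) / stride) + 1
    static unsigned int conv_out_dim(unsigned int in, unsigned int kernel,
                                     unsigned int pad_before, unsigned int pad_after,
                                     unsigned int stride)
    {
        return (in + pad_before + pad_after - kernel) / stride + 1;
    }

    int main()
    {
        // e.g. 224-wide input, 3-wide kernel, pad 1 each side, stride 2 -> 112
        std::printf("%u\n", conv_out_dim(224, 3, 1, 1, 2));
        return 0;
    }
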
 
@@ -116,13 +123,21 @@
     _type = CLKernelType::DIRECT;
 }
 
-void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *indirect_buffer, ITensorInfo *dst,
-                                       const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
+void ClIndirectConv2dKernel::configure(const CLCompileContext            &compile_context,
+                                       ITensorInfo                       *src,
+                                       ITensorInfo                       *weights,
+                                       ITensorInfo                       *biases,
+                                       ITensorInfo                       *indirect_buffer,
+                                       ITensorInfo                       *dst,
+                                       const PadStrideInfo               &conv_info,
+                                       const ActivationLayerInfo         &act_info,
+                                       const DirectConvComputeKernelInfo &desc)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, indirect_buffer, dst);
 
     // Perform validation
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc));
 
     constexpr unsigned int channel_idx   = 0;
     constexpr unsigned int width_idx     = 1;
@@ -137,10 +152,7 @@
     TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*dst, output_shape,
-                       1,
-                       src->data_type(),
-                       src->quantization_info());
+    auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info());
 
     // Configure kernel window
     Window win;
@@ -164,7 +176,7 @@
     _export_to_cl_image = desc.export_weights_to_cl_image;
 
     // Update the padding for the weights tensor if we can export to cl_image
-    if(_export_to_cl_image)
+    if (_export_to_cl_image)
     {
         gemm::update_padding_for_cl_image(weights);
     }
@@ -173,11 +185,12 @@
     // When M0 is 5, 6, and 7, we use vload8 to fetch the data from the buffer
     const unsigned int load_indirect_buf_size = m0 > 4 ? 8 : m0;
     const unsigned int indirect_buf_width     = indirect_buffer->tensor_shape()[0];
-    const unsigned int round_up_width         = ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size;
-    const unsigned int padding                = round_up_width - indirect_buf_width;
+    const unsigned int round_up_width =
+        ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size;
+    const unsigned int padding = round_up_width - indirect_buf_width;
     indirect_buffer->extend_padding(PaddingSize(0, indirect_buffer->padding().right + padding, 0, 0));
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         build_options.add_option(std::string("-DHAS_BIAS"));
         build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
@@ -186,9 +199,10 @@
     // Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324
     const auto act_function = act_info.activation();
 
-    if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
-       && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-       && (data_type == DataType::F32 || data_type == DataType::F16))
+    if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+        (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+         act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+        (data_type == DataType::F32 || data_type == DataType::F16))
     {
         // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
         // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
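
The two comment lines above encode the workaround: -cl-fast-relaxed-math implies -cl-finite-math-only, and on the affected Bifrost targets that combination caused accuracy issues for the bounded ReLU activations (COMPMID-5324), so the build falls back to -cl-unsafe-math-optimizations alone. A hedged sketch of that flag selection; the chooser function is illustrative, only the two option strings are real OpenCL compiler flags:

    #include <string>

    // Pick the math-optimization build flag. For accuracy-sensitive cases use
    // the weaker flag so that -cl-finite-math-only is never pulled in.
    static std::string math_build_option(bool needs_accurate_clamping)
    {
        return needs_accurate_clamping ? "-cl-unsafe-math-optimizations"
                                       : "-cl-fast-relaxed-math";
    }
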
@@ -224,7 +238,7 @@
     // A macro guard to compile ONLY the kernel of interest
     build_options.add_option("-D" + upper_string(kernel_name.str()));
 
-    if(compile_context.get_ddk_version() >= 30)
+    if (compile_context.get_ddk_version() >= 30)
     {
         build_options.add_option("-fregister-allocation=64");
     }
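
The "-D" + upper_string(kernel_name) option in this hunk works because the shared .cl sources guard each kernel body with a matching preprocessor check, so only the requested kernel is compiled. A sketch of building that guard define on the host side; the transform helper is a stand-in for the library's upper_string:

    #include <algorithm>
    #include <cctype>
    #include <string>

    // Build the macro-guard define, e.g. "indirect_convolution_nhwc" becomes
    // "-DINDIRECT_CONVOLUTION_NHWC"; the .cl file compiles only guarded code.
    static std::string kernel_guard_option(std::string kernel_name)
    {
        std::transform(kernel_name.begin(), kernel_name.end(), kernel_name.begin(),
                       [](unsigned char c) { return static_cast<char>(std::toupper(c)); });
        return "-D" + kernel_name;
    }
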
@@ -253,10 +267,17 @@
     _config_id += support::cpp11::to_string(dst->dimension(channel_idx));
 }
 
-Status ClIndirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *indirect_buffer, const ITensorInfo *dst,
-                                        const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc)
+Status ClIndirectConv2dKernel::validate(const ITensorInfo                 *src,
+                                        const ITensorInfo                 *weights,
+                                        const ITensorInfo                 *biases,
+                                        const ITensorInfo                 *indirect_buffer,
+                                        const ITensorInfo                 *dst,
+                                        const PadStrideInfo               &conv_info,
+                                        const ActivationLayerInfo         &act_info,
+                                        const DirectConvComputeKernelInfo &desc)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc));
     return Status{};
 }
 
@@ -268,35 +289,42 @@
     // Get initial windows
     Window slice = window.first_slice_window_3D();
 
-    const auto src             = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto weights         = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const auto biases          = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
-    const auto indirect_buffer = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_3));
-    auto       dst             = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto weights =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto biases =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    const auto indirect_buffer =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_3));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     cl::Image2D weights_cl_image;
 
-    if(_export_to_cl_image)
+    if (_export_to_cl_image)
     {
-        const size_t      image_w = weights->info()->dimension(0) / 4;
-        const size_t      image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3);
+        const size_t image_w = weights->info()->dimension(0) / 4;
+        const size_t image_h =
+            weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3);
         const TensorShape shape2d(image_w, image_h);
         const size_t      image_row_pitch = weights->info()->strides_in_bytes()[1];
 
         // Export cl_buffer to cl_image
-        weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+        weights_cl_image =
+            create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d,
+                                       weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
     }
 
     unsigned int idx = 0;
     add_4d_tensor_nhwc_argument(idx, src);
     add_4d_tensor_nhwc_argument(idx, indirect_buffer);
     add_4d_tensor_nhwc_argument(idx, dst);
-    if(_export_to_cl_image)
+    if (_export_to_cl_image)
     {
         _kernel.setArg(idx++, weights_cl_image);
     }
     add_4d_tensor_nhwc_argument(idx, weights);
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         add_1D_tensor_argument(idx, biases, slice);
     }
diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h
index b6c7b35..04166d4 100644
--- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h
+++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -60,22 +61,35 @@
      * @param[in]  act_info        Contains activation information described in @ref ActivationLayerInfo.
      * @param[in]  desc            Direct convolution descriptor used to build the NHWC indirect convolution kernel.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *off, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
-                   const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc);
+    void configure(const CLCompileContext            &compile_context,
+                   ITensorInfo                       *src,
+                   ITensorInfo                       *off,
+                   ITensorInfo                       *weights,
+                   ITensorInfo                       *biases,
+                   ITensorInfo                       *dst,
+                   const PadStrideInfo               &conv_info,
+                   const ActivationLayerInfo         &act_info,
+                   const DirectConvComputeKernelInfo &desc);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClIndirectConv2dKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *off, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                           const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc);
+    static Status validate(const ITensorInfo                 *src,
+                           const ITensorInfo                 *off,
+                           const ITensorInfo                 *weights,
+                           const ITensorInfo                 *biases,
+                           const ITensorInfo                 *dst,
+                           const PadStrideInfo               &conv_info,
+                           const ActivationLayerInfo         &act_info,
+                           const DirectConvComputeKernelInfo &desc);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 public:
-    bool _export_to_cl_image{ false };
+    bool _export_to_cl_image{false};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
index 66331bc..0bb6b0c 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp
@@ -29,17 +29,16 @@
 #include "arm_compute/core/QuantizationInfo.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
 
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -62,51 +61,62 @@
     // Validate M0
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
 
-    if(adj_lhs)
+    if (adj_lhs)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16),
+                                        "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
     }
 
     // Validate N0
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16),
+                                    "Only 1,2,3,4,8,16 are supported for N0");
 
     // Validate K0
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0");
-    if(!adj_lhs || adj_rhs)
+    if (!adj_lhs || adj_rhs)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16),
+                                        "Only 1,2,3,4,8,16 are supported for K0");
     }
 
     return Status{};
 }
-}
+} // namespace
 ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel()
 {
     _type = CLKernelType::GEMM;
 }
-Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+Status ClMatMulLowpNativeKernel::validate(const ITensorInfo         *lhs,
+                                          const ITensorInfo         *rhs,
+                                          const ITensorInfo         *bias,
+                                          const ITensorInfo         *dst,
+                                          const MatMulKernelInfo    &matmul_kernel_info,
                                           const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY && act_info.activation() != ActivationFunction::RELU
-                                     && act_info.activation() != ActivationFunction::LU_BOUNDED_RELU && act_info.activation() != ActivationFunction::BOUNDED_RELU),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY &&
+                                     act_info.activation() != ActivationFunction::RELU &&
+                                     act_info.activation() != ActivationFunction::LU_BOUNDED_RELU &&
+                                     act_info.activation() != ActivationFunction::BOUNDED_RELU),
                                     "Activation Function specified is unsupported.");
-    const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
+    const TensorShape expected_output_shape =
+        misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
     }
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
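
The (v & (v - 1)) tests used throughout these M0/N0/K0 checks are the classic power-of-two probe: the AND clears to zero exactly when v has a single set bit. Combined with the explicit v != 3 carve-out and the v > 16 cap, the accepted set is exactly {1, 2, 3, 4, 8, 16}. A self-checking restatement of the predicate:

    #include <cassert>
    #include <initializer_list>

    // Allowed block sizes: powers of two up to 16, plus 3.
    static bool is_supported_block_size(int v)
    {
        if (v < 1 || v > 16)
        {
            return false;
        }
        return ((v & (v - 1)) == 0) || (v == 3); // (v & (v - 1)) == 0 <=> power of two
    }

    int main()
    {
        for (int v : {1, 2, 3, 4, 8, 16})
        {
            assert(is_supported_block_size(v));
        }
        for (int v : {0, 5, 6, 7, 9, 12, 15, 17})
        {
            assert(!is_supported_block_size(v));
        }
        return 0;
    }
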
@@ -115,7 +125,12 @@
 
     return Status{};
 }
-void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+void ClMatMulLowpNativeKernel::configure(const ClCompileContext    &compile_context,
+                                         ITensorInfo               *lhs,
+                                         ITensorInfo               *rhs,
+                                         ITensorInfo               *bias,
+                                         ITensorInfo               *dst,
+                                         const MatMulKernelInfo    &matmul_kernel_info,
                                          const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
@@ -123,7 +138,8 @@
     ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+                                 lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
 
     const int  m       = dst->dimension(1);
     const int  n       = dst->dimension(0);
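
auto_init_if_empty, used just above, writes dst's metadata only when the caller passed an unconfigured tensor info; a pre-set dst is left untouched, and validate() has already rejected one whose shape disagrees with the computed matmul shape. A bare-bones sketch of that contract with an illustrative descriptor type:

    #include <cstddef>

    // Minimal stand-in for a tensor descriptor: total_size 0 means "empty".
    struct ToyInfo
    {
        std::size_t total_size = 0;
    };

    // Initialize dst only if it is still empty; otherwise trust the caller's
    // configuration (already cross-checked by the validate step).
    static void toy_auto_init_if_empty(ToyInfo &dst, std::size_t computed_size)
    {
        if (dst.total_size == 0)
        {
            dst.total_size = computed_size;
        }
    }
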
@@ -217,10 +233,13 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const ICLTensor *lhs  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const ICLTensor *rhs  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
-    ICLTensor       *dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const ICLTensor *lhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const ICLTensor *rhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const ICLTensor *bias =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
     ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
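
run_op pulls its operands out of the ITensorPack by TensorType slot, and a slot that was never filled simply yields nullptr, which is why the optional bias needs only the null check seen further down. A schematic of that lookup behaviour with a stand-in pack type, not the real ITensorPack:

    #include <map>

    struct ToyTensor
    {
    };
    // Slots keyed by integer ids in the spirit of ACL_SRC_0/1/2 and ACL_DST.
    using ToyPack = std::map<int, const ToyTensor *>;

    // A missing slot comes back as nullptr rather than raising an error, so
    // optional operands are handled with a plain null check at the call site.
    static const ToyTensor *get_const_tensor(const ToyPack &pack, int slot)
    {
        const auto it = pack.find(slot);
        return it != pack.end() ? it->second : nullptr;
    }
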
 
@@ -229,7 +248,7 @@
 
     add_3d_tensor_nhw_argument(idx, lhs);
     add_3d_tensor_nhw_argument(idx, rhs);
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         add_3d_tensor_nhw_argument(idx, bias);
     }
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
index 64415f4..ffdb720 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h
@@ -25,6 +25,7 @@
 #define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -54,7 +55,12 @@
      * @param[in]  matmul_kernel_info Attributes for Batch MatMul Kernel
      * @param[in]  act_info           (Optional) Class containing information about fused activation function.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *lhs,
+                   ITensorInfo               *rhs,
+                   ITensorInfo               *bias,
+                   ITensorInfo               *dst,
+                   const MatMulKernelInfo    &matmul_kernel_info,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
@@ -62,7 +68,11 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+    static Status validate(const ITensorInfo         *lhs,
+                           const ITensorInfo         *rhs,
+                           const ITensorInfo         *bias,
+                           const ITensorInfo         *dst,
+                           const MatMulKernelInfo    &matmul_kernel_info,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp
index 464212d..94e3c4e 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp
@@ -28,10 +28,10 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/utils/StringUtils.h"
 
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -64,13 +64,15 @@
     // Validate M0
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
 
-    if(adj_lhs)
+    if (adj_lhs)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16),
+                                        "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
     }
 
     // Validate N0
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), "Only 1,2,3,4,8,16 are supported for N0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16),
+                                    "Only 1,2,3,4,8,16 are supported for N0");
 
     // Validate K0
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 4), "Only 4 is supported for k0");
@@ -84,7 +86,11 @@
     _type = CLKernelType::GEMM;
 }
 
-Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo         *lhs,
+                                              const ITensorInfo         *rhs,
+                                              const ITensorInfo         *bias,
+                                              const ITensorInfo         *dst,
+                                              const MatMulKernelInfo    &matmul_kernel_info,
                                               const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
@@ -102,16 +108,17 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY),
                                     "Activation Function specified is unsupported.");
-    const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info);
+    const TensorShape expected_output_shape =
+        misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info);
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
     }
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
@@ -121,15 +128,21 @@
     return Status{};
 }
 
-void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst,
-                                             const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info)
+void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext    &compile_context,
+                                             ITensorInfo               *lhs,
+                                             ITensorInfo               *rhs,
+                                             ITensorInfo               *bias,
+                                             ITensorInfo               *dst,
+                                             const MatMulKernelInfo    &matmul_kernel_info,
+                                             const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
     ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info, act_info);
     ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+                                 lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
 
     ARM_COMPUTE_UNUSED(compile_context, lhs, rhs, bias, matmul_kernel_info, act_info);
     CLBuildOptions build_opts;
@@ -147,7 +160,8 @@
     const unsigned int n0_leftover = n % n0;
 
     // Configure kernel window
-    const auto win_config = validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0);
+    const auto win_config =
+        validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     IClKernel::configure_internal(win_config.second);
 
@@ -215,10 +229,13 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto *lhs  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto *rhs  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const auto *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
-    auto       *dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto *lhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto *rhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(
+        tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
+    auto *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
     ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
@@ -227,7 +244,7 @@
     add_3d_tensor_nhw_argument(idx, lhs);
     add_3d_tensor_nhw_argument(idx, rhs);
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         add_3d_tensor_nhw_argument(idx, bias);
     }
diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h
index d2aa40b..6c56f15 100644
--- a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h
@@ -25,6 +25,7 @@
 #define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -49,7 +50,12 @@
      *
      * @return a status
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *lhs,
+                   ITensorInfo               *rhs,
+                   ITensorInfo               *bias,
+                   ITensorInfo               *dst,
+                   const MatMulKernelInfo    &matmul_kernel_info,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
@@ -57,7 +63,11 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+    static Status validate(const ITensorInfo         *lhs,
+                           const ITensorInfo         *rhs,
+                           const ITensorInfo         *bias,
+                           const ITensorInfo         *dst,
+                           const MatMulKernelInfo    &matmul_kernel_info,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
index 41ba5d5..a1fa9fa 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp
@@ -28,9 +28,9 @@
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/utils/ActivationFunctionUtils.h"
-#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
 
 #include "src/common/utils/Log.h"
 #include "src/core/CL/CLUtils.h"
@@ -38,7 +38,6 @@
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 #include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -61,20 +60,23 @@
     // Validate M0
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
 
-    if(adj_lhs)
+    if (adj_lhs)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16),
+                                        "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
     }
 
     // Validate N0
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16),
+                                    "Only 1,2,3,4,8,16 are supported for N0");
 
     // Validate K0
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0");
-    if(!adj_lhs || adj_rhs)
+    if (!adj_lhs || adj_rhs)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16),
+                                        "Only 1,2,3,4,8,16 are supported for K0");
     }
 
     return Status{};
@@ -83,30 +85,37 @@
 Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInfo &matmul_kernel_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(matmul_kernel_info.export_rhs_to_cl_image && rhs->lock_paddings());
-    if(matmul_kernel_info.export_rhs_to_cl_image)
+    if (matmul_kernel_info.export_rhs_to_cl_image)
     {
-        if(matmul_kernel_info.adj_rhs)
+        if (matmul_kernel_info.adj_rhs)
         {
             const int k0 = matmul_kernel_info.k0;
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16, "K0 can only be: 4, 8, and 16 for Rhs transposed");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16,
+                                            "K0 can only be: 4, 8, and 16 for Rhs transposed");
         }
         else
         {
             const int n0 = matmul_kernel_info.n0;
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16, "N0 can only be: 4, 8, and 16 for Rhs non-transposed");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16,
+                                            "N0 can only be: 4, 8, and 16 for Rhs non-transposed");
         }
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), "Export to CLImage is not supported for this device/configuration");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs),
+                                        "Export to CLImage is not supported for this device/configuration");
     }
 
     return Status{};
 }
-}
+} // namespace
 ClMatMulNativeKernel::ClMatMulNativeKernel()
 {
     _type = CLKernelType::GEMM;
 }
 
-Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+Status ClMatMulNativeKernel::validate(const ITensorInfo         *lhs,
+                                      const ITensorInfo         *rhs,
+                                      const ITensorInfo         *bias,
+                                      const ITensorInfo         *dst,
+                                      const MatMulKernelInfo    &matmul_kernel_info,
                                       const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
@@ -114,28 +123,36 @@
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_export_to_cl_image(rhs, matmul_kernel_info));
 
-    const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
+    const TensorShape expected_output_shape =
+        misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info);
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
     }
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, lhs);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported.");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], "First dimension of bias and output tensors must match.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0],
+                                        "First dimension of bias and output tensors must match.");
     }
 
     return Status{};
 }
-void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+void ClMatMulNativeKernel::configure(const ClCompileContext    &compile_context,
+                                     ITensorInfo               *lhs,
+                                     ITensorInfo               *rhs,
+                                     ITensorInfo               *bias,
+                                     ITensorInfo               *dst,
+                                     const MatMulKernelInfo    &matmul_kernel_info,
                                      const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info);
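
expected_output_shape above comes from compute_matmul_shape, which under the library's dim0-is-x convention takes lhs as [K, M] (or [M, K] when adj_lhs) and rhs as [N, K] (or [K, N] when adj_rhs) and yields dst as [N, M]. A sketch of just that dimension shuffle, batch dimensions omitted:

    #include <cassert>

    struct Shape2D
    {
        unsigned int x, y; // dim0 = x = columns, dim1 = y = rows
    };

    // dst is [N, M]; where M and N live in the inputs depends on the adjoint flags.
    static Shape2D matmul_output_shape(Shape2D lhs, Shape2D rhs, bool adj_lhs, bool adj_rhs)
    {
        const unsigned int m = adj_lhs ? lhs.x : lhs.y;
        const unsigned int n = adj_rhs ? rhs.y : rhs.x;
        return Shape2D{n, m};
    }

    int main()
    {
        // lhs [K=8, M=4] times rhs [N=6, K=8] -> dst [N=6, M=4]
        const Shape2D dst = matmul_output_shape({8, 4}, {6, 8}, false, false);
        assert(dst.x == 6 && dst.y == 4);
        return 0;
    }
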
@@ -143,7 +160,8 @@
     ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+                                 lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
 
     const int  m       = dst->dimension(1);
     const int  n       = dst->dimension(0);
@@ -187,7 +205,7 @@
     // A macro guard to compile ONLY the kernel of interest
     build_opts.add_option("-D" + upper_string(kernel_name));
 
-    if(_export_rhs_to_cl_image)
+    if (_export_rhs_to_cl_image)
     {
         gemm::update_padding_for_cl_image(rhs);
     }
@@ -222,10 +240,13 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const ICLTensor *lhs  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const ICLTensor *rhs  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
-    ICLTensor       *dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const ICLTensor *lhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const ICLTensor *rhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(
+        tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
+    ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
     ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
 
@@ -235,7 +256,7 @@
     add_3d_tensor_nhw_argument(idx, lhs);
 
     cl::Image2D rhs_cl_image;
-    if(_export_rhs_to_cl_image)
+    if (_export_rhs_to_cl_image)
     {
         const size_t      image_w = rhs->info()->dimension(0) / 4;
         const size_t      image_h = rhs->info()->tensor_shape().total_size() / rhs->info()->dimension(0);
@@ -243,12 +264,13 @@
         const size_t      image_row_pitch = rhs->info()->strides_in_bytes()[1];
 
         // Export cl_buffer to cl_image
-        rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d, rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
+        rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d,
+                                                  rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly);
         _kernel.setArg(idx++, rhs_cl_image);
     }
 
     add_3d_tensor_nhw_argument(idx, rhs);
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         add_3d_tensor_nhw_argument(idx, bias);
     }
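
The cl_image export above views the RHS buffer as a 2D image that is dimension(0) / 4 texels wide (each texel packs four elements) and total_size / dimension(0) rows tall, with the row pitch read from the byte strides so padded rows line up. A toy recomputation of that geometry:

    #include <cassert>
    #include <cstddef>

    struct ImageGeom
    {
        std::size_t w, h; // width in 4-element texels, height in rows
    };

    // Geometry for exporting a buffer with dim0 elements per row as a cl_image.
    static ImageGeom cl_image_geometry(std::size_t dim0, std::size_t total_elems)
    {
        return ImageGeom{dim0 / 4, total_elems / dim0};
    }

    int main()
    {
        // e.g. an RHS of shape [64, 128]: 16 texels wide, 128 rows tall
        const ImageGeom g = cl_image_geometry(64, 64 * 128);
        assert(g.w == 16 && g.h == 128);
        return 0;
    }
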
diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
index fe2b787..2cb150b 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h
@@ -25,6 +25,7 @@
 #define ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -52,7 +53,12 @@
      * @param[in]  matmul_kernel_info Attributes for Batch MatMul Kernel
      * @param[in]  act_info           (Optional) Specifies activation function to use after Matrix multiplication. Default is Identity function.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *lhs,
+                   ITensorInfo               *rhs,
+                   ITensorInfo               *bias,
+                   ITensorInfo               *dst,
+                   const MatMulKernelInfo    &matmul_kernel_info,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
@@ -60,14 +66,18 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
+    static Status validate(const ITensorInfo         *lhs,
+                           const ITensorInfo         *rhs,
+                           const ITensorInfo         *bias,
+                           const ITensorInfo         *dst,
+                           const MatMulKernelInfo    &matmul_kernel_info,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    bool _export_rhs_to_cl_image{ false };
+    bool _export_rhs_to_cl_image{false};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
index 432270e..76bf846 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
+++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp
@@ -28,14 +28,13 @@
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/StringUtils.h"
 
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -62,31 +61,38 @@
     // Validate M0
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0");
 
-    if(adj_lhs)
+    if (adj_lhs)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16),
+                                        "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed");
     }
 
     // Validate N0
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), "Only 1,2,3,4,8,16 are supported for N0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16),
+                                    "Only 1,2,3,4,8,16 are supported for N0");
 
     // Validate K0
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 1), "Only 1 is supported for k0");
 
     return Status{};
 }
-}
+} // namespace
 ClMatMulNativeMMULKernel::ClMatMulNativeMMULKernel()
 {
     _type = CLKernelType::GEMM;
 }
 
-Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
+Status ClMatMulNativeMMULKernel::validate(const ITensorInfo      *lhs,
+                                          const ITensorInfo      *rhs,
+                                          const ITensorInfo      *bias,
+                                          const ITensorInfo      *dst,
+                                          const MatMulKernelInfo &matmul_kernel_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()),
+                                    "The extension cl_arm_matrix_multiply is not supported on the target platform");
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info));
 
@@ -96,32 +102,40 @@
     const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x();
     ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0);
 
-    const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info);
+    const TensorShape expected_output_shape =
+        misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info);
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst);
     }
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported.");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], "First dimension of bias and output tensors must match.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0],
+                                        "First dimension of bias and output tensors must match.");
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, bias);
     }
 
     return Status{};
 }
-void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info)
+void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context,
+                                         ITensorInfo            *lhs,
+                                         ITensorInfo            *rhs,
+                                         ITensorInfo            *bias,
+                                         ITensorInfo            *dst,
+                                         const MatMulKernelInfo &matmul_kernel_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
     ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info);
     ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info));
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
+    auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(
+                                 lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)));
 
     const int m = dst->dimension(1);
     const int n = dst->dimension(0);
@@ -135,7 +149,8 @@
     const int n0 = adjust_vec_size(matmul_kernel_info.n0, n);
 
     // Configure kernel window
-    const auto win_config = validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0);
+    const auto win_config =
+        validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     IClKernel::configure_internal(win_config.second);
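
adjust_vec_size, used for n0 above and at the other configure sites in this patch, trims the requested processing width so it never exceeds the tensor extent; whatever fails to divide evenly surfaces as the *_leftover build options computed nearby. A hypothetical stand-in that halves until the size fits; the real policy lives in arm_compute/core/utils/helpers/AdjustVecSize.h and may differ in detail:

    // Hypothetical stand-in, not the library routine: shrink the requested
    // vector width until it fits the dimension being processed.
    static unsigned int toy_adjust_vec_size(unsigned int requested, unsigned int dim)
    {
        unsigned int size = requested;
        while (size > dim && size > 1)
        {
            size /= 2;
        }
        return size;
    }
    // The remainder dim % size is what the kernel sees as its leftover amount.
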
 
@@ -186,17 +201,20 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const ICLTensor *lhs  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const ICLTensor *rhs  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
-    ICLTensor       *dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const ICLTensor *lhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const ICLTensor *rhs =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(
+        tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present
+    ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
     ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst);
     unsigned int idx = 0;
 
     add_3d_tensor_nhw_argument(idx, lhs);
     add_3d_tensor_nhw_argument(idx, rhs);
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         add_3d_tensor_nhw_argument(idx, bias);
     }
diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
index 8044897..1aeb896 100644
--- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
+++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h
@@ -72,22 +72,31 @@
      * @param[out] dst             Output tensor info.
      * @param[in]  matmul_info     Attributes for Batch MatMul Kernel
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *lhs,
+                   ITensorInfo            *rhs,
+                   ITensorInfo            *bias,
+                   ITensorInfo            *dst,
+                   const MatMulKernelInfo &matmul_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClMatMulNativeMMULKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info);
+    static Status validate(const ITensorInfo      *lhs,
+                           const ITensorInfo      *rhs,
+                           const ITensorInfo      *bias,
+                           const ITensorInfo      *dst,
+                           const MatMulKernelInfo &matmul_info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    int _m{ 1 };
-    int _n{ 1 };
-    int _k{ 1 };
+    int _m{1};
+    int _n{1};
+    int _k{1};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClMulKernel.cpp b/src/gpu/cl/kernels/ClMulKernel.cpp
index 5ca0639..3b59c2a 100644
--- a/src/gpu/cl/kernels/ClMulKernel.cpp
+++ b/src/gpu/cl/kernels/ClMulKernel.cpp
@@ -23,15 +23,16 @@
  */
 #include "src/gpu/cl/kernels/ClMulKernel.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -46,24 +47,25 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
-                          ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+Status validate_arguments(const ITensorInfo         *src1,
+                          const ITensorInfo         *src2,
+                          const ITensorInfo         *dst,
+                          float                      scale,
+                          ConvertPolicy              overflow_policy,
+                          RoundingPolicy             rounding_policy,
+                          const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(overflow_policy);
     ARM_COMPUTE_UNUSED(rounding_policy);
 
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1,
-                                                         1,
-                                                         DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32,
-                                                         DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2,
-                                                         1,
-                                                         DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                         DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32,
-                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+                                                         DataType::F16, DataType::S32, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+                                                         DataType::F16, DataType::S32, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative.");
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
 
@@ -76,27 +78,35 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
 
     // Validate in case of configured dst
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst,
-                                                             1,
-                                                             DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
-                                                             DataType::S16, DataType::QSYMM16, DataType::F16,
-                                                             DataType::S32, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8),
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8,
+                                                             DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16,
+                                                             DataType::F16, DataType::S32, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 &&
+                                            (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8),
                                         "Dst can only be U8 if both src are U8");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8 && (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8),
-                                        "Dst can only be QASYMM8 if both src are QASYMM8");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8_SIGNED && (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED),
-                                        "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QSYMM16 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16),
-                                        "Dst can only be QSYMM16 if both src are QSYMM16");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32),
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->data_type() == DataType::QASYMM8 &&
+                (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8),
+            "Dst can only be QASYMM8 if both src are QASYMM8");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->data_type() == DataType::QASYMM8_SIGNED &&
+                (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED),
+            "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            dst->data_type() == DataType::QSYMM16 &&
+                (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16),
+            "Dst can only be QSYMM16 if both src are QSYMM16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) &&
+                                            (dst->data_type() != DataType::S32),
                                         "Dst must be S32 if source tensors are S32");
-        if(in_place)
+        if (in_place)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0),
-                                            "Wrong shape for dst, cannot do in_place calculation");
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+                detail::have_different_dimensions(out_shape,
+                                                  src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0),
+                "Wrong shape for dst, cannot do in_place calculation");
         }
         else
         {
@@ -114,14 +124,19 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
-                            ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void ClMulKernel::configure(const CLCompileContext    &compile_context,
+                            ITensorInfo               *src1,
+                            ITensorInfo               *src2,
+                            ITensorInfo               *dst,
+                            float                      scale,
+                            ConvertPolicy              overflow_policy,
+                            RoundingPolicy             rounding_policy,
+                            const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst,
-                                                  scale, overflow_policy, rounding_policy, act_info));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info));
 
-    auto padding_info = get_padding_info({ src1, src2, dst });
+    auto padding_info = get_padding_info({src1, src2, dst});
 
     const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
     auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
@@ -133,7 +148,7 @@
     // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
     // frexp returns 0.5 as mantissa, which means that the exponent will be in the range of -14 <= e <= 1
     // Moreover, it will be non-positive for n >= 1 as we deal with 1/2^n
-    if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+    if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
     {
         // Store the positive exponent. We know that we compute 1/2^n
         // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
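
For reference, the power-of-two detection above can be reproduced in isolation. A minimal sketch under the same 0 <= n <= 15 assumption (hypothetical helper, not part of the kernel):

    #include <cmath>

    // Returns n if scale == 1/2^n for 0 <= n <= 15, otherwise -1.
    int power_of_two_reciprocal(float scale)
    {
        int         exponent = 0;
        const float mantissa = std::frexp(scale, &exponent); // scale = mantissa * 2^exponent, mantissa in [0.5, 1)
        if ((mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
        {
            return 1 - exponent; // 1/2^n = 0.5 * 2^(1 - n), hence n = 1 - exponent
        }
        return -1; // not a power-of-two reciprocal: fall back to float scaling
    }
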
@@ -142,19 +157,19 @@
 
     std::string acc_type;
     // Check if it has float src and dst
-    if(is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type()))
+    if (is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type()))
     {
         scale_int = -1;
         acc_type  = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half";
     }
     else
     {
-        if(src1->element_size() == 4 || src2->element_size() == 4)
+        if (src1->element_size() == 4 || src2->element_size() == 4)
         {
             // Use 64-bit accumulator for 32-bit input
             acc_type = "long";
         }
-        else if(src1->element_size() == 2 || src2->element_size() == 2)
+        else if (src1->element_size() == 2 || src2->element_size() == 2)
         {
             // Use 32-bit accumulator for 16-bit input
             acc_type = "int";
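
The accumulator choice above follows a simple overflow rule: a product of two W-bit integers needs up to 2*W bits. A hedged sketch covering the two cases visible in this hunk (the branch for narrower inputs is elided from the diff, so its body here is an assumption):

    #include <cstddef>
    #include <string>

    // Pick an accumulator at least twice as wide as the widest input element.
    std::string acc_type_for(std::size_t max_element_size_bytes)
    {
        if (max_element_size_bytes == 4)
        {
            return "long"; // 32-bit inputs -> 64-bit accumulator
        }
        if (max_element_size_bytes == 2)
        {
            return "int"; // 16-bit inputs -> 32-bit accumulator
        }
        return "int"; // assumption: 8-bit inputs fit comfortably in 32 bits
    }
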
@@ -176,11 +191,15 @@
     build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type()));
     build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type()));
     build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
-    build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size)));
-    build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size)));
+    build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1)
+                                                   ? "1"
+                                                   : support::cpp11::to_string(vec_size)));
+    build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1)
+                                                   ? "1"
+                                                   : support::cpp11::to_string(vec_size)));
     build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size));
     build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
-    if(is_quantized && (dst->data_type() != DataType::S32))
+    if (is_quantized && (dst->data_type() != DataType::S32))
     {
         const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
         const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
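
The quantized path above hands the two input scales and the output scale to the kernel. The arithmetic behind it is the standard requantized multiply; a sketch under that assumption (the exact constants ACL emits as build options may be factored differently):

    // q_dst = round((q1 - z1) * (q2 - z2) * s_eff) + z_dst, where s_eff
    // folds all three scales into a single multiplier.
    struct QParams
    {
        float scale;
        int   offset;
    };

    float effective_scale(const QParams &in1, const QParams &in2, const QParams &out)
    {
        return (in1.scale * in2.scale) / out.scale;
    }
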
@@ -200,12 +219,14 @@
     else
     {
         kernel_name += (scale_int >= 0) ? "_int" : "_float";
-        build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), "-DWRAP", "-DSATURATE");
+        build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()),
+                                      "-DWRAP", "-DSATURATE");
         build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte");
         build_opts.add_option("-DACC_DATA_TYPE=" + acc_type);
-        if(act_info.enabled())
+        if (act_info.enabled())
         {
-            build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
+            build_opts.add_option("-DACTIVATION_TYPE=" +
+                                  lower_string(string_from_activation_func(act_info.activation())));
             build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
             build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
         }
@@ -223,7 +244,7 @@
     // Set scale argument
     unsigned int idx = (in_place ? 2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
 
-    if(scale_int >= 0 && !is_quantized)
+    if (scale_int >= 0 && !is_quantized)
     {
         _kernel.setArg(idx++, scale_int);
     }
@@ -261,8 +282,13 @@
     _config_id += support::cpp11::to_string(dst->dimension(2));
 }
 
-Status ClMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
-                             ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+Status ClMulKernel::validate(const ITensorInfo         *src1,
+                             const ITensorInfo         *src2,
+                             const ITensorInfo         *dst,
+                             float                      scale,
+                             ConvertPolicy              overflow_policy,
+                             RoundingPolicy             rounding_policy,
+                             const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info));
@@ -275,9 +301,11 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    auto       dst   = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src_0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src_1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst);
 
@@ -286,17 +314,18 @@
     const TensorShape &out_shape = dst->info()->tensor_shape();
 
     bool can_collapse = true;
-    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+    if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
     {
         can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+        for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
         {
             can_collapse = (in_shape1[d] == in_shape2[d]);
         }
     }
 
     bool   has_collapsed = false;
-    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+    Window collapsed =
+        can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
 
     const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
     const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -312,7 +341,7 @@
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, src_0, slice_input1);
         add_3D_tensor_argument(idx, src_1, slice_input2);
-        if(!in_place)
+        if (!in_place)
         {
             add_3D_tensor_argument(idx, dst, slice);
         }
@@ -320,15 +349,17 @@
 
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 
 namespace
 {
 constexpr unsigned int vec_size_complex = 1;
 
-Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status validate_arguments_complex(const ITensorInfo         *src1,
+                                  const ITensorInfo         *src2,
+                                  const ITensorInfo         *dst,
+                                  const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32);
@@ -340,11 +371,12 @@
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type()));
 
     // Validate in case of configured dst
-    if(dst->total_size() > 0)
+    if (dst->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+                                        "Wrong shape for dst");
     }
 
     return Status{};
@@ -356,19 +388,23 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClComplexMulKernel::configure(const CLCompileContext    &compile_context,
+                                   ITensorInfo               *src1,
+                                   ITensorInfo               *src2,
+                                   ITensorInfo               *dst,
+                                   const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info));
 
-    auto padding_info = get_padding_info({ src1, src2, dst });
+    auto padding_info = get_padding_info({src1, src2, dst});
 
     const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
     auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
 
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
-    if(act_info.enabled())
+    if (act_info.enabled())
     {
         build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation())));
         build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
@@ -384,7 +420,10 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClComplexMulKernel::validate(const ITensorInfo         *src1,
+                                    const ITensorInfo         *src2,
+                                    const ITensorInfo         *dst,
+                                    const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info));
@@ -397,26 +436,29 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    auto       dst   = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src_0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src_1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     const TensorShape &in_shape1 = src_0->info()->tensor_shape();
     const TensorShape &in_shape2 = src_1->info()->tensor_shape();
     const TensorShape &out_shape = dst->info()->tensor_shape();
 
     bool can_collapse = true;
-    if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
+    if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1)
     {
         can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
-        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
+        for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d)
         {
             can_collapse = (in_shape1[d] == in_shape2[d]);
         }
     }
 
     bool   has_collapsed = false;
-    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+    Window collapsed =
+        can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
 
     const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
     const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
@@ -435,8 +477,7 @@
 
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1));
         ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2));
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClMulKernel.h b/src/gpu/cl/kernels/ClMulKernel.h
index 4e62a6d..76a3ce0 100644
--- a/src/gpu/cl/kernels/ClMulKernel.h
+++ b/src/gpu/cl/kernels/ClMulKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_MUL_KERNEL_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -72,16 +73,27 @@
      * @param[in]  rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
-                   ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   float                      scale,
+                   ConvertPolicy              overflow_policy,
+                   RoundingPolicy             rounding_policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClMulKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
-                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           float                      scale,
+                           ConvertPolicy              overflow_policy,
+                           RoundingPolicy             rounding_policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -101,14 +113,21 @@
      * @param[out] dst             The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClComplexMulKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClPermuteKernel.cpp b/src/gpu/cl/kernels/ClPermuteKernel.cpp
index 8d46551..a475578 100644
--- a/src/gpu/cl/kernels/ClPermuteKernel.cpp
+++ b/src/gpu/cl/kernels/ClPermuteKernel.cpp
@@ -29,8 +29,9 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -60,13 +61,13 @@
                                     "Permutation up to 4-D src tensor is supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4,
                                     "Permutation vector size should be less than or equal to 4");
-    for(const auto &p : perm)
+    for (const auto &p : perm)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values");
     }
 
     // Validate configured dst
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
@@ -82,10 +83,13 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClPermuteKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
+void ClPermuteKernel::configure(const CLCompileContext  &compile_context,
+                                const ITensorInfo       *src,
+                                ITensorInfo             *dst,
+                                const PermutationVector &perm)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
-    auto              padding_info = get_padding_info({ src, dst });
+    auto              padding_info = get_padding_info({src, dst});
     const TensorShape dst_shape    = get_dst_shape(src, perm);
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
@@ -96,7 +100,8 @@
 
     // Create kernel
     CLBuildOptions build_opts;
-    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type())));
+    build_opts.add_option("-DDATA_TYPE=" +
+                          get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type())));
     build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(src->dimension(2)));
     // New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector
     build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0));
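
The -DP1 ... options above encode where each dimension lands after the permutation. A minimal sketch of applying such a vector, assuming the convention dst[i] = src[perm[i]] (the authoritative convention lives in arm_compute::permute; this helper is purely illustrative):

    #include <array>
    #include <cstddef>

    template <std::size_t N>
    std::array<int, N> apply_permutation(const std::array<int, N> &src, const std::array<std::size_t, N> &perm)
    {
        std::array<int, N> dst{};
        for (std::size_t i = 0; i < N; ++i)
        {
            dst[i] = src[perm[i]]; // assumed convention, see note above
        }
        return dst;
    }

    // e.g. src = {W, H, C} with perm = {2, 0, 1} yields dst = {C, W, H}
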
@@ -126,8 +131,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4);
 
@@ -144,9 +150,8 @@
         add_4D_tensor_argument(idx, src, slice_in);
         add_4D_tensor_argument(idx, dst, slice_out);
         enqueue(queue, *this, slice_in, lws_hint());
-    }
-    while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
+    } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out));
 }
 } // namespace kernels
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClPermuteKernel.h b/src/gpu/cl/kernels/ClPermuteKernel.h
index 0d349e7..2413b10 100644
--- a/src/gpu/cl/kernels/ClPermuteKernel.h
+++ b/src/gpu/cl/kernels/ClPermuteKernel.h
@@ -52,7 +52,10 @@
      * @param[in] dst             The dst tensor info. Data types supported: Same as @p src
      * @param[in] perm            Permutation vector
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       *src,
+                   ITensorInfo             *dst,
+                   const PermutationVector &perm);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClPermuteKernel::configure()
diff --git a/src/gpu/cl/kernels/ClPool2dKernel.cpp b/src/gpu/cl/kernels/ClPool2dKernel.cpp
index a1afc58..41ab4d6 100644
--- a/src/gpu/cl/kernels/ClPool2dKernel.cpp
+++ b/src/gpu/cl/kernels/ClPool2dKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -43,37 +44,47 @@
 
 namespace
 {
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status validate_arguments(const ITensorInfo      *src,
+                          const ITensorInfo      *dst,
+                          const PoolingLayerInfo &pool_info,
+                          const ITensorInfo      *indices)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2),
-                                    "Unsupported combination of parameters!");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2),
+        "Unsupported combination of parameters!");
 
-    const auto   data_layout       = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
-    const int    idx_width         = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int    idx_height        = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const bool   is_global_pooling = pool_info.is_global_pooling;
-    unsigned int pool_size_x       = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
-    unsigned int pool_size_y       = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
-    int          output_width      = 0;
-    int          output_height     = 0;
+    const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+    const int  idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int  idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const bool is_global_pooling = pool_info.is_global_pooling;
+    unsigned int pool_size_x     = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+    unsigned int pool_size_y     = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+    int          output_width    = 0;
+    int          output_height   = 0;
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info),
+                                    "Pooling region that is entirely outside input tensor is unsupported");
 
-    std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
-                                                                     pool_size_x, pool_size_y, pool_info.pad_stride_info);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid");
+    std::tie(output_width, output_height) =
+        scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size_x,
+                                 pool_size_y, pool_info.pad_stride_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1),
+                                    "Calculated output dimension size is invalid");
 
     // Check indices
-    if(indices)
+    if (indices)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX,
+                                        "Pooling indices only supported for MAX pooling method");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)),
+                                        "Pooling indices only supported for pool size 2x2");
 
-        if(indices->total_size() != 0)
+        if (indices->total_size() != 0)
         {
             TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32));
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info);
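
The output-size validation above reduces to the usual pooling arithmetic. A sketch with FLOOR rounding and explicit padding (names are illustrative; ACL's scaled_dimensions_signed additionally handles CEIL rounding):

    // Pooled extent of one spatial dimension (FLOOR rounding).
    int pooled_dim(int in, int pool, int stride, int pad_before, int pad_after)
    {
        return (in + pad_before + pad_after - pool) / stride + 1;
    }

    // e.g. pooled_dim(7, 2, 2, 0, 0) == 3: a 7-wide input with a 2x2 pool at
    // stride 2 gives 3 output columns; results < 1 are rejected by the check above.
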
@@ -81,7 +92,7 @@
     }
 
     // Checks performed when dst is configured
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
@@ -98,42 +109,47 @@
     _type = CLKernelType::POOL;
 }
 
-void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices)
+void ClPool2dKernel::configure(const ClCompileContext &compile_context,
+                               ITensorInfo            *src,
+                               ITensorInfo            *dst,
+                               const PoolingLayerInfo &pool_info,
+                               ITensorInfo            *indices)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices));
 
-    auto padding_info = get_padding_info({ src, dst, indices });
+    auto padding_info = get_padding_info({src, dst, indices});
 
     // Auto init if empty
     TensorShape out_shape = compute_pool_shape(*src, pool_info);
     auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape));
-    if(indices)
+    if (indices)
     {
         auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32));
     }
 
     // Set instance variables
-    _pool_info                         = pool_info;
-    _data_layout                       = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
-    _num_elems_processed_per_iteration = (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4);
+    _pool_info   = pool_info;
+    _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout;
+    _num_elems_processed_per_iteration =
+        (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4);
     _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0));
 
-    int                 pool_stride_x   = 0;
-    int                 pool_stride_y   = 0;
-    const PoolingType   pool_type       = pool_info.pool_type;
-    const int           idx_width       = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const int           idx_height      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-    const int           idx_channel     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-    const int           idx_batch_size  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
-    const int           pool_size_x     = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
-    const int           pool_size_y     = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
-    const PadStrideInfo pad_stride_info = pool_info.pad_stride_info;
-    const bool          exclude_padding = pool_info.exclude_padding;
+    int               pool_stride_x  = 0;
+    int               pool_stride_y  = 0;
+    const PoolingType pool_type      = pool_info.pool_type;
+    const int         idx_width      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int         idx_height     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int         idx_channel    = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const int         idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
+    const int         pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+    const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+    const PadStrideInfo pad_stride_info    = pool_info.pad_stride_info;
+    const bool          exclude_padding    = pool_info.exclude_padding;
     std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride();
-    const int      pool_pad_top  = pad_stride_info.pad_top();
-    const int      pool_pad_left = pad_stride_info.pad_left();
-    const DataType data_type     = src->data_type();
+    const int      pool_pad_top            = pad_stride_info.pad_top();
+    const int      pool_pad_left           = pad_stride_info.pad_left();
+    const DataType data_type               = src->data_type();
 
     // Set build options
     CLBuildOptions build_opts;
@@ -148,20 +164,23 @@
     build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y));
     build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width)));
     build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height)));
-    build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
-    build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
+    build_opts.add_option("-DMAX_WIDTH=" +
+                          support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left)));
+    build_opts.add_option("-DMAX_HEIGHT=" +
+                          support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top)));
 
     // Tensor paddings are used to calculate the indices for MAX pooling
-    if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type))
+    if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices &&
+        is_data_type_float(data_type))
     {
         build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(src->tensor_shape().total_size_lower(3)));
     }
 
-    if(is_data_type_quantized_asymmetric(data_type))
+    if (is_data_type_quantized_asymmetric(data_type))
     {
         build_opts.add_option("-DQUANTIZED");
 
-        if(src->quantization_info() != dst->quantization_info())
+        if (src->quantization_info() != dst->quantization_info())
         {
             const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
             const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
@@ -174,9 +193,9 @@
     }
 
     // Set the initial value for the pooling operation according to the data type
-    if(pool_type == PoolingType::MAX)
+    if (pool_type == PoolingType::MAX)
     {
-        if(is_data_type_quantized(data_type))
+        if (is_data_type_quantized(data_type))
         {
             PixelValue type_min{};
             std::tie(type_min, std::ignore) = get_min_max(data_type);
@@ -184,7 +203,9 @@
         }
         else
         {
-            std::string initial_value = pool_info.use_inf_as_limit ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
+            std::string initial_value = pool_info.use_inf_as_limit
+                                            ? "(-INFINITY)"
+                                            : float_to_string_with_full_precision(std::numeric_limits<float>::lowest());
             build_opts.add_option("-DINITIAL_VALUE=" + initial_value);
         }
     }
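
The branch above seeds the running maximum with an identity element. A minimal sketch of the same float-path choice (assumption: the quantized path instead uses the type minimum from get_min_max, as shown above):

    #include <limits>

    // The running max must start at or below any representable input value.
    float initial_max(bool use_inf_as_limit)
    {
        return use_inf_as_limit ? -std::numeric_limits<float>::infinity()
                                : std::numeric_limits<float>::lowest();
    }
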
@@ -195,22 +216,25 @@
     }
 
     // Create kernel
-    switch(_data_layout)
+    switch (_data_layout)
     {
         case DataLayout::NCHW:
         {
             const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision;
             const auto use_wider_accumulator  = use_fp_mixed_precision && (pool_type != PoolingType::MAX);
-            const auto acc_data_type          = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : (is_data_type_quantized(data_type) ? DataType::S32 : data_type));
+            const auto acc_data_type          = get_cl_type_from_data_type(
+                         use_wider_accumulator ? DataType::F32
+                                               : (is_data_type_quantized(data_type) ? DataType::S32 : data_type));
             build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type);
             build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION");
 
-            if(pool_type != PoolingType::MAX)
+            if (pool_type != PoolingType::MAX)
             {
                 build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
             }
 
-            if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type))
+            if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices &&
+                is_data_type_float(data_type))
             {
                 // For max pooling with pool2x2, store indicies which will be used in max unpooling
                 std::string kernel_name = "pooling_layer_2_nchw_indices";
@@ -226,18 +250,19 @@
         case DataLayout::NHWC:
         {
             // Floating point mixed precision is supported on F16 only
-            const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
+            const auto use_fp_mixed_precision =
+                (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
 
             // Wider accumulation is required to avoid accuracy loss
             // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation)
             // Case 2: Quantized (int8/uint8 src data and int32 accumulation)
             DataType acc_data_type = data_type;
 
-            if(use_fp_mixed_precision)
+            if (use_fp_mixed_precision)
             {
                 acc_data_type = DataType::F32;
             }
-            else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX)
+            else if (is_data_type_quantized(data_type) && pool_type != PoolingType::MAX)
             {
                 acc_data_type = DataType::S32;
             }
@@ -250,8 +275,9 @@
             build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height)));
             build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel)));
             build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size)));
-            build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
-            if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type))
+            build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                                  support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
+            if (pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type))
             {
                 build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX");
 
@@ -260,7 +286,9 @@
             }
             else
             {
-                std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc";
+                std::string kernel_name = is_data_type_quantized_asymmetric(data_type)
+                                              ? "pooling_layer_MxN_quantized_nhwc"
+                                              : "pooling_layer_MxN_nhwc";
                 _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
             }
             break;
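
The two numbered cases in the comment above amount to a small decision rule. A hedged sketch mirroring it with a self-contained enum (illustrative, not ACL API; MAX pooling never sums, so it keeps the input type):

    enum class DT { F16, F32, Q8, S32 };

    DT accumulator_type(DT data_type, bool fp_mixed_precision, bool is_max_pool)
    {
        if (is_max_pool)
        {
            return data_type; // no summation -> no widening needed
        }
        if (fp_mixed_precision && data_type == DT::F16)
        {
            return DT::F32; // Case 1: fp16 data, fp32 accumulation
        }
        if (data_type == DT::Q8)
        {
            return DT::S32; // Case 2: int8/uint8 data, int32 accumulation
        }
        return data_type;
    }
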
@@ -290,7 +318,10 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status ClPool2dKernel::validate(const ITensorInfo      *src,
+                                const ITensorInfo      *dst,
+                                const PoolingLayerInfo &pool_info,
+                                const ITensorInfo      *indices)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices));
     return Status{};
@@ -301,18 +332,19 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    unsigned int pool_stride_x = 0;
-    unsigned int pool_stride_y = 0;
+    unsigned int pool_stride_x             = 0;
+    unsigned int pool_stride_y             = 0;
     std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride();
 
-    const auto src     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst     = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
-    auto       indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst     = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
+    auto indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1));
 
     // Collapse window
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
 
-    switch(_data_layout)
+    switch (_data_layout)
     {
         case DataLayout::NCHW:
         {
@@ -323,13 +355,12 @@
                 unsigned int idx = 0;
                 add_3D_tensor_argument(idx, src, slice);
                 add_3D_tensor_argument(idx, dst, slice);
-                if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
+                if (indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2)))
                 {
                     add_3D_tensor_argument(idx, indices, slice);
                 }
                 enqueue(queue, *this, slice, lws_hint());
-            }
-            while(window_collapsed.slide_window_slice_3D(slice));
+            } while (window_collapsed.slide_window_slice_3D(slice));
             break;
         }
         case DataLayout::NHWC:
@@ -338,7 +369,8 @@
 
             Window slice    = window_collapsed.first_slice_window_4D();
             Window in_slice = window_collapsed.first_slice_window_4D();
-            in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration));
+            in_slice.set(Window::DimX,
+                         Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration));
             in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x));
             in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y));
             in_slice.set(3, Window::Dimension(0, batch_size, 1));
@@ -348,13 +380,13 @@
                 unsigned int idx = 0;
                 add_4D_tensor_argument(idx, src, in_slice);
                 add_4D_tensor_argument(idx, dst, slice);
-                if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
+                if (indices && is_data_type_float(src->info()->data_type()) &&
+                    (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2)))
                 {
                     add_4D_tensor_argument(idx, indices, slice);
                 }
                 enqueue(queue, *this, slice, lws_hint());
-            }
-            while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
+            } while (window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice));
             break;
         }
         default:
diff --git a/src/gpu/cl/kernels/ClPool2dKernel.h b/src/gpu/cl/kernels/ClPool2dKernel.h
index f5bb068..56b95a3 100644
--- a/src/gpu/cl/kernels/ClPool2dKernel.h
+++ b/src/gpu/cl/kernels/ClPool2dKernel.h
@@ -50,22 +50,29 @@
      * @param[in]  pool_info       Contains pooling operation information described in @ref PoolingLayerInfo.
      * @param[out] indices         (optional) The indices of the maximal values. Data type supported: U32.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr);
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   const PoolingLayerInfo &pool_info,
+                   ITensorInfo            *indices = nullptr);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClPool2dKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr);
+    static Status validate(const ITensorInfo      *src,
+                           const ITensorInfo      *dst,
+                           const PoolingLayerInfo &pool_info,
+                           const ITensorInfo      *indices = nullptr);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 public:
     PoolingLayerInfo _pool_info{};
-    DataLayout       _data_layout{ DataLayout::UNKNOWN };
-    unsigned int     _num_elems_processed_per_iteration{ 1 };
+    DataLayout       _data_layout{DataLayout::UNKNOWN};
+    unsigned int     _num_elems_processed_per_iteration{1};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClPool3dKernel.cpp b/src/gpu/cl/kernels/ClPool3dKernel.cpp
index d068832..a08c5d4 100644
--- a/src/gpu/cl/kernels/ClPool3dKernel.cpp
+++ b/src/gpu/cl/kernels/ClPool3dKernel.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -50,10 +51,13 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
 
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0), "Strides cannot be zero.");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding
-                                                                                && (pool_info.pool_type == PoolingType::AVG)),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        (pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0),
+        "Strides cannot be zero.");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED,
+                                                         DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) &&
+                                        (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)),
                                     "Exclude padding is unsupported for non-float types for Avg op");
 
     const auto         data_layout       = src->data_layout();
@@ -68,17 +72,21 @@
     int                output_height     = 0;
     int                output_depth      = 0;
 
-    bool round_type_ceil_with_asymm_padding = (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding));
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding, "Cannot use dimension round type CEIL when padding is asymmetric.");
+    bool round_type_ceil_with_asymm_padding =
+        (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding,
+                                    "Cannot use dimension round type CEIL when padding is asymmetric.");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported");
-    std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
-                                                                                      src->tensor_shape()[idx_depth], pool_size_x, pool_size_y,
-                                                                                      pool_size_z, pool_info);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info),
+                                    "Pooling region that is entirely outside input tensor is unsupported");
+    std::tie(output_width, output_height, output_depth) =
+        scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height],
+                                    src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1),
+                                    "Calculated output dimension size is invalid");
     // Checks performed when dst is configured
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
@@ -95,11 +103,14 @@
     _type = CLKernelType::POOL;
 }
 
-void ClPool3dKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+void ClPool3dKernel::configure(const ClCompileContext   &compile_context,
+                               const ITensorInfo        *src,
+                               ITensorInfo              *dst,
+                               const Pooling3dLayerInfo &pool_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info));
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     // Auto init if empty
     TensorShape out_shape = compute_pool3d_shape(src->tensor_shape(), pool_info);
@@ -112,23 +123,23 @@
     _num_elems_processed_per_iteration = (dst->data_type() == DataType::F32) ? 2 : 4;
     _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0));
 
-    const PoolingType pool_type       = pool_info.pool_type;
-    const int         idx_width       = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const int         idx_height      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-    const int         idx_depth       = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH);
-    const int         idx_channel     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
-    const int         idx_batch_size  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
-    const int         pool_size_x     = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
-    const int         pool_size_y     = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
-    const int         pool_size_z     = pool_info.is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
-    const bool        exclude_padding = pool_info.exclude_padding;
-    const int         pool_stride_x   = pool_info.stride.x();
-    const int         pool_stride_y   = pool_info.stride.y();
-    const int         pool_stride_z   = pool_info.stride.z();
-    const int         pool_pad_top    = pool_info.padding.top;
-    const int         pool_pad_left   = pool_info.padding.left;
-    const int         pool_pad_front  = pool_info.padding.front;
-    const DataType    data_type       = src->data_type();
+    const PoolingType pool_type      = pool_info.pool_type;
+    const int         idx_width      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const int         idx_height     = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const int         idx_depth      = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH);
+    const int         idx_channel    = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
+    const int         idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES);
+    const int         pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+    const int      pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+    const int      pool_size_z = pool_info.is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
+    const bool     exclude_padding = pool_info.exclude_padding;
+    const int      pool_stride_x   = pool_info.stride.x();
+    const int      pool_stride_y   = pool_info.stride.y();
+    const int      pool_stride_z   = pool_info.stride.z();
+    const int      pool_pad_top    = pool_info.padding.top;
+    const int      pool_pad_left   = pool_info.padding.left;
+    const int      pool_pad_front  = pool_info.padding.front;
+    const DataType data_type       = src->data_type();
 
     // Set build options
     CLBuildOptions build_opts;
@@ -149,7 +160,7 @@
     build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(idx_depth)));
 
     // If datatype is quantized add relevant parameters
-    if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info())
+    if (is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info())
     {
         const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
         const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
@@ -161,9 +172,9 @@
     }
 
     // Set the initial value for the pooling operation accordingly with the data type
-    if(pool_type == PoolingType::MAX)
+    if (pool_type == PoolingType::MAX)
     {
-        if(is_data_type_quantized(data_type))
+        if (is_data_type_quantized(data_type))
         {
             PixelValue type_min{};
             std::tie(type_min, std::ignore) = get_min_max(data_type);
@@ -171,7 +182,8 @@
         }
         else
         {
-            build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits<float>::lowest()));
+            build_opts.add_option("-DINITIAL_VALUE=" +
+                                  float_to_string_with_full_precision(std::numeric_limits<float>::lowest()));
         }
     }
     else
@@ -181,16 +193,18 @@
     }
     // Create kernel
     // Floating point mixed precision is supported on F16 only
-    const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
+    const auto use_fp_mixed_precision =
+        (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX;
 
     // Wider accumulation is required to avoid accuracy loss
     // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation)
     DataType acc_data_type = data_type;
-    if(use_fp_mixed_precision)
+    if (use_fp_mixed_precision)
     {
         acc_data_type = DataType::F32;
     }
-    else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division
+    else if (is_data_type_quantized(data_type) &&
+             pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division
     {
         acc_data_type = DataType::S32;
     }
@@ -202,11 +216,13 @@
     build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(dst->dimension(idx_depth)));
     build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel)));
     build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size)));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration));
 
     // If datatype is quantized, use quantized kernel function
-    std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized" : "pooling_3d_layer_MxN_ndhwc");
-    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+    std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized"
+                                                                            : "pooling_3d_layer_MxN_ndhwc");
+    _kernel                 = create_kernel(compile_context, kernel_name, build_opts.options());
 
     // Configure kernel window
     Window win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration));
@@ -240,8 +256,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0));
 
     // Collapse 3D window
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
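
The accumulator selection that the ClPool3dKernel hunk above reflows is worth spelling out: fp16 inputs under mixed precision accumulate in fp32, and quantized average pooling accumulates in S32 so the division stays integral. Below is a minimal sketch of that decision, using stand-in enums rather than the library's DataType/PoolingType:

    #include <cassert>

    enum class DT { F16, F32, QASYMM8, S32 };
    enum class Pool { MAX, AVG, L2 };

    static DT select_acc_type(DT dt, Pool pool, bool fp_mixed_precision)
    {
        // Mixed precision only applies to F16 and never to MAX (no accumulation there).
        const bool use_fp_mixed = (dt == DT::F16) && fp_mixed_precision && pool != Pool::MAX;
        if (use_fp_mixed)
            return DT::F32; // fp16 data, fp32 accumulation
        if (dt == DT::QASYMM8 && pool != Pool::MAX)
            return DT::S32; // integer division for average pooling
        return dt;
    }

    int main()
    {
        assert(select_acc_type(DT::F16, Pool::AVG, true) == DT::F32);
        assert(select_acc_type(DT::QASYMM8, Pool::AVG, false) == DT::S32);
        assert(select_acc_type(DT::F16, Pool::MAX, true) == DT::F16);
    }
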
diff --git a/src/gpu/cl/kernels/ClPool3dKernel.h b/src/gpu/cl/kernels/ClPool3dKernel.h
index 0085234..6cd229c 100644
--- a/src/gpu/cl/kernels/ClPool3dKernel.h
+++ b/src/gpu/cl/kernels/ClPool3dKernel.h
@@ -50,7 +50,10 @@
      * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
      * @param[in]  pool_info       Contains pooling operation information described in @ref Pooling3dLayerInfo.
      */
-    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+    void configure(const ClCompileContext   &compile_context,
+                   const ITensorInfo        *src,
+                   ITensorInfo              *dst,
+                   const Pooling3dLayerInfo &pool_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClPool3dKernel::configure()
@@ -64,8 +67,8 @@
 
 private:
     Pooling3dLayerInfo _pool_info{};
-    DataLayout         _data_layout{ DataLayout::UNKNOWN };
-    unsigned int       _num_elems_processed_per_iteration{ 1 };
+    DataLayout         _data_layout{DataLayout::UNKNOWN};
+    unsigned int       _num_elems_processed_per_iteration{1};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/gpu/cl/kernels/ClQuantizeKernel.cpp
index 5c8bf97..e8df420 100644
--- a/src/gpu/cl/kernels/ClQuantizeKernel.cpp
+++ b/src/gpu/cl/kernels/ClQuantizeKernel.cpp
@@ -29,13 +29,12 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
 
@@ -50,12 +49,14 @@
 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F32, DataType::F16);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
 
     // Output must always be initialized
     ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QASYMM16);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
 
     return Status{};
@@ -71,7 +72,7 @@
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
 
@@ -84,7 +85,7 @@
 
     float   scale_to_apply  = qinfo.scale;
     int32_t offset_to_apply = qinfo.offset;
-    if(is_data_type_quantized_asymmetric(src->data_type()))
+    if (is_data_type_quantized_asymmetric(src->data_type()))
     {
         /*
          * In case of requantization of a quantized input tensor to an output tensor with another quantization
@@ -132,8 +133,10 @@
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
     build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
     build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type));
-    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
-    std::pair<int, int> min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type);
+    build_opts.add_option_if(
+        multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0)));
+    std::pair<int, int> min_max_quant_values =
+        quantization::get_min_max_values_from_quantized_data_type(output_data_type);
     build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first));
     build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second));
 
@@ -141,9 +144,10 @@
 
     // Configure kernel window
     Window win = calculate_max_window(*src, Steps());
-    if(multi_access_x)
+    if (multi_access_x)
     {
-        win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+        win.set(Window::DimX,
+                Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
     }
     ICLKernel::configure_internal(win);
 
@@ -173,8 +177,7 @@
         add_3D_tensor_argument(idx, src, slice);
         add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
+    } while (window_collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
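
The truncated comment in the ClQuantizeKernel hunk above explains why scale_to_apply and offset_to_apply get adjusted for already-quantized inputs: instead of dequantizing and then re-quantizing, the two affine maps are folded into one. A sketch of that algebra, assuming the library's usual convention q = round(x / scale) + offset; the hunk itself does not show the final expressions, so this is a reconstruction, not a quote:

    #include <cmath>

    struct QInfo
    {
        float scale;
        int   offset;
    };

    // Dequantize: x = (q_in - z_i) * s_i. Quantize: q_out = x / s_o + z_o.
    // Substituting gives one affine map applied directly to the raw values:
    //   q_out = q_in / (s_o / s_i) + (z_o - z_i * s_i / s_o)
    QInfo fold_requantization(QInfo in, QInfo out)
    {
        const float scale_to_apply  = out.scale / in.scale;
        const float offset_to_apply = out.offset - in.offset * in.scale / out.scale;
        return {scale_to_apply, static_cast<int>(std::lround(offset_to_apply))};
    }
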
diff --git a/src/gpu/cl/kernels/ClReshapeKernel.cpp b/src/gpu/cl/kernels/ClReshapeKernel.cpp
index 121bb33..53889f3 100644
--- a/src/gpu/cl/kernels/ClReshapeKernel.cpp
+++ b/src/gpu/cl/kernels/ClReshapeKernel.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
@@ -51,7 +52,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
     ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
 
-    if(dst->tensor_shape().total_size() != 0)
+    if (dst->tensor_shape().total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
@@ -72,27 +73,17 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     // Create kernel
-    std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()) };
+    std::set<std::string> build_opts = {"-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())};
     _kernel                          = create_kernel(compile_context, "reshape_layer", build_opts);
 
     // Add static arguments
-    const cl_int2 src_shape =
-    {
-        {
-            static_cast<cl_int>(src->tensor_shape()[0]),
-            static_cast<cl_int>(src->tensor_shape()[1])
-        }
-    };
-    const cl_int2 dst_shape =
-    {
-        {
-            static_cast<cl_int>(dst->tensor_shape()[0]),
-            static_cast<cl_int>(dst->tensor_shape()[1])
-        }
-    };
+    const cl_int2 src_shape = {
+        {static_cast<cl_int>(src->tensor_shape()[0]), static_cast<cl_int>(src->tensor_shape()[1])}};
+    const cl_int2 dst_shape = {
+        {static_cast<cl_int>(dst->tensor_shape()[0]), static_cast<cl_int>(dst->tensor_shape()[1])}};
     unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
     _kernel.setArg<cl_int2>(idx++, src_shape);
     _kernel.setArg<cl_int2>(idx++, dst_shape);
@@ -119,8 +110,9 @@
     Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice            = window_collapsed.first_slice_window_3D();
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     // Set srcs
     unsigned int idx = 0;
diff --git a/src/gpu/cl/kernels/ClReshapeKernel.h b/src/gpu/cl/kernels/ClReshapeKernel.h
index db6ab5d..95eae82 100644
--- a/src/gpu/cl/kernels/ClReshapeKernel.h
+++ b/src/gpu/cl/kernels/ClReshapeKernel.h
@@ -58,7 +58,7 @@
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 };
-} // namespace opencl
 } // namespace kernels
+} // namespace opencl
 } // namespace arm_compute
 #endif /* ARM_COMPUTE_CL_RESHAPE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp
index 4c4373a..4305aca 100644
--- a/src/gpu/cl/kernels/ClScaleKernel.cpp
+++ b/src/gpu/cl/kernels/ClScaleKernel.cpp
@@ -27,8 +27,9 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
-#include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/utils/InterpolationPolicyUtils.h"
+#include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -43,7 +44,8 @@
 {
 namespace
 {
-inline std::tuple<float, float> calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners)
+inline std::tuple<float, float>
+calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners)
 {
     const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
@@ -64,20 +66,25 @@
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::U8, DataType::S16, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(dst == src);
-    ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels()!=1);
-    ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
-    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && !is_data_type_quantized_asymmetric(src->data_type()));
+    ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        info.align_corners &&
+        !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy));
+    ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) &&
+                                !is_data_type_quantized_asymmetric(src->data_type()));
 
     float            scale_x     = 0.f;
     float            scale_y     = 0.f;
     const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
-    std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners);
+    std::tie(scale_x, scale_y)   = calculate_scale_factors(src, dst, data_layout, info.align_corners);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (scale_x > 1.f || scale_y > 1.f));
+    ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA &&
+                                (scale_x > 1.f || scale_y > 1.f));
 
     return Status{};
 }
@@ -94,23 +101,26 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
+void ClScaleKernel::configure(const CLCompileContext &compile_context,
+                              ITensorInfo            *src,
+                              ITensorInfo            *dst,
+                              const ScaleKernelInfo  &info)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, info));
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     // Info required for the static tuning
     _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout;
 
     const bool is_nhwc = _data_layout == DataLayout::NHWC;
 
-    float scale_x = 0.f;
-    float scale_y = 0.f;
+    float scale_x              = 0.f;
+    float scale_y              = 0.f;
     std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners);
 
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
     auto interpolation_policy_to_use = info.interpolation_policy;
-    if(info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f)
+    if (info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f)
     {
         interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR;
     }
@@ -127,7 +137,7 @@
     unsigned int       vec_size_leftover = 0;
 
     CLBuildOptions build_opts;
-    if(_data_layout == DataLayout::NHWC)
+    if (_data_layout == DataLayout::NHWC)
     {
         vec_size          = adjust_vec_size(src->data_type() == DataType::F32 ? 4 : 8, dst_channels);
         vec_size_leftover = dst_channels % vec_size;
@@ -135,7 +145,8 @@
         build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
         build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER");
         build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type()));
-        build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
+        build_opts.add_option("-DCONSTANT_VALUE=" +
+                              string_from_pixel_value(info.constant_border_value, src->data_type()));
         build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size));
         build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover));
         build_opts.add_option("-DSCALE_" + string_from_interpolation_policy(interpolation_policy_to_use));
@@ -144,27 +155,33 @@
         build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
         build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
         build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOATING_POINT");
-        build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+        build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER",
+                                      "-DSAMPLING_POLICY_TOP_LEFT");
     }
-    else if(_data_layout == DataLayout::NCHW)
+    else if (_data_layout == DataLayout::NCHW)
     {
         vec_size          = adjust_vec_size(4, dst_width);
         vec_size_leftover = dst_width % vec_size;
         build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
-        build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type()));
+        build_opts.add_option("-DCONSTANT_VALUE=" +
+                              string_from_pixel_value(info.constant_border_value, src->data_type()));
         build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width));
         build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height));
         build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x));
         build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y));
         build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
-        build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover)));
+        build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0)
+                                                            ? support::cpp11::to_string(vec_size)
+                                                            : support::cpp11::to_string(vec_size_leftover)));
         build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE");
         build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT");
         build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS");
-        build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT");
+        build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER",
+                                      "-DSAMPLING_POLICY_TOP_LEFT");
 
-        const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR;
-        if(is_qasymm_bilinear)
+        const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) &&
+                                        info.interpolation_policy == InterpolationPolicy::BILINEAR;
+        if (is_qasymm_bilinear)
         {
             const UniformQuantizationInfo qinfo = src->quantization_info().uniform();
             build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale));
@@ -190,7 +207,7 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 
     // Pass scale kernel arguments
-    if(is_nhwc)
+    if (is_nhwc)
     {
         unsigned int idx = 2 * num_arguments_per_4d_tensor_nhwc();
         _kernel.setArg<cl_float>(idx++, scale_x);
@@ -219,7 +236,7 @@
     auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
     auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
-    switch(_data_layout)
+    switch (_data_layout)
     {
         case DataLayout::NCHW:
         {
@@ -231,8 +248,7 @@
                 add_2D_tensor_argument(idx, src, slice);
                 add_2D_tensor_argument(idx, dst, slice);
                 enqueue(queue, *this, slice, lws_hint());
-            }
-            while(window.slide_window_slice_2D(slice));
+            } while (window.slide_window_slice_2D(slice));
             break;
         }
         case DataLayout::NHWC:
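
The ClScaleKernel hunks above carry a small piece of semantics alongside the reformatting: validate() rejects AREA whenever either scale factor exceeds 1, and configure() degrades AREA to nearest neighbour otherwise, so AREA only ever runs in the case where the two are equivalent. A sketch of the combined policy, assuming the scale factors are computed as src_dim / dst_dim (so values <= 1 mean up-sampling):

    #include <stdexcept>

    enum class Interp { NEAREST_NEIGHBOR, BILINEAR, AREA };

    Interp effective_policy(Interp requested, float scale_x, float scale_y)
    {
        if (requested == Interp::AREA)
        {
            if (scale_x > 1.f || scale_y > 1.f)
                throw std::invalid_argument("AREA with down-sampling is rejected by validate()");
            return Interp::NEAREST_NEIGHBOR; // up-sampling: AREA behaves as nearest neighbour
        }
        return requested;
    }
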
diff --git a/src/gpu/cl/kernels/ClScaleKernel.h b/src/gpu/cl/kernels/ClScaleKernel.h
index dd09e92..c096590 100644
--- a/src/gpu/cl/kernels/ClScaleKernel.h
+++ b/src/gpu/cl/kernels/ClScaleKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_SCALE_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -49,7 +50,8 @@
      *                             All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]  info            @ref ScaleKernelInfo Kernel descriptor to be used to configure.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClScaleKernel::configure()
@@ -62,7 +64,7 @@
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    DataLayout _data_layout{ DataLayout::UNKNOWN };
+    DataLayout _data_layout{DataLayout::UNKNOWN};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
index 59299fa..1b5a266 100644
--- a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
+++ b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
@@ -22,12 +22,14 @@
  * SOFTWARE.
  */
 #include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -60,15 +62,16 @@
     // Number of integer bits used in temporary fixed-point representation of exponent accumulator
     static const int exp_accumulation_in_bits = 12;
 
-    const double beta_multiplier = std::min(
-                                       1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)),
-                                       (1LL << 31) - 1.0);
+    const double beta_multiplier =
+        std::min(1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), (1LL << 31) - 1.0);
     int input_beta_multiplier;
     int input_beta_left_shift;
-    quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift);
+    quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier,
+                                                                  &input_beta_left_shift);
 
-    const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
-    const int    diff_min           = -1.f * std::floor(max_input_rescaled);
+    const double max_input_rescaled =
+        1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
+    const int diff_min = -1.f * std::floor(max_input_rescaled);
 
     CLBuildOptions build_opts;
     build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits));
@@ -80,18 +83,22 @@
     return build_opts;
 }
 
-Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum)
+Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src,
+                                           const ITensorInfo &max,
+                                           const ITensorInfo &dst,
+                                           const ITensorInfo &sum)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max);
 
     const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type());
 
     // Checks performed when output is configured
-    if(dst.total_size() != 0)
+    if (dst.total_size() != 0)
     {
-        if(is_quantized_asymmetric)
+        if (is_quantized_asymmetric)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::S32);
         }
@@ -103,9 +110,9 @@
     }
 
     // Checks performed when sum is configured
-    if(sum.total_size() != 0)
+    if (sum.total_size() != 0)
     {
-        if(is_quantized_asymmetric)
+        if (is_quantized_asymmetric)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&sum, 1, DataType::S32);
         }
@@ -119,7 +126,10 @@
     return Status{};
 }
 
-Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
+Status validate_arguments_1DNorm(const ITensorInfo       &src,
+                                 const ITensorInfo       &sum,
+                                 const ITensorInfo       &dst,
+                                 const SoftmaxKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::S32, DataType::F16, DataType::F32);
@@ -127,14 +137,15 @@
     ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && !is_data_type_float(info.input_data_type));
 
     // Note: output should always have a scale of 1/256 and offset 0
-    const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
-    const bool             is_quantized_asymmetric   = is_data_type_quantized_asymmetric(info.input_data_type);
+    const QuantizationInfo allowed_quantization_info =
+        get_softmax_output_quantization_info(info.input_data_type, info.is_log);
+    const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
 
     // Checks performed when output is configured
-    if(dst.total_size() != 0)
+    if (dst.total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
-        if(!is_quantized_asymmetric)
+        if (!is_quantized_asymmetric)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
         }
@@ -161,9 +172,14 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info)
+void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext  &compile_context,
+                                               const ITensorInfo       &src,
+                                               ITensorInfo             &max,
+                                               ITensorInfo             &dst,
+                                               ITensorInfo             &sum,
+                                               const SoftmaxKernelInfo &info)
 {
-    auto padding_info = get_padding_info({ &src, &max, &dst, &sum });
+    auto padding_info = get_padding_info({&src, &max, &dst, &sum});
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(sum, src.clone()->set_tensor_shape(max.tensor_shape()));
@@ -191,15 +207,21 @@
     build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
     build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");
     build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED");
-    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
+    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f),
+                             "-DBETA=" + float_to_string_with_full_precision(beta));
     build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX");
-    build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX")));
-    build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
-    build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DBETA=" + float_to_string_with_full_precision(beta));
-    build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
+    build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX")
+                                                                                          : std::string("-FLT_MAX")));
+    build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
+                             "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
+    build_opts.add_option_if(is_data_type_quantized_asymmetric(dt),
+                             "-DBETA=" + float_to_string_with_full_precision(beta));
+    build_opts.add_options_if(is_data_type_quantized_asymmetric(dt),
+                              prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
 
     cl::NDRange lws_hint(cl::NullRange);
-    std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "") + "serial";
+    std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") +
+                              (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "") + "serial";
 
     // Create kernel.
     _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
@@ -211,7 +233,10 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum)
+Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src,
+                                                const ITensorInfo &max,
+                                                const ITensorInfo &dst,
+                                                const ITensorInfo &sum)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum));
     return Status{};
@@ -241,7 +266,7 @@
 
     // Reconfigure window in case of parallel reduction
     ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(src->info()->dimension(0));
-    if(std::get<0>(parallel_reduction_info))
+    if (std::get<0>(parallel_reduction_info))
     {
         // Launch grid_size parallel work items
         window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1));
@@ -258,8 +283,7 @@
         add_3D_tensor_argument(idx, dst, slice);
         add_3D_tensor_argument(idx, sum, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
+    } while (window_collapsed.slide_window_slice_3D(slice));
 }
 
 ClLogits1DNormKernel::ClLogits1DNormKernel()
@@ -267,18 +291,24 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info)
+void ClLogits1DNormKernel::configure(const CLCompileContext  &compile_context,
+                                     const ITensorInfo       &src,
+                                     const ITensorInfo       &sum,
+                                     ITensorInfo             &dst,
+                                     const SoftmaxKernelInfo &info)
 {
-    auto padding_info = get_padding_info({ &src, &dst, &sum });
+    auto padding_info = get_padding_info({&src, &dst, &sum});
 
     // Note: output should always have a scale of 1/256 and offset 0
-    const bool                    is_quantized_asymmetric   = is_data_type_quantized_asymmetric(info.input_data_type);
-    const DataType                output_data_type          = info.input_data_type;
-    const QuantizationInfo        allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log);
-    const UniformQuantizationInfo qinfo                     = src.quantization_info().uniform();
+    const bool             is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type);
+    const DataType         output_data_type        = info.input_data_type;
+    const QuantizationInfo allowed_quantization_info =
+        get_softmax_output_quantization_info(info.input_data_type, info.is_log);
+    const UniformQuantizationInfo qinfo = src.quantization_info().uniform();
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(dst, src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));
+    auto_init_if_empty(dst,
+                       src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info));
 
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(src, sum, dst, info));
@@ -311,7 +341,10 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClLogits1DNormKernel::validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
+Status ClLogits1DNormKernel::validate(const ITensorInfo       &src,
+                                      const ITensorInfo       &sum,
+                                      const ITensorInfo       &dst,
+                                      const SoftmaxKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(src, sum, dst, info));
 
@@ -343,9 +376,8 @@
         add_3D_tensor_argument(idx, sum, sum_slice);
         add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window_collapsed.slide_window_slice_3D(slice));
+    } while (window_collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
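
The prepare_quantized_softmax_build_options hunk above is the one place in this file where the reflowed lines carry real arithmetic, so it is worth restating runnably. The sketch below mirrors the two visible formulas; scaled_diff_int_bits is declared just above the hunk and its value is not shown, and input_beta_left_shift really comes from calculate_quantized_multiplier_greater_than_one(), so both values are assumed here purely for illustration:

    #include <algorithm>
    #include <cmath>

    struct QSoftmaxParams
    {
        double beta_multiplier;
        int    diff_min;
    };

    QSoftmaxParams quantized_softmax_params(float input_scale, float beta)
    {
        const int scaled_diff_int_bits  = 5; // assumed; declared above the hunk
        const int input_beta_left_shift = 5; // assumed; normally a computed output

        // beta * input_scale promoted to a Q(31 - scaled_diff_int_bits) multiplier,
        // clamped so it still fits in a signed 32-bit value.
        const double beta_multiplier =
            std::min(1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), (1LL << 31) - 1.0);

        // Largest representable scaled difference from the row maximum; diff_min
        // is its negation, handed to the kernel as a compile-time constant.
        const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) *
                                          (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift);
        const int diff_min = -1 * static_cast<int>(std::floor(max_input_rescaled));

        return {beta_multiplier, diff_min};
    }
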
diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.h b/src/gpu/cl/kernels/ClSoftmaxKernel.h
index a221e12..2dd53da 100644
--- a/src/gpu/cl/kernels/ClSoftmaxKernel.h
+++ b/src/gpu/cl/kernels/ClSoftmaxKernel.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -61,14 +62,20 @@
      * @param[out]    sum             Sum of 1D logits tensor. Data types supported: same as @p src
      * @param[in]     info            Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info);
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       &src,
+                   ITensorInfo             &max,
+                   ITensorInfo             &dst,
+                   ITensorInfo             &sum,
+                   const SoftmaxKernelInfo &info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClLogits1DMaxShiftExpSumKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum);
+    static Status
+    validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum);
     /** Checks if the given size is eligible for parallel reduction
      *
      * @note  Serial reduction is launched for width < (_grid_size * _serial_vector_size).
@@ -100,14 +107,19 @@
      * @param[out] dst             Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input
      * @param[in]  info            Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info);
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       &src,
+                   const ITensorInfo       &sum,
+                   ITensorInfo             &dst,
+                   const SoftmaxKernelInfo &info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClLogits1DNormKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
+    static Status
+    validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
diff --git a/src/gpu/cl/kernels/ClTransposeKernel.cpp b/src/gpu/cl/kernels/ClTransposeKernel.cpp
index 6450ffb..6eb2bf8 100644
--- a/src/gpu/cl/kernels/ClTransposeKernel.cpp
+++ b/src/gpu/cl/kernels/ClTransposeKernel.cpp
@@ -29,9 +29,10 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -58,12 +59,12 @@
     auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
 
     ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst));
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     // Create kernel
-    const unsigned int vec_size_x           = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0));
+    const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0));
     const int          vec_size_x_leftovers = src->dimension(0) % vec_size_x;
-    const unsigned int vec_size_y           = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1));
+    const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1));
     const int          vec_size_y_leftovers = src->dimension(1) % vec_size_y;
 
     CLBuildOptions build_opts;
@@ -89,9 +90,10 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 2, "Transpose up to 2-D src tensor is supported");
 
     // Validate configured dst
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
-        const TensorInfo dst_info = src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src));
+        const TensorInfo dst_info =
+            src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &dst_info);
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
@@ -106,8 +108,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window slice = window.first_slice_window_2D();
 
@@ -117,9 +120,8 @@
         add_2D_tensor_argument(idx, src, slice);
         add_2D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_2D(slice));
+    } while (window.slide_window_slice_2D(slice));
 }
 } // namespace kernels
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
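
A recurring pattern in these kernels, visible in the ClTransposeKernel hunk above, is the pairing of adjust_vec_size() with a *_LEFTOVER build option: each work item processes vec_size elements, and the remainder (dim % vec_size) is compiled in as a constant so the ragged tail is handled in-kernel. The halving loop below is a plausible stand-in for adjust_vec_size(), not the library's exact implementation:

    // Shrink a candidate vector width until it no longer overruns the dimension.
    unsigned int adjust_vec_size_sketch(unsigned int vec_size, unsigned int dim0)
    {
        while (vec_size > 1 && vec_size > dim0)
            vec_size >>= 1;
        return vec_size;
    }

    // e.g. if max_cl_vector_width is 16 bytes and element_size is 4 (fp32), the
    // candidate is 4; a row of width 6 keeps vec_size 4 with a leftover of 6 % 4 = 2.
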
diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp
index ae82569..76f39ac 100644
--- a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp
+++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp
@@ -26,14 +26,14 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
 
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-
 namespace arm_compute
 {
 namespace opencl
@@ -42,11 +42,15 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status validate_arguments(const ITensorInfo   *input,
+                          const ITensorInfo   *weights,
+                          const ITensorInfo   *biases,
+                          const ITensorInfo   *output,
                           const PadStrideInfo &deconv_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32,
+                                                         DataType::QASYMM8_SIGNED, DataType::QASYMM8);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(weights, DataLayout::NHWC);
@@ -56,12 +60,13 @@
     constexpr unsigned int height_idx  = 2;
     constexpr unsigned int batch_idx   = 3;
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx), "Weights feature map dimension should match the respective src's one");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx),
+                                    "Weights feature map dimension should match the respective src's one");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
-        if(is_data_type_quantized_asymmetric(input->data_type()))
+        if (is_data_type_quantized_asymmetric(input->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
@@ -77,15 +82,17 @@
     }
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         const size_t input_width    = input->dimension(width_idx);
         const size_t input_height   = input->dimension(height_idx);
         const size_t weights_width  = weights->dimension(width_idx);
         const size_t weights_height = weights->dimension(height_idx);
 
-        auto        out_dims     = deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info);
-        TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+        auto out_dims =
+            deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info);
+        TensorShape output_shape =
+            misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -96,8 +103,12 @@
 }
 } // namespace
 
-void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights,
-                                              const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info)
+void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context,
+                                              const ITensorInfo      *input,
+                                              const ITensorInfo      *weights,
+                                              const ITensorInfo      *biases,
+                                              ITensorInfo            *output,
+                                              const PadStrideInfo    &deconv_info)
 {
     ARM_COMPUTE_UNUSED(biases, deconv_info);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
@@ -119,7 +130,8 @@
     const size_t output_channels = output->dimension(channel_idx);
 
     // Calculate output shape
-    auto        out_dims     = deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info);
+    auto out_dims =
+        deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info);
     TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
     auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->quantization_info());
 
@@ -147,7 +159,7 @@
     const DataType    input_data_type = input->data_type();
     const PaddingInfo strides         = deconv_info.stride();
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         build_options.add_option(std::string("-DHAS_BIAS"));
         build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type())));
@@ -180,7 +192,7 @@
     build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0));
     build_options.add_option_if((input_channels % k0) != 0, "-DLEFTOVER_LOOP");
 
-    if(is_data_type_quantized(output_data_type))
+    if (is_data_type_quantized(output_data_type))
     {
         const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
         const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
@@ -210,7 +222,7 @@
         build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0));
     }
 
-    if(compile_context.get_ddk_version() >= 30)
+    if (compile_context.get_ddk_version() >= 30)
     {
         build_options.add_option("-fregister-allocation=64");
     }
@@ -235,8 +247,11 @@
     _config_id += support::cpp11::to_string(n0);
 }
 
-Status ClTransposedConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases,
-                                               const ITensorInfo *dst, const PadStrideInfo &deconv_info)
+Status ClTransposedConvolutionKernel::validate(const ITensorInfo   *src,
+                                               const ITensorInfo   *weights,
+                                               const ITensorInfo   *biases,
+                                               const ITensorInfo   *dst,
+                                               const PadStrideInfo &deconv_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, deconv_info));
     return Status{};
@@ -250,17 +265,20 @@
     // Get initial windows
     Window slice = window.first_slice_window_3D();
 
-    const auto src     = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    const auto biases  = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
-    auto       dst     = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto weights =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    const auto biases =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     unsigned int idx = 0;
     add_4d_tensor_nhwc_argument(idx, src);
     add_4d_tensor_nhwc_argument(idx, dst);
 
     add_4d_tensor_nhwc_argument(idx, weights);
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         add_1D_tensor_argument(idx, biases, slice);
     }
diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h
index d4350dd..44f6f56 100644
--- a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h
+++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h
@@ -45,16 +45,23 @@
      * Similar to @ref ClTransposedConvolution::configure()
      *
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights,
-                   const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *input,
+                   const ITensorInfo      *weights,
+                   const ITensorInfo      *biases,
+                   ITensorInfo            *output,
+                   const PadStrideInfo    &deconv_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClTransposedConvolution::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases,
-                           const ITensorInfo *output, const PadStrideInfo &deconv_info);
+    static Status validate(const ITensorInfo   *input,
+                           const ITensorInfo   *weights,
+                           const ITensorInfo   *biases,
+                           const ITensorInfo   *output,
+                           const PadStrideInfo &deconv_info);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -63,4 +70,4 @@
 } // namespace opencl
 } // namespace arm_compute
 
-#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */
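
Note: the revised .clang-format file is not part of this delivery, but the
brace and spacing changes in the hunks above are consistent with the
following clang-format 14 options. This is a sketch inferred from the
formatted output, not the shipped configuration:

    # Hypothetical .clang-format excerpt (inferred from the output above)
    Language:          Cpp
    IndentWidth:       4
    BreakBeforeBraces: Allman             # opening braces on their own line; '} while (...)' stays joined
    SpaceBeforeParens: ControlStatements  # 'if (', 'for (', 'while (' gain a space; function calls do not
    Cpp11BracedListStyle: true            # '{src, biases, dst}' and '_depth{0}' lose the inner padding
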
diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp
index 8f36345..af80c4d 100644
--- a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp
+++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp
@@ -22,9 +22,11 @@
  * SOFTWARE.
  */
 #include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
@@ -39,7 +41,10 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups)
+Status validate_arguments(const ITensorInfo *input,
+                          const ITensorInfo *biases,
+                          const ITensorInfo *output,
+                          unsigned int       num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
@@ -48,20 +53,24 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1);
     ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0);
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type()));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
         ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
         ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
-        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3]));
-        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
+        ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) &&
+                                    (biases->dimension(0) != input->tensor_shape()[3]));
+        ARM_COMPUTE_RETURN_ERROR_ON(
+            (input->num_dimensions() == 5) &&
+            (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
     }
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(
+            output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
     }
@@ -75,16 +84,21 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups)
+void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context,
+                                       const ITensorInfo      *src,
+                                       const ITensorInfo      *biases,
+                                       ITensorInfo            *dst,
+                                       unsigned int            num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
     // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups)));
+    auto_init_if_empty(
+        *dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups)));
 
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst, num_groups));
-    auto padding_info = get_padding_info({ src, biases, dst });
+    auto padding_info = get_padding_info({src, biases, dst});
 
     const DataType data_type = src->data_type();
 
@@ -104,7 +118,10 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups)
+Status ClWeightsReshapeKernel::validate(const ITensorInfo *src,
+                                        const ITensorInfo *biases,
+                                        const ITensorInfo *dst,
+                                        unsigned int       num_groups)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst, num_groups));
     return Status{};
@@ -136,7 +153,7 @@
     _kernel.setArg<cl_uint>(idx++, src->info()->dimension(3));
     _kernel.setArg<cl_uint>(idx++, dst->info()->strides_in_bytes().z());
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         biases_window.use_tensor_dimensions(biases->info()->tensor_shape());
         biases_slice = biases_window.first_slice_window_1D();
@@ -148,7 +165,7 @@
         unsigned idx = 0;
         add_3D_tensor_argument(idx, src, in_slice);
         add_2D_tensor_argument(idx, dst, out_slice);
-        if(biases != nullptr)
+        if (biases != nullptr)
         {
             add_1D_tensor_argument(idx, biases, biases_slice);
             ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice));
@@ -156,8 +173,7 @@
 
         // Run kernel
         enqueue(queue, *this, in_slice, lws_hint());
-    }
-    while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+    } while (window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h
index 7364eb9..5e05f8d 100644
--- a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h
+++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h
@@ -75,14 +75,19 @@
      * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout
      *                             A number of groups greater than one is only supported for the NCHW data layout, and the number of weights must be a multiple of it.
      */
-    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups = 1);
+    void configure(const ClCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   const ITensorInfo      *biases,
+                   ITensorInfo            *dst,
+                   unsigned int            num_groups = 1);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClWeightsReshapeKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1);
+    static Status
+    validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -90,4 +95,4 @@
 } // namespace kernels
 } // namespace opencl
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */
\ No newline at end of file
+#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */
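
Note: the re-wrapped declarations above (one parameter per line, parameter
names aligned in a column, '*' kept on the name) point to a 120-column limit
plus alignment options along these lines; again a sketch inferred from the
output, not the shipped configuration:

    # Hypothetical .clang-format excerpt (inferred from the output above)
    ColumnLimit:                  120
    BinPackParameters:            false  # an overflowing declaration gets one parameter per line
    AlignAfterOpenBracket:        Align  # continuation lines align under the opening parenthesis
    AlignConsecutiveDeclarations: true   # parameter names align, e.g. 'const ITensorInfo      *src'
    AlignConsecutiveAssignments:  true   # '=' aligns across adjacent assignments
    PointerAlignment:             Right  # '*' and '&' bind to the name
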
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
index 0a9a3f0..1519502 100644
--- a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
+++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp
@@ -29,11 +29,11 @@
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/helpers/tensor_info.h"
 #include "support/Cast.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -52,7 +52,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0));
 
-    for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+    for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i));
         ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i));
@@ -63,7 +63,8 @@
 }
 } // namespace
 
-Status ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
+Status
+ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst));
     return Status{};
@@ -74,12 +75,15 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
+void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context,
+                                                 ITensorInfo            *src1,
+                                                 ITensorInfo            *src2,
+                                                 ITensorInfo            *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst));
 
-    auto padding_info = get_padding_info({ src1, src2, dst });
+    auto padding_info = get_padding_info({src1, src2, dst});
 
     const unsigned int min_dimension                     = std::min(src1->dimension(0), src2->dimension(0));
     const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension);
@@ -91,11 +95,12 @@
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
     build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size()));
-    build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) %
+                                                                           num_elems_processed_per_iteration));
 
     // If inputs have different quantization info, set the quantization parameters needed for the re-quantization process
     const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2);
-    if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
+    if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
     {
         const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
         const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
@@ -146,9 +151,11 @@
 
     Window slice = window.first_slice_window_4D();
 
-    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
-    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
-    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     do
     {
@@ -159,8 +166,7 @@
         _kernel.setArg<cl_int>(idx++, _depth);
         _kernel.setArg<cl_int>(idx++, _input1_width);
         enqueue(queue, *this, window, lws_hint());
-    }
-    while(window.slide_window_slice_4D(slice));
+    } while (window.slide_window_slice_4D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
index 5c54479..8b53d6d 100644
--- a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
+++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h
@@ -62,8 +62,8 @@
     void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
 
 private:
-    int32_t _depth{ 0 };
-    int32_t _input1_width{ 0 };
+    int32_t _depth{0};
+    int32_t _input1_width{0};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
index 54f7ad3..c4f84e3 100644
--- a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
+++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
@@ -30,11 +30,11 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/core/utils/helpers/tensor_info.h"
 #include "support/Cast.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -45,15 +45,20 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst)
+Status validate_arguments(const ITensorInfo *src1,
+                          const ITensorInfo *src2,
+                          const ITensorInfo *src3,
+                          const ITensorInfo *src4,
+                          const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1);
     ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > dst->dimension(0));
+    ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) >
+                                dst->dimension(0));
 
-    for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+    for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i));
         ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i));
@@ -71,22 +76,29 @@
     _type = CLKernelType::ELEMENTWISE;
 }
 
-Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst)
+Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1,
+                                                  const ITensorInfo *src2,
+                                                  const ITensorInfo *src3,
+                                                  const ITensorInfo *src4,
+                                                  const ITensorInfo *dst)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst));
     return Status{};
 }
 
 void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context,
-                                                 ITensorInfo *src1, ITensorInfo *src2,
-                                                 ITensorInfo *src3, ITensorInfo *src4,
-                                                 ITensorInfo *dst)
+                                                 ITensorInfo            *src1,
+                                                 ITensorInfo            *src2,
+                                                 ITensorInfo            *src3,
+                                                 ITensorInfo            *src4,
+                                                 ITensorInfo            *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst));
 
-    auto               padding_info                      = get_padding_info({ src1, src2, src3, src4, dst });
-    const unsigned int min_dimension                     = std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0)));
+    auto               padding_info = get_padding_info({src1, src2, src3, src4, dst});
+    const unsigned int min_dimension =
+        std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0)));
     const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension);
     const unsigned int vec_size_leftover                 = dst->dimension(0) % num_elems_processed_per_iteration;
 
@@ -96,9 +108,14 @@
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
     build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
     build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size()));
-    build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
-    build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
-    build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + src3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) %
+                                                                           num_elems_processed_per_iteration));
+    build_opts.add_option("-DINPUT2_ROTATE_N=" +
+                          support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) %
+                                                    num_elems_processed_per_iteration));
+    build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) +
+                                                                            src3->dimension(0) - vec_size_leftover) %
+                                                                           num_elems_processed_per_iteration));
 
     _depth        = src1->dimension(2);
     _input1_width = src1->dimension(0);
@@ -106,8 +123,9 @@
     _input3_width = src3->dimension(0);
 
     // If sources have different quantization info, set the quantization parameters needed for the re-quantization process
-    const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4);
-    if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
+    const bool have_different_qinfo =
+        helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4);
+    if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
     {
         const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
         const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
@@ -166,11 +184,15 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
-    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
-    const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2));
-    const auto src3 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3));
-    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1));
+    const auto src2 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2));
+    const auto src3 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     Window slice = window.first_slice_window_4D();
 
@@ -187,8 +209,7 @@
         _kernel.setArg<cl_int>(idx++, _input2_width);
         _kernel.setArg<cl_int>(idx++, _input3_width);
         enqueue(queue, *this, window, lws_hint());
-    }
-    while(window.slide_window_slice_4D(slice));
+    } while (window.slide_window_slice_4D(slice));
 }
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
index baf8d38..f589b8a 100644
--- a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
+++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h
@@ -52,23 +52,32 @@
      * @param[in]  src4            Fourth source tensor info. Data types supported: same as @p src1
      * @param[out] dst             Destination tensor info. Data types supported: same as @p src1.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *src3, ITensorInfo *src4, ITensorInfo *dst);
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *src1,
+                   ITensorInfo            *src2,
+                   ITensorInfo            *src3,
+                   ITensorInfo            *src4,
+                   ITensorInfo            *dst);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClWidthConcatenate4TensorsKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst);
+    static Status validate(const ITensorInfo *src1,
+                           const ITensorInfo *src2,
+                           const ITensorInfo *src3,
+                           const ITensorInfo *src4,
+                           const ITensorInfo *dst);
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
 
 private:
-    int32_t _depth{ 0 };
-    int32_t _input1_width{ 0 };
-    int32_t _input2_width{ 0 };
-    int32_t _input3_width{ 0 };
+    int32_t _depth{0};
+    int32_t _input1_width{0};
+    int32_t _input2_width{0};
+    int32_t _input3_width{0};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
index 2dfe7fc..989de4a 100644
--- a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
+++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp
@@ -30,10 +30,10 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
-
 #include "support/StringSupport.h"
 
 namespace arm_compute
@@ -53,7 +53,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0));
 
-    for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
+    for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i));
     }
@@ -74,12 +74,15 @@
     return Status{};
 }
 
-void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst)
+void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context,
+                                         ITensorInfo            *src,
+                                         unsigned int            width_offset,
+                                         ITensorInfo            *dst)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst));
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, src->dimension(0));
 
@@ -87,10 +90,11 @@
     CLBuildOptions build_opts;
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
-    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
+                          support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
     build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset));
 
-    if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
+    if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info())
     {
         const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
         const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
@@ -121,8 +125,9 @@
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     unsigned int idx = 0;
     add_4D_tensor_argument(idx, src, window);
diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h
index 3ace440..c10d6a4 100644
--- a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h
+++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h
@@ -50,7 +50,8 @@
      * @param[in,out] dst             Destination tensor info. Data types supported: same as @p src.
      *
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst);
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClWidthConcatenateKernel::configure()
@@ -63,7 +64,7 @@
     void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
 
 private:
-    int32_t _depth{ 0 };
+    int32_t _depth{0};
 };
 } // namespace kernels
 } // namespace opencl
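
Note: the include reshuffling repeated in every .cpp above (for example,
utils/quantization/AsymmHelpers.h now sorting before utils/StringUtils.h, and
a blank line separating the arm_compute/ block from the src/ and support/
block) matches case-insensitive include sorting with regrouped categories,
roughly:

    # Hypothetical .clang-format excerpt (inferred from the output above)
    SortIncludes:  CaseInsensitive  # 'utils/quantization/...' precedes 'utils/StringUtils.h'
    IncludeBlocks: Regroup          # categories are separated by one blank line
    IncludeCategories:
      - Regex:    '^"arm_compute/'
        Priority: 1
      - Regex:    '^"(src|support)/'
        Priority: 2
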
diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
index 7148a4c..58c01d4 100644
--- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
+++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp
@@ -29,10 +29,11 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
@@ -60,14 +61,18 @@
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()),
+        "Winograd filter transform not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width ||
+                                input->dimension(idx_h) != kernel_size.height);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info));
+        const TensorInfo tensor_info_output =
+            input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info));
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -81,11 +86,15 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_UNUSED(output);
 
-    const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1;
+    const unsigned int num_elems_processed_per_iteration_x =
+        input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1;
     const unsigned int num_elems_processed_per_iteration_y = input->dimension(1);
-    const unsigned int num_elems_read_per_iteration_z      = input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2);
+    const unsigned int num_elems_read_per_iteration_z =
+        input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2);
 
-    Window win           = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z));
+    Window win =
+        calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y,
+                                           num_elems_read_per_iteration_z));
     Window win_collapsed = win.collapse(win, Window::DimZ);
     return std::make_pair(Status{}, win_collapsed);
 }
@@ -96,21 +105,25 @@
     _type = CLKernelType::WINOGRAD;
 }
 
-void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info)
+void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context,
+                                                ITensorInfo            *src,
+                                                ITensorInfo            *dst,
+                                                const WinogradInfo     &winograd_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info)));
+    auto_init_if_empty(*dst,
+                       src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info)));
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info));
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     // Set build options
     CLBuildOptions build_opts;
 
     // For NHWC layouts, pass tensor dimensions at runtime
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         _src_dim_z = src->dimension(2);
     }
@@ -125,7 +138,8 @@
     const Size2D output_tile_size = winograd_info.output_tile_size;
 
     // Create kernel
-    std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout()));
+    std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" +
+                              kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout()));
 
     // A macro guard to compile ONLY the kernel of interest
     build_opts.add_option("-D" + upper_string(kernel_name));
@@ -138,7 +152,9 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info)
+Status ClWinogradFilterTransformKernel::validate(const ITensorInfo  *src,
+                                                 const ITensorInfo  *dst,
+                                                 const WinogradInfo &winograd_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first);
@@ -161,7 +177,7 @@
     unsigned int idx = 0;
     add_4D_tensor_argument(idx, src, window);
     add_3D_tensor_argument(idx, dst, window_out);
-    if(src->info()->data_layout() == DataLayout::NHWC)
+    if (src->info()->data_layout() == DataLayout::NHWC)
     {
         _kernel.setArg<cl_uint>(idx++, _src_dim_z);
     }
diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
index b213030..6e439f0 100644
--- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
+++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -59,7 +60,10 @@
      * @param[out] dst             The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. Data types supported: Same as @p input
      * @param[in]  winograd_info   Contains Winograd's information described in @ref WinogradInfo
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info);
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   const WinogradInfo     &winograd_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClWinogradFilterTransformKernel::configure()
@@ -72,7 +76,7 @@
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
 
 private:
-    int32_t _src_dim_z{ 0 };
+    int32_t _src_dim_z{0};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
index fab6c36..54c4898 100644
--- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
+++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -55,17 +56,21 @@
     const PadStrideInfo conv_info        = winograd_info.convolution_info;
     const Size2D        output_tile_size = winograd_info.output_tile_size;
     const Size2D        kernel_size      = winograd_info.kernel_size;
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1,
+                                    "Winograd input transform only supports unit strides");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()),
+        "Winograd input transform not supported");
 
     ARM_COMPUTE_UNUSED(conv_info);
     ARM_COMPUTE_UNUSED(output_tile_size);
     ARM_COMPUTE_UNUSED(kernel_size);
 
     // Validate configured output
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
+        const TensorShape output_shape =
+            misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -74,7 +79,8 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
+std::pair<Status, Window>
+validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info)
 {
     ARM_COMPUTE_UNUSED(output);
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
@@ -82,7 +88,7 @@
     bool window_changed                    = false;
     int  num_elems_processed_per_iteration = 1;
 
-    if(input->data_layout() == DataLayout::NHWC)
+    if (input->data_layout() == DataLayout::NHWC)
     {
         // In the case of FP16 computation, we can process more
         // output feature maps in a single work-item.
@@ -94,9 +100,9 @@
         const size_t   dim0 = input->dimension(0);
         const size_t   k_sz = winograd_info.kernel_size.area();
         const bool     cond = dt == DataType::F16 && ((dim0 % 2) == 0);
-        if(cond)
+        if (cond)
         {
-            if(k_sz == 3 || k_sz == 9)
+            if (k_sz == 3 || k_sz == 9)
             {
                 num_elems_processed_per_iteration = 2;
             }
@@ -104,7 +110,7 @@
     }
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
-    if(input->data_layout() == DataLayout::NCHW)
+    if (input->data_layout() == DataLayout::NCHW)
     {
         const PadStrideInfo conv_info        = winograd_info.convolution_info;
         const Size2D        output_tile_size = winograd_info.output_tile_size;
@@ -113,11 +119,13 @@
         unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1;
         unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1;
 
-        AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
+        AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(),
+                                           num_elems_read_per_iteration_x, num_elems_read_per_iteration_y);
         window_changed = update_window_and_padding(win, input_access);
     }
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
@@ -132,12 +140,15 @@
     return _border_size;
 }
 
-void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info)
+void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context,
+                                               ITensorInfo            *src,
+                                               ITensorInfo            *dst,
+                                               const WinogradInfo     &winograd_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info));
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     const PadStrideInfo conv_info        = winograd_info.convolution_info;
     const Size2D        output_tile_size = winograd_info.output_tile_size;
@@ -150,14 +161,13 @@
 
     // Compute the number of output tiles of size "output_tile_size" along the x and y directions
     const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(src->dimension(idx_w), src->dimension(idx_h)),
-                                                                kernel_size,
-                                                                output_tile_size,
-                                                                conv_info);
+                                                                kernel_size, output_tile_size, conv_info);
 
     _num_tiles_x = num_tiles.width;
     _num_tiles_y = num_tiles.height;
 
-    const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
+    const TensorShape output_shape =
+        misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
 
     // Output auto initialization if not yet initialized
     auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape));
@@ -174,7 +184,7 @@
     _src_height = src->dimension(idx_h);
 
     CLBuildOptions build_opts;
-    if(_data_layout == DataLayout::NHWC)
+    if (_data_layout == DataLayout::NHWC)
     {
         build_opts.add_option("-DNHWC");
         build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
@@ -201,13 +211,14 @@
     }
 
     // Create kernel
-    std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
+    std::string kernel_name =
+        "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string();
 
     // Get the maximum dimension from the tile size
     const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height);
 
     // Check optimized kernel if output_dims == 2x2
-    if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
+    if ((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW))
     {
         _step_z = (src->dimension(2) % 2) != 0 ? 1 : 2;
     }
@@ -239,11 +250,14 @@
     _config_id += lower_string(string_from_data_layout(_data_layout));
 }
 
-Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info)
+Status ClWinogradInputTransformKernel::validate(const ITensorInfo  *src,
+                                                const ITensorInfo  *dst,
+                                                const WinogradInfo &winograd_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first);
     return Status{};
 }
 
@@ -263,7 +277,7 @@
     // Collapse window
     Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ);
 
-    if(_data_layout == DataLayout::NHWC)
+    if (_data_layout == DataLayout::NHWC)
     {
         Window slice = window_collapsed.first_slice_window_3D();
         slice.set(1, Window::Dimension(0, _num_tiles_x * _num_tiles_y, 1));
@@ -298,8 +312,7 @@
             add_3D_tensor_argument(idx, dst, slice);
 
             enqueue(queue, *this, slice, lws_hint());
-        }
-        while(window_collapsed.slide_window_slice_3D(slice));
+        } while (window_collapsed.slide_window_slice_3D(slice));
     }
 }
 } // namespace kernels
diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h
index c10c528..cebebea 100644
--- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h
+++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -59,7 +60,10 @@
      * @param[in] dst             The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input
      * @param[in] winograd_info   Contains Winograd's information described in @ref WinogradInfo.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info);
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   const WinogradInfo     &winograd_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClWinogradInputTransformKernel::configure()
@@ -69,19 +73,19 @@
     static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info);
 
     // Inherited methods overridden:
-    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+    void       run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
     BorderSize border_size() const override;
 
 private:
     using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
 
-    BorderSize   _border_size{ 0 };
-    DataLayout   _data_layout{ DataLayout::UNKNOWN };
-    int          _num_tiles_x{ 0 };
-    int          _num_tiles_y{ 0 };
-    unsigned int _step_z{ 1 };
-    int32_t      _src_width{ 0 };
-    int32_t      _src_height{ 0 };
+    BorderSize   _border_size{0};
+    DataLayout   _data_layout{DataLayout::UNKNOWN};
+    int          _num_tiles_x{0};
+    int          _num_tiles_y{0};
+    unsigned int _step_z{1};
+    int32_t      _src_width{0};
+    int32_t      _src_height{0};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
index bf974d3..89c80c5 100644
--- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
+++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp
@@ -23,7 +23,6 @@
  */
 #include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
 
-#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
@@ -31,10 +30,12 @@
 #include "arm_compute/core/IAccessWindow.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/Window.h"
+#include "arm_compute/core/utils/ActivationFunctionUtils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
@@ -54,7 +55,11 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+Status validate_arguments(const ITensorInfo         *input,
+                          const ITensorInfo         *bias,
+                          const ITensorInfo         *output,
+                          const WinogradInfo        &winograd_info,
+                          const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16);
@@ -66,30 +71,32 @@
     const Size2D        output_tile_size = winograd_info.output_tile_size;
     const Size2D        kernel_size      = winograd_info.kernel_size;
     const Size2D        input_dimensions = winograd_info.input_dimensions;
-    const unsigned int  num_channels     = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1);
+    const unsigned int  num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) *
+                                      (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1);
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout),
+        "Winograd output transform not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels");
 
     // Compute number of elements to process in the X and Y direction
     // Compute the number of output tiles along the x and y direction of size "output_tile_size"
-    const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions,
-                                                                kernel_size,
-                                                                output_tile_size,
-                                                                conv_info);
+    const Size2D num_tiles =
+        compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info);
 
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles.area())));
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
         ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0));
     }
 
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info));
+        const TensorInfo tensor_info_output =
+            input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info));
 
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -98,14 +105,17 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size)
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo  *input,
+                                                        ITensorInfo  *bias,
+                                                        ITensorInfo  *output,
+                                                        const Size2D &output_tile_size)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_UNUSED(bias);
 
     unsigned int num_elems_processed_per_iteration = 1;
 
-    if(input->data_layout() == DataLayout::NHWC)
+    if (input->data_layout() == DataLayout::NHWC)
     {
         // In the case of FP16 computation, we can perform more
         // output feature maps in a single work-item.
@@ -115,7 +125,7 @@
         const DataType dt   = input->data_type();
         const size_t   dim0 = input->dimension(0);
         const bool     cond = dt == DataType::F16 && ((dim0 % 2) == 0);
-        if(cond)
+        if (cond)
         {
             num_elems_processed_per_iteration = 2;
         }
@@ -124,17 +134,19 @@
     Window win            = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
     bool   window_changed = false;
 
-    if(output->data_layout() == DataLayout::NCHW)
+    if (output->data_layout() == DataLayout::NCHW)
     {
         const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width);
         const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height);
 
-        AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
+        AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration,
+                                           num_elems_processed_per_iteration);
         AccessWindowStatic    output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y);
         window_changed = update_window_and_padding(win, input_access, output_access);
     }
 
-    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    Status err =
+        (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
     return std::make_pair(err, win);
 }
 } // namespace
@@ -144,13 +156,18 @@
     _type = CLKernelType::WINOGRAD;
 }
 
-void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info,
+void ClWinogradOutputTransformKernel::configure(const ClCompileContext    &compile_context,
+                                                ITensorInfo               *src,
+                                                ITensorInfo               *bias,
+                                                ITensorInfo               *dst,
+                                                const WinogradInfo        &winograd_info,
                                                 const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
     // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info)));
+    auto_init_if_empty(*dst,
+                       src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info)));
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, winograd_info, act_info));
 
@@ -159,7 +176,7 @@
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     IClKernel::configure_internal(win_config.second);
 
-    auto padding_info = get_padding_info({ src, bias, dst });
+    auto padding_info = get_padding_info({src, bias, dst});
 
     _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC;
 
@@ -168,14 +185,13 @@
     const Size2D        kernel_size      = winograd_info.kernel_size;
     const Size2D        output_tile_size = winograd_info.output_tile_size;
     const PadStrideInfo conv_info        = winograd_info.convolution_info;
-    const int           idx_width        = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH);
-    const int           idx_height       = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT);
+    const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH);
+    const int idx_height =
+        get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT);
 
     // Compute the number of output tiles along the x and y direction of size "output_tile_size"
-    const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions,
-                                                                kernel_size,
-                                                                output_tile_size,
-                                                                conv_info);
+    const Size2D num_tiles =
+        compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info);
     const size_t total_batches = dst->tensor_shape().total_size_upper(3);
 
     // Set build options
@@ -184,11 +200,11 @@
     build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a()));
     build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b()));
 
-    if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2))
+    if ((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2))
     {
         build_opts.add_option("-DVEC_SIZE=2");
     }
-    else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4))
+    else if ((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4))
     {
         build_opts.add_option("-DVEC_SIZE=4");
     }
@@ -200,9 +216,10 @@
     const auto      act_function  = act_info.activation();
     const auto      src_data_type = src->data_type();
 
-    if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST)
-       && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU)
-       && (src_data_type == DataType::F32 || src_data_type == DataType::F16))
+    if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) &&
+        (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU ||
+         act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) &&
+        (src_data_type == DataType::F32 || src_data_type == DataType::F16))
     {
         // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations
         // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations
@@ -213,7 +230,7 @@
         build_opts.add_option("-cl-fast-relaxed-math");
     }
 
-    if(_is_nhwc)
+    if (_is_nhwc)
     {
         build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS"));
         build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step()));
@@ -247,7 +264,9 @@
     _dst_height = dst->dimension(idx_height);
 
     // Create kernel
-    std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout));
+    std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" +
+                              kernel_size.to_string() + "_" +
+                              lower_string(string_from_data_layout(winograd_info.output_data_layout));
 
     // A macro guard to compile ONLY the kernel of interest
     build_opts.add_option("-D" + upper_string(kernel_name));
@@ -271,10 +290,18 @@
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc);
 }
 
-Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
+Status ClWinogradOutputTransformKernel::validate(const ITensorInfo         *src,
+                                                 const ITensorInfo         *bias,
+                                                 const ITensorInfo         *dst,
+                                                 const WinogradInfo        &winograd_info,
+                                                 const ActivationLayerInfo &act_info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), dst->clone().get(), winograd_info.output_tile_size).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(),
+                                                              (bias != nullptr ? bias->clone().get() : nullptr),
+                                                              dst->clone().get(), winograd_info.output_tile_size)
+                                    .first);
     return Status{};
 }
 
@@ -299,7 +326,7 @@
     slice_out.set(Window::DimX, Window::Dimension(0, 0, 0));
     slice_out.set(Window::DimY, Window::Dimension(0, 0, 0));
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
         unsigned int idx1 = 2 * num_arguments_per_4D_tensor();
         Window       slice_biases;
@@ -307,7 +334,7 @@
         add_1D_tensor_argument(idx1, bias, slice_biases);
     }
 
-    if(_is_nhwc)
+    if (_is_nhwc)
     {
         unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0);
         _kernel.setArg(idx2++, static_cast<int>(dst->info()->total_size() - dst->info()->strides_in_bytes().y()));
@@ -322,8 +349,7 @@
         add_4D_tensor_argument(idx, src, slice);
         add_4D_tensor_argument(idx, dst, slice_out);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
+    } while (window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out));
 }
 } // namespace kernels
 } // namespace opencl
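
Note: validate_arguments() above ties the tensor shape to the Winograd transform: dimension 2 must hold (kernel + output_tile - 1) * (kernel + output_tile - 1) transform channels, and dimension 1 must equal the number of output tiles. A minimal sketch of both quantities, assuming stride 1 and symmetric padding rather than the library's full compute_winograd_convolution_tiles():

    #include <cstdio>

    struct Size2D
    {
        unsigned w, h;
    };

    // Number of transform channels for Winograd F(output_tile, kernel):
    // each spatial dimension contributes (output_tile + kernel - 1) points.
    unsigned winograd_num_channels(Size2D output_tile, Size2D kernel)
    {
        return (kernel.w + output_tile.w - 1) * (kernel.h + output_tile.h - 1);
    }

    // Number of output tiles, assuming stride 1 and symmetric padding `pad`
    // (a simplification of the library's tile computation).
    Size2D winograd_num_tiles(Size2D input, Size2D kernel, Size2D output_tile, unsigned pad)
    {
        const unsigned out_w = input.w + 2 * pad - kernel.w + 1;
        const unsigned out_h = input.h + 2 * pad - kernel.h + 1;
        return {(out_w + output_tile.w - 1) / output_tile.w, // ceil division
                (out_h + output_tile.h - 1) / output_tile.h};
    }

    int main()
    {
        // 56x56 input, 3x3 kernel, 4x4 output tile, padding 1:
        // 36 channels and a 14x14 grid of tiles.
        const Size2D tiles = winograd_num_tiles({56, 56}, {3, 3}, {4, 4}, 1);
        std::printf("channels=%u tiles=%ux%u\n",
                    winograd_num_channels({4, 4}, {3, 3}), tiles.w, tiles.h);
    }
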
diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
index 6f01896..65bb963 100644
--- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
+++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/common/Macros.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
@@ -61,7 +62,11 @@
      * @param[in]  winograd_info   Contains Winograd's information described in @ref WinogradInfo
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info,
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src,
+                   ITensorInfo               *bias,
+                   ITensorInfo               *dst,
+                   const WinogradInfo        &winograd_info,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
@@ -69,7 +74,11 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *bias,
+                           const ITensorInfo         *dst,
+                           const WinogradInfo        &winograd_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
     void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
@@ -77,11 +86,11 @@
 private:
     using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>;
 
-    bool    _is_nhwc{ false };
-    int32_t _src_height{ 0 };
-    int32_t _dst_width{ 0 };
-    int32_t _dst_height{ 0 };
-    int32_t _num_tiles_x{ 0 };
+    bool    _is_nhwc{false};
+    int32_t _src_height{0};
+    int32_t _dst_width{0};
+    int32_t _dst_height{0};
+    int32_t _num_tiles_x{0};
 };
 } // namespace kernels
 } // namespace opencl
diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
index 9350bf7..b5ebac3 100644
--- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
+++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp
@@ -39,14 +39,24 @@
 {
 namespace gemm
 {
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
-                                                                       bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m,
+                                                                       unsigned int n,
+                                                                       unsigned int m0,
+                                                                       unsigned int n0,
+                                                                       unsigned int k0,
+                                                                       unsigned int v0,
+                                                                       unsigned int h0,
+                                                                       bool         lhs_interleave,
+                                                                       bool         rhs_interleave,
+                                                                       bool         lhs_transpose,
+                                                                       bool         rhs_transpose,
+                                                                       bool         export_to_cl_image)
 {
     ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0);
     ARM_COMPUTE_ERROR_ON(v0 == 0);
     v0 = std::max(std::min(static_cast<int>(m / m0), static_cast<int>(v0)), static_cast<int>(1));
 
-    if(h0 == 0)
+    if (h0 == 0)
     {
         // When h0 is 0, we should take the maximum H0 possible
         h0 = std::max(n / n0, 1U);
@@ -62,17 +72,22 @@
     return std::make_pair(lhs_info, rhs_info);
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
-                                                                    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
-                                                                    unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
+                    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
+                    unsigned int                                    n,
+                    unsigned int                                    k,
+                    unsigned int                                    b,
+                    DataType                                        data_type)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true, "The fallback GeMM configuration cannot have export_to_cl_image = true");
+    ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true,
+                             "The fallback GeMM configuration cannot have export_to_cl_image = true");
 
     const TensorInfo  tensor_rhs_info(TensorShape(n, k, b), 1, data_type);
     const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second);
     const TensorInfo  tensor_reshaped_info(shape, 1, data_type);
 
-    if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second)))
+    if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second)))
     {
         return info_img;
     }
@@ -90,42 +105,56 @@
     const unsigned int pixel_alignment      = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device());
 
     ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment");
-    if(pixel_alignment == 0)
+    if (pixel_alignment == 0)
     {
         return;
     }
 
     const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel;
-    const unsigned int round_up_width      = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
-    const unsigned int padding             = round_up_width - stride_y_in_elements;
+    const unsigned int round_up_width =
+        ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
+    const unsigned int padding = round_up_width - stride_y_in_elements;
 
     tensor->extend_padding(PaddingSize(0, tensor->padding().right + padding, 0, 0));
 }
 
 Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info)
 {
-    if(rhs_info.export_to_cl_image)
+    if (rhs_info.export_to_cl_image)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false, "Export to cl_image only supported with n0 = 4, 8 or 16");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true, "Export to cl_image only supported with k0 = 4, 8 or 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false,
+                                        "Export to cl_image only supported with n0 = 4, 8 or 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true,
+                                        "Export to cl_image only supported with k0 = 4, 8 or 16");
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16);
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            !image2d_from_buffer_supported(CLKernelLibrary::get().get_device()),
+            "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0,
+                                        "Impossible to retrieve the cl_image pitch alignment");
 
         // Check the width and height of the output tensor.
         // Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension
         const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
         const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
 
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, "Not supported height for cl_image");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4,
+                                        "Not supported width for cl_image");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+            tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h,
+            "Not supported height for cl_image");
     }
 
     return Status{};
 }
 
-bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b,
-                              const DataType data_type, unsigned int &best_m0, unsigned int &best_n0)
+bool is_mmul_kernel_preferred(const unsigned int m,
+                              const unsigned int n,
+                              const unsigned int k,
+                              const unsigned int b,
+                              const DataType     data_type,
+                              unsigned int      &best_m0,
+                              unsigned int      &best_n0)
 {
     ARM_COMPUTE_UNUSED(n, k, b, data_type);
 
@@ -141,7 +170,8 @@
     return ((k % mmul_k0) == 0) && (gws_y > 4);
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     size_t min_acc = std::numeric_limits<size_t>::max();
     size_t min_idx = 0;
@@ -150,12 +180,13 @@
     const size_t num_rows = configs.size();
     const size_t num_cols = configs[0].size();
 
-    ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, N0. K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS");
+    ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, "
+                                              "N0, K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS");
     ARM_COMPUTE_UNUSED(num_cols);
 
     // Find nearest GeMM workload
     // Note: the workload does not depend on the K dimension
-    for(size_t y = 0; y < num_rows; ++y)
+    for (size_t y = 0; y < num_rows; ++y)
     {
         size_t mc0 = static_cast<size_t>(configs[y][0]);
         size_t nc0 = static_cast<size_t>(configs[y][1]);
@@ -168,7 +199,7 @@
         acc += (k - kc0) * (k - kc0);
         acc += (b - bc0) * (b - bc0);
         acc = std::sqrt(acc);
-        if(acc < min_acc)
+        if (acc < min_acc)
         {
             min_acc = acc;
             min_idx = y;
diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h
index 6689b10..84776fb 100644
--- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h
+++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h
@@ -54,8 +54,18 @@
  *
  * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
  */
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
-                                                                       bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false);
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m,
+                                                                       unsigned int n,
+                                                                       unsigned int m0,
+                                                                       unsigned int n0,
+                                                                       unsigned int k0,
+                                                                       unsigned int v0,
+                                                                       unsigned int h0,
+                                                                       bool         lhs_interleave,
+                                                                       bool         rhs_interleave,
+                                                                       bool         lhs_transpose,
+                                                                       bool         rhs_transpose,
+                                                                       bool         export_to_cl_image = false);
 
 /** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
  *
@@ -72,9 +82,13 @@
  *
  * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
  */
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
-                                                                    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
-                                                                    unsigned int n, unsigned int k, unsigned int b, DataType data_type);
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img,
+                    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf,
+                    unsigned int                                    n,
+                    unsigned int                                    k,
+                    unsigned int                                    b,
+                    DataType                                        data_type);
 
 /** Update padding required to export the OpenCL buffer to OpenCL image2d
  *
@@ -103,8 +117,13 @@
  *
  * @return true if MMUL kernel is preferred over kernels w/o MMUL, false otherwise
  */
-bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b,
-                              const DataType data_type, unsigned int &best_m0, unsigned int &best_n0);
+bool is_mmul_kernel_preferred(const unsigned int m,
+                              const unsigned int n,
+                              const unsigned int k,
+                              const unsigned int b,
+                              const DataType     data_type,
+                              unsigned int      &best_m0,
+                              unsigned int      &best_n0);
 
 /** Find the preferred configurations for the LHS and RHS tensor using the GeMMConfigsMatrix provided by the user
  *
@@ -116,7 +135,8 @@
  *
  * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
  */
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 } // namespace gemm
 } // namespace kernels
 } // namespace opencl
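
Note: update_padding_for_cl_image(), declared above, rounds each row's pitch up to the device's cl_image pitch alignment so a buffer can be exported as an image2d. A minimal sketch of that arithmetic; the alignment and pixel-width values here are assumptions for illustration, whereas the library queries them from the OpenCL device at run time:

    #include <cstdio>

    // Right-hand padding (in elements) needed so that the row pitch becomes
    // a multiple of the device pitch alignment.
    unsigned padding_for_cl_image(unsigned stride_y_in_elements,
                                  unsigned pixel_alignment,
                                  unsigned num_floats_per_pixel = 4) // RGBA F32 pixel
    {
        const unsigned row_pitch_alignment = pixel_alignment * num_floats_per_pixel;
        const unsigned round_up_width =
            ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment;
        return round_up_width - stride_y_in_elements;
    }

    int main()
    {
        // e.g. a 150-element row with a 16-pixel alignment rounds up to 192,
        // so 42 elements of padding are appended.
        std::printf("padding = %u elements\n", padding_for_cl_image(150, 16));
    }
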
diff --git a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h
index a49836c..9d08633 100644
--- a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h
+++ b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/core/common/Macros.h"
 
 #include <array>
@@ -56,8 +57,7 @@
      * @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
      *
      */
-    CLGEMMConfigArray(T func_f32, T func_f16, T func_int8)
-        : _configs{ func_f32, func_f16, func_int8 }
+    CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
     {
     }
 
@@ -69,7 +69,7 @@
      */
     T get_function(DataType data_type)
     {
-        switch(data_type)
+        switch (data_type)
         {
             case DataType::F32:
                 return _configs.at(DT_F32);
@@ -96,8 +96,7 @@
      *
      * @param[in] arch GPU target
      */
-    IClGemmKernelConfig(GPUTarget arch)
-        : _target(arch)
+    IClGemmKernelConfig(GPUTarget arch) : _target(arch)
     {
     }
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig);
@@ -111,7 +110,8 @@
      * @param[in] b         Batch size
      * @param[in] data_type Data type
      */
-    virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0;
+    virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0;
 
 protected:
     GPUTarget _target;
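
Note: CLGEMMConfigArray above is a small dispatch table: one heuristic per coarse data-type bucket (F32, F16, 8-bit integer), selected in get_function(). A self-contained sketch of the pattern with a reduced, hypothetical DataType enum:

    #include <array>
    #include <cstdio>
    #include <stdexcept>

    enum class DataType { F32, F16, Int8 }; // reduced stand-in for the real enum

    // One entry per data-type bucket, mirroring the shape of CLGEMMConfigArray<T>.
    template <typename T>
    class ConfigArray
    {
    public:
        ConfigArray(T f32, T f16, T int8) : _configs{f32, f16, int8} {}

        T get_function(DataType dt) const
        {
            switch (dt)
            {
                case DataType::F32:  return _configs[0];
                case DataType::F16:  return _configs[1];
                case DataType::Int8: return _configs[2];
            }
            throw std::invalid_argument("unsupported data type");
        }

    private:
        std::array<T, 3> _configs;
    };

    int main()
    {
        using Fn = const char *(*)();
        const ConfigArray<Fn> configs([] { return "f32 heuristic"; },
                                      [] { return "f16 heuristic"; },
                                      [] { return "int8 heuristic"; });
        std::printf("%s\n", configs.get_function(DataType::F16)());
    }

Keeping the table keyed on a coarse bucket (rather than every DataType value) lets several quantized types share one int8 heuristic, which is why QASYMM8, QASYMM8_SIGNED and QSYMM8_PER_CHANNEL map to a single entry.
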
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
index d74c7fa..2f37eef 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/GPUTarget.h"
+
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 
 #include <utility>
@@ -38,31 +39,34 @@
 {
 namespace gemm
 {
-ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu)
-    : IClGemmKernelConfig(gpu)
+ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu)
 {
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
 {
-    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k,
-                                             unsigned int b);
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClGemmDefaultConfigNativeBifrost::configure_G71_f32,
-                                                                    &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic
-                                                                    &ClGemmDefaultConfigNativeBifrost::configure_G71_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(
+        &ClGemmDefaultConfigNativeBifrost::configure_G71_f32,
+        &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic
+        &ClGemmDefaultConfigNativeBifrost::configure_G71_u8);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigNativeBifrost::configure_G76_f32,
-                                                                    &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic
-                                                                    &ClGemmDefaultConfigNativeBifrost::configure_G76_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(
+        &ClGemmDefaultConfigNativeBifrost::configure_G76_f32,
+        &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic
+        &ClGemmDefaultConfigNativeBifrost::configure_G76_u8);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigNativeBifrost::configure_default_f32,
-                                                                    &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic
-                                                                    &ClGemmDefaultConfigNativeBifrost::configure_default_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+        &ClGemmDefaultConfigNativeBifrost::configure_default_f32,
+        &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic
+        &ClGemmDefaultConfigNativeBifrost::configure_default_u8);
 
     ConfigurationFunctionExecutorPtr func = nullptr;
 
-    switch(_target)
+    switch (_target)
     {
         case GPUTarget::G76:
             func = configs_G76.get_function(data_type);
@@ -79,18 +83,19 @@
     return (this->*func)(m, n, k, b);
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(m == 1)
+    if (m == 1)
     {
-        if(n < 2048)
+        if (n < 2048)
         {
             return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
         }
-        else if(n >= 2048 && n < 8192)
+        else if (n >= 2048 && n < 8192)
         {
             return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
         }
@@ -105,20 +110,21 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(dot8_supported(CLKernelLibrary::get().get_device()))
+    if (dot8_supported(CLKernelLibrary::get().get_device()))
     {
-        if(m == 1)
+        if (m == 1)
         {
-            if(n < 2048)
+            if (n < 2048)
             {
                 return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
             }
-            else if(n >= 2048 && n < 16384)
+            else if (n >= 2048 && n < 16384)
             {
                 return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
             }
@@ -129,7 +135,7 @@
         }
         else
         {
-            if(m < 64)
+            if (m < 64)
             {
                 return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
             }
@@ -141,9 +147,9 @@
     }
     else
     {
-        if(m == 1)
+        if (m == 1)
         {
-            if(n < 8192)
+            if (n < 8192)
             {
                 return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
             }
@@ -159,24 +165,25 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(m == 1)
+    if (m == 1)
     {
-        if(n > 4196)
+        if (n > 4196)
         {
             return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false);
         }
         else
         {
-            if(k < 2048)
+            if (k < 2048)
             {
                 return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false);
             }
-            else if(k >= 2048 && k < 16384)
+            else if (k >= 2048 && k < 16384)
             {
                 return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
             }
@@ -192,18 +199,19 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(m == 1)
+    if (m == 1)
     {
-        if(n < 2048)
+        if (n < 2048)
         {
             return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
         }
-        else if(n >= 2048 && n < 16384)
+        else if (n >= 2048 && n < 16384)
         {
             return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
         }
@@ -214,7 +222,7 @@
     }
     else
     {
-        if(m < 64)
+        if (m < 64)
         {
             return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
         }
@@ -225,7 +233,8 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
@@ -233,7 +242,8 @@
     return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false);
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
@@ -243,4 +253,4 @@
 } // namespace gemm
 } // namespace kernels
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
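
Note: the hardest construct for the formatter in this file is the ConfigurationFunctionExecutorPtr alias: a pointer to a member function returning std::pair, later invoked as (this->*func)(m, n, k, b). A short sketch of that syntax with hypothetical names:

    #include <cstdio>
    #include <utility>

    class Heuristic
    {
    public:
        // "Pointer to member of Heuristic taking (m, n) and returning
        // std::pair<int, int>" - the same shape clang-format has to wrap
        // in the configure() functions above.
        using ConfigFn = std::pair<int, int> (Heuristic::*)(unsigned m, unsigned n);

        std::pair<int, int> dispatch(unsigned m, unsigned n)
        {
            ConfigFn func = (m == 1) ? &Heuristic::vector_case : &Heuristic::matrix_case;
            return (this->*func)(m, n); // member-pointer call syntax
        }

    private:
        std::pair<int, int> vector_case(unsigned, unsigned) { return {1, 4}; }
        std::pair<int, int> matrix_case(unsigned, unsigned) { return {5, 4}; }
    };

    int main()
    {
        Heuristic h;
        const auto mn = h.dispatch(1, 2048);
        std::printf("m0=%d n0=%d\n", mn.first, mn.second);
    }
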
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h
index 9af5dc4..f822daa 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h
@@ -45,15 +45,22 @@
     ClGemmDefaultConfigNativeBifrost(GPUTarget gpu);
 
     // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
 
 private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 };
 } // namespace gemm
 } // namespace kernels
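
Note: when a declaration exceeds the column limit even with one parameter per line, the formatter now breaks after the return type, which is why the std::pair<...> returns above sit on their own line. For comparison, a trailing-return-type spelling (a hypothetical alternative, not used by the library) keeps the function name first:

    #include <utility>

    // Formatter's choice in the headers above: the long return type is
    // broken onto its own line ahead of the function name.
    std::pair<int, int>
    configure_example(unsigned int m, unsigned int n)
    {
        return {int(m), int(n)};
    }

    // Trailing-return-type alternative: the name stays on the first line.
    auto configure_example_alt(unsigned int m, unsigned int n) -> std::pair<int, int>
    {
        return {int(m), int(n)};
    }

    int main()
    {
        return configure_example(1, 2) == configure_example_alt(1, 2) ? 0 : 1;
    }
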
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp
index b9f36c7..f87fb1b 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/GPUTarget.h"
+
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 
 #include <utility>
@@ -38,18 +39,17 @@
 {
 namespace gemm
 {
-ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu)
-    : IClGemmKernelConfig(gpu)
+ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) : IClGemmKernelConfig(gpu)
 {
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
 {
-    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k,
-                                             unsigned int b);
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr,
-                                                                        nullptr,
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr, nullptr,
                                                                         &ClGemmDefaultConfigNativeMidgard::default_q8);
 
     auto func = configs_default.get_function(data_type);
@@ -57,7 +57,8 @@
     return (this->*func)(m, n, k, b);
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
@@ -70,4 +71,4 @@
 } // namespace gemm
 } // namespace kernels
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
index c055753..fa76c5d 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h
@@ -45,10 +45,12 @@
     ClGemmDefaultConfigNativeMidgard(GPUTarget gpu);
 
     // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
 
 private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 };
 } // namespace gemm
 } // namespace kernels
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp
index 95a4d2b..97a1298 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/GPUTarget.h"
+
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 
 #include <utility>
@@ -38,37 +39,38 @@
 {
 namespace gemm
 {
-ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu)
-    : IClGemmKernelConfig(gpu)
+ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu)
 {
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
 {
-    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k,
-                                             unsigned int b);
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(&ClGemmDefaultConfigNativeValhall::configure_G77_f32,
-                                                                        &ClGemmDefaultConfigNativeValhall::configure_G77_f16,
-                                                                        &ClGemmDefaultConfigNativeValhall::configure_G77_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(
+        &ClGemmDefaultConfigNativeValhall::configure_G77_f32, &ClGemmDefaultConfigNativeValhall::configure_G77_f16,
+        &ClGemmDefaultConfigNativeValhall::configure_G77_u8);
 
     auto func = configs_default.get_function(data_type);
     ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
     return (this->*func)(m, n, k, b);
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(m == 1)
+    if (m == 1)
     {
-        if(n < 2048)
+        if (n < 2048)
         {
             return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
         }
-        else if(n >= 2048 && n < 8192)
+        else if (n >= 2048 && n < 8192)
         {
             return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
         }
@@ -83,18 +85,19 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(m == 1)
+    if (m == 1)
     {
-        if(n < 2048)
+        if (n < 2048)
         {
             return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false);
         }
-        else if(n >= 2048 && n < 8192)
+        else if (n >= 2048 && n < 8192)
         {
             return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false);
         }
@@ -109,20 +112,21 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(dot8_supported(CLKernelLibrary::get().get_device()))
+    if (dot8_supported(CLKernelLibrary::get().get_device()))
     {
-        if(m == 1)
+        if (m == 1)
         {
-            if(n < 2048)
+            if (n < 2048)
             {
                 return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false);
             }
-            else if(n >= 2048 && n < 16384)
+            else if (n >= 2048 && n < 16384)
             {
                 return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
             }
@@ -133,7 +137,7 @@
         }
         else
         {
-            if(m < 64)
+            if (m < 64)
             {
                 return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false);
             }
@@ -145,9 +149,9 @@
     }
     else
     {
-        if(m == 1)
+        if (m == 1)
         {
-            if(n < 8192)
+            if (n < 8192)
             {
                 return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false);
             }
@@ -165,4 +169,4 @@
 } // namespace gemm
 } // namespace kernels
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
index f0f812f..c91b095 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h
@@ -45,12 +45,16 @@
     ClGemmDefaultConfigNativeValhall(GPUTarget gpu);
 
     // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
 
 private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 };
 } // namespace gemm
 } // namespace kernels
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
index cf84128..955bb3c 100644
--- a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h
@@ -51,7 +51,7 @@
      */
     static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
     {
-        switch(get_arch_from_target(gpu))
+        switch (get_arch_from_target(gpu))
         {
             case GPUTarget::MIDGARD:
                 return std::make_unique<ClGemmDefaultConfigNativeMidgard>(gpu);
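
For readers skimming the reflowed "using ConfigurationFunctionExecutorPtr" lines in the .cpp hunks that follow: the construct being wrapped is a pointer-to-member-function type used to dispatch to a per-GPU, per-data-type heuristic. The sketch below is a minimal, self-contained illustration with simplified, hypothetical names (Config, KernelConfig and Fn are not the library's API); it only demonstrates why such an alias is long enough to collide with a 120-column limit and how the (this->*func)(...) call site works.

#include <iostream>
#include <utility>

struct Config
{
    int lhs;
    int rhs;
};

class KernelConfig
{
public:
    std::pair<Config, Config> configure(unsigned int m, unsigned int n)
    {
        // The alias below is the construct that produces the long
        // "std::pair<...> (Class::*)(...)" lines rewrapped in this patch.
        using Fn = std::pair<Config, Config> (KernelConfig::*)(unsigned int m, unsigned int n);

        // Pick a heuristic based on the problem shape, then call through
        // the member-function pointer.
        Fn func = (m == 1) ? &KernelConfig::configure_vector : &KernelConfig::configure_matrix;
        return (this->*func)(m, n);
    }

private:
    std::pair<Config, Config> configure_vector(unsigned int m, unsigned int n)
    {
        return {{1, static_cast<int>(n)}, {static_cast<int>(m), 4}};
    }

    std::pair<Config, Config> configure_matrix(unsigned int m, unsigned int n)
    {
        return {{4, 4}, {static_cast<int>(m), static_cast<int>(n)}};
    }
};

int main()
{
    KernelConfig cfg;
    const auto   info = cfg.configure(1, 256); // m == 1 takes the vector path
    std::cout << info.first.lhs << 'x' << info.first.rhs << '\n'; // prints 1x256
}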
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
index 657018e..c956c34 100644
--- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 
 #include <utility>
@@ -43,30 +44,31 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 
-ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu)
-    : IClGemmKernelConfig(gpu)
+ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu)
 {
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
 {
-    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32,
-                                                                    &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16,
-                                                                    &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+        &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16,
+        &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedBifrost::configure_G52_f32,
-                                                                    &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16,
-                                                                    &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(
+        &ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16,
+        &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedBifrost::configure_G76_f32,
-                                                                    &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16,
-                                                                    &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(
+        &ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16,
+        &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8);
 
     ConfigurationFunctionExecutorPtr func = nullptr;
 
-    switch(_target)
+    switch (_target)
     {
         case GPUTarget::G76:
             func = configs_G76.get_function(data_type);
@@ -83,12 +85,13 @@
     return (this->*func)(m, n, k, b);
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(n <= 4)
+    if (n <= 4)
     {
         return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
     }
@@ -98,12 +101,13 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(n <= 4)
+    if (n <= 4)
     {
         return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false);
     }
@@ -113,14 +117,15 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(dot8_supported(CLKernelLibrary::get().get_device()))
+    if (dot8_supported(CLKernelLibrary::get().get_device()))
     {
-        if(n <= 4)
+        if (n <= 4)
         {
             return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true);
         }
@@ -131,7 +136,7 @@
     }
     else
     {
-        if(n <= 4)
+        if (n <= 4)
         {
             return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true);
         }
@@ -142,7 +147,8 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
     const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
@@ -154,100 +160,108 @@
     GEMMLHSMatrixInfo lhs_info_img;
     GEMMRHSMatrixInfo rhs_info_img;
 
-    if(workload <= 274.4000f)
+    if (workload <= 274.4000f)
     {
-        if(r_nk <= 0.7461f)
+        if (r_nk <= 0.7461f)
         {
-            if(r_mn <= 21.1667f)
+            if (r_mn <= 21.1667f)
             {
                 return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false);
             }
             else
             {
-                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
-                std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+                std::tie(lhs_info_buf, rhs_info_buf) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
 
                 return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                           std::make_pair(lhs_info_buf, rhs_info_buf),
-                                           n, k, b, DataType::F32);
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
             }
         }
         else
         {
-            std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
-            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+            std::tie(lhs_info_img, rhs_info_img) =
+                configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+            std::tie(lhs_info_buf, rhs_info_buf) =
+                configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
 
             return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                       std::make_pair(lhs_info_buf, rhs_info_buf),
-                                       n, k, b, DataType::F32);
+                                       std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
         }
     }
     else
     {
-        if(r_mk <= 17.3926f)
+        if (r_mk <= 17.3926f)
         {
-            if(workload <= 542.4000f)
+            if (workload <= 542.4000f)
             {
-                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
-                std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+                std::tie(lhs_info_buf, rhs_info_buf) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
 
                 return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                           std::make_pair(lhs_info_buf, rhs_info_buf),
-                                           n, k, b, DataType::F32);
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
             }
             else
             {
-                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
-                std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
+                std::tie(lhs_info_buf, rhs_info_buf) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
 
                 return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                           std::make_pair(lhs_info_buf, rhs_info_buf),
-                                           n, k, b, DataType::F32);
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
             }
         }
         else
         {
-            if(r_nk <= 0.5463f)
+            if (r_nk <= 0.5463f)
             {
-                if(workload <= 11767.6001f)
+                if (workload <= 11767.6001f)
                 {
-                    std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
-                    std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+                    std::tie(lhs_info_img, rhs_info_img) =
+                        configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+                    std::tie(lhs_info_buf, rhs_info_buf) =
+                        configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
 
                     return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                               std::make_pair(lhs_info_buf, rhs_info_buf),
-                                               n, k, b, DataType::F32);
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
                 }
                 else
                 {
-                    std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
-                    std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
+                    std::tie(lhs_info_img, rhs_info_img) =
+                        configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true);
+                    std::tie(lhs_info_buf, rhs_info_buf) =
+                        configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false);
 
                     return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                               std::make_pair(lhs_info_buf, rhs_info_buf),
-                                               n, k, b, DataType::F32);
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
                 }
             }
             else
             {
-                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
-                std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+                std::tie(lhs_info_buf, rhs_info_buf) =
+                    configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
 
                 return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                           std::make_pair(lhs_info_buf, rhs_info_buf),
-                                           n, k, b, DataType::F32);
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
             }
         }
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
 
     const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
 
-    if(workload <= 323.4000f)
+    if (workload <= 323.4000f)
     {
         return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false);
     }
@@ -257,7 +271,8 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
@@ -268,7 +283,7 @@
     GEMMRHSMatrixInfo rhs_info_img;
 
     // Get lhs_info/rhs_info in case of OpenCL buffer
-    if(n <= 4)
+    if (n <= 4)
     {
         std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true);
     }
@@ -279,15 +294,17 @@
 
     // Get lhs_info/rhs_info in case of OpenCL image
     // Condition on the GPU workload
-    if((m / 4) * (n / 4) >= 2560)
+    if ((m / 4) * (n / 4) >= 2560)
     {
         // Big workload
-        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true);
+        std::tie(lhs_info_img, rhs_info_img) =
+            configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true);
     }
     else
     {
         // Small workload
-        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true);
+        std::tie(lhs_info_img, rhs_info_img) =
+            configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true);
     }
 
     const TensorInfo  tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
@@ -297,7 +314,7 @@
     // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d
     const bool use_cl_image2d = (n <= 4) ? false : true;
 
-    if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
+    if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
     {
         return std::make_pair(lhs_info_img, rhs_info_img);
     }
@@ -307,16 +324,17 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
     const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
 
-    if(workload <= 1595.2000f)
+    if (workload <= 1595.2000f)
     {
-        if(r_mk <= 2.1044f)
+        if (r_mk <= 2.1044f)
         {
-            if(workload <= 870.4000f)
+            if (workload <= 870.4000f)
             {
                 return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false);
             }
@@ -336,12 +354,13 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(n <= 4)
+    if (n <= 4)
     {
         return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true);
     }
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
index d86d1ba..9227ec2 100644
--- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h
@@ -45,17 +45,26 @@
     ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu);
 
     // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
 
 private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 };
 } // namespace gemm
 } // namespace kernels
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
index 58d0873..70b324e 100644
--- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/GPUTarget.h"
+
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 
 #include <utility>
@@ -38,26 +39,27 @@
 {
 namespace gemm
 {
-ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu)
-    : IClGemmKernelConfig(gpu)
+ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu)
 {
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
 {
-    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedValhall::configure_G77_f32,
-                                                                    &ClGemmDefaultConfigReshapedValhall::configure_G77_f16,
-                                                                    &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+        &ClGemmDefaultConfigReshapedValhall::configure_G77_f32, &ClGemmDefaultConfigReshapedValhall::configure_G77_f16,
+        &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedValhall::configure_G78_f32,
-                                                                    &ClGemmDefaultConfigReshapedValhall::configure_G78_f16,
-                                                                    &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+        &ClGemmDefaultConfigReshapedValhall::configure_G78_f32, &ClGemmDefaultConfigReshapedValhall::configure_G78_f16,
+        &ClGemmDefaultConfigReshapedValhall::configure_G77_u8);
 
     ConfigurationFunctionExecutorPtr func = nullptr;
 
-    switch(_target)
+    switch (_target)
     {
         case GPUTarget::G78:
             func = configs_G78.get_function(data_type);
@@ -72,12 +74,13 @@
     return (this->*func)(m, n, k, b);
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(n <= 4)
+    if (n <= 4)
     {
         return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1);
     }
@@ -87,7 +90,8 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
@@ -104,17 +108,17 @@
 
     std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0);
 
-    if(r_mk <= 0.11824845522642136)
+    if (r_mk <= 0.11824845522642136)
     {
-        if(workload <= 880.0)
+        if (workload <= 880.0)
         {
             return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0);
         }
         else
         {
-            if(r_nk <= 0.42521367967128754)
+            if (r_nk <= 0.42521367967128754)
             {
-                if(workload <= 1726.4000244140625)
+                if (workload <= 1726.4000244140625)
                 {
                     return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0);
                 }
@@ -123,13 +127,12 @@
                     std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
 
                     return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                               std::make_pair(lhs_info_buf, rhs_info_buf),
-                                               n, k, b, DataType::F16);
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
                 }
             }
             else
             {
-                if(workload <= 1241.6000366210938)
+                if (workload <= 1241.6000366210938)
                 {
                     return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0);
                 }
@@ -142,17 +145,16 @@
     }
     else
     {
-        if(workload <= 11404.7998046875)
+        if (workload <= 11404.7998046875)
         {
-            if(r_mk <= 1.0126488208770752)
+            if (r_mk <= 1.0126488208770752)
             {
-                if(r_mn <= 2.545312523841858)
+                if (r_mn <= 2.545312523841858)
                 {
                     std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
 
                     return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                               std::make_pair(lhs_info_buf, rhs_info_buf),
-                                               n, k, b, DataType::F16);
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
                 }
                 else
                 {
@@ -161,43 +163,39 @@
             }
             else
             {
-                if(workload <= 2881.199951171875)
+                if (workload <= 2881.199951171875)
                 {
                     std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1);
 
                     return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                               std::make_pair(lhs_info_buf, rhs_info_buf),
-                                               n, k, b, DataType::F16);
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
                 }
                 else
                 {
                     std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
 
                     return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                               std::make_pair(lhs_info_buf, rhs_info_buf),
-                                               n, k, b, DataType::F16);
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
                 }
             }
         }
         else
         {
-            if(r_nk <= 0.5765306055545807)
+            if (r_nk <= 0.5765306055545807)
             {
-                if(r_mn <= 6.010416746139526)
+                if (r_mn <= 6.010416746139526)
                 {
                     std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1);
 
                     return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                               std::make_pair(lhs_info_buf, rhs_info_buf),
-                                               n, k, b, DataType::F16);
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
                 }
                 else
                 {
                     std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1);
 
                     return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                               std::make_pair(lhs_info_buf, rhs_info_buf),
-                                               n, k, b, DataType::F16);
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
                 }
             }
             else
@@ -205,27 +203,27 @@
                 std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1);
 
                 return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                           std::make_pair(lhs_info_buf, rhs_info_buf),
-                                           n, k, b, DataType::F16);
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
             }
         }
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
     const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
     const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
     const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
 
-    if(workload <= 1288.0000f)
+    if (workload <= 1288.0000f)
     {
-        if(workload <= 505.6000f)
+        if (workload <= 505.6000f)
         {
-            if(r_mn <= 0.4466f)
+            if (r_mn <= 0.4466f)
             {
-                if(r_nk <= 0.2384f)
+                if (r_nk <= 0.2384f)
                 {
                     return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
                 }
@@ -241,9 +239,9 @@
         }
         else
         {
-            if(r_mn <= 0.2250f)
+            if (r_mn <= 0.2250f)
             {
-                if(r_mn <= 0.1599f)
+                if (r_mn <= 0.1599f)
                 {
                     return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
                 }
@@ -254,11 +252,11 @@
             }
             else
             {
-                if(r_mk <= 0.7609f)
+                if (r_mk <= 0.7609f)
                 {
-                    if(r_mn <= 2.5453f)
+                    if (r_mn <= 2.5453f)
                     {
-                        if(workload <= 1089.6000f)
+                        if (workload <= 1089.6000f)
                         {
                             return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1);
                         }
@@ -281,29 +279,29 @@
     }
     else
     {
-        if(workload <= 5434.4001f)
+        if (workload <= 5434.4001f)
         {
-            if(workload <= 1603.2000f)
+            if (workload <= 1603.2000f)
             {
                 return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
             }
             else
             {
-                if(r_nk <= 0.6192f)
+                if (r_nk <= 0.6192f)
                 {
-                    if(r_mn <= 16.1016f)
+                    if (r_mn <= 16.1016f)
                     {
                         return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
                     }
                     else
                     {
-                        if(workload <= 2750.0000f)
+                        if (workload <= 2750.0000f)
                         {
                             return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
                         }
                         else
                         {
-                            if(r_mk <= 6.3151f)
+                            if (r_mk <= 6.3151f)
                             {
                                 return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
                             }
@@ -316,15 +314,15 @@
                 }
                 else
                 {
-                    if(r_mk <= 0.0387f)
+                    if (r_mk <= 0.0387f)
                     {
                         return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
                     }
                     else
                     {
-                        if(r_mk <= 2.5859f)
+                        if (r_mk <= 2.5859f)
                         {
-                            if(r_mk <= 0.2734f)
+                            if (r_mk <= 0.2734f)
                             {
                                 return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
                             }
@@ -343,13 +341,13 @@
         }
         else
         {
-            if(r_mk <= 25.7500f)
+            if (r_mk <= 25.7500f)
             {
-                if(r_mk <= 0.3615f)
+                if (r_mk <= 0.3615f)
                 {
-                    if(r_mn <= 0.0913f)
+                    if (r_mn <= 0.0913f)
                     {
-                        if(r_mk <= 0.0683f)
+                        if (r_mk <= 0.0683f)
                         {
                             return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1);
                         }
@@ -365,15 +363,15 @@
                 }
                 else
                 {
-                    if(workload <= 11174.3999f)
+                    if (workload <= 11174.3999f)
                     {
-                        if(r_mk <= 0.8047f)
+                        if (r_mk <= 0.8047f)
                         {
                             return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
                         }
                         else
                         {
-                            if(workload <= 7185.5999f)
+                            if (workload <= 7185.5999f)
                             {
                                 return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1);
                             }
@@ -385,9 +383,9 @@
                     }
                     else
                     {
-                        if(workload <= 17917.5000f)
+                        if (workload <= 17917.5000f)
                         {
-                            if(r_mk <= 1.5078f)
+                            if (r_mk <= 1.5078f)
                             {
                                 return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
                             }
@@ -398,7 +396,7 @@
                         }
                         else
                         {
-                            if(workload <= 34449.6016f)
+                            if (workload <= 34449.6016f)
                             {
                                 return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
                             }
@@ -412,11 +410,11 @@
             }
             else
             {
-                if(r_mk <= 331.1111f)
+                if (r_mk <= 331.1111f)
                 {
-                    if(workload <= 53397.5996f)
+                    if (workload <= 53397.5996f)
                     {
-                        if(r_mn <= 57.8063f)
+                        if (r_mn <= 57.8063f)
                         {
                             return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1);
                         }
@@ -427,7 +425,7 @@
                     }
                     else
                     {
-                        if(r_nk <= 0.9211f)
+                        if (r_nk <= 0.9211f)
                         {
                             return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1);
                         }
@@ -439,7 +437,7 @@
                 }
                 else
                 {
-                    if(workload <= 38070.4004f)
+                    if (workload <= 38070.4004f)
                     {
                         return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1);
                     }
@@ -453,27 +451,28 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
     const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
     const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
 
-    if(workload <= 801.6000f)
+    if (workload <= 801.6000f)
     {
         return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
     }
     else
     {
-        if(r_mn <= 0.1211f)
+        if (r_mn <= 0.1211f)
         {
-            if(workload <= 3296.0000f)
+            if (workload <= 3296.0000f)
             {
                 return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
             }
             else
             {
-                if(r_nk <= 1.0625f)
+                if (r_nk <= 1.0625f)
                 {
                     return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
                 }
@@ -485,15 +484,15 @@
         }
         else
         {
-            if(workload <= 5068.8000f)
+            if (workload <= 5068.8000f)
             {
                 return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
             }
             else
             {
-                if(r_nk <= 0.2361f)
+                if (r_nk <= 0.2361f)
                 {
-                    if(workload <= 12630.0000f)
+                    if (workload <= 12630.0000f)
                     {
                         return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1);
                     }
@@ -504,7 +503,7 @@
                 }
                 else
                 {
-                    if(workload <= 178790.3984f)
+                    if (workload <= 178790.3984f)
                     {
                         return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1);
                     }
@@ -518,12 +517,13 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(n <= 4)
+    if (n <= 4)
     {
         return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1);
     }
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h
index 466eda0..5f62efb 100644
--- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h
@@ -45,14 +45,20 @@
     ClGemmDefaultConfigReshapedValhall(GPUTarget gpu);
 
     // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
 
 private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 };
 } // namespace gemm
 } // namespace kernels
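
Several of the reflowed call sites above follow one idiom: build one (lhs, rhs) configuration for an OpenCL buffer and one for an OpenCL image2d, then let select_lhs_rhs_info return whichever the device and tensor shape actually support. The sketch below mirrors that shape with hypothetical types and an invented stand-in validity check (the %4 rule is for illustration only; the real gate is validate_image2d_support_on_rhs, as used in the hunks above).

#include <iostream>
#include <utility>

struct MatrixInfo
{
    int  block;
    bool export_to_cl_image; // true for the image2d variant
};

using InfoPair = std::pair<MatrixInfo, MatrixInfo>;

// Stand-in for the real image2d validation: pretend the reshaped RHS row
// length must be a multiple of 4 (invented constraint, illustration only).
bool image2d_supported(unsigned int n, unsigned int k)
{
    return (n * k) % 4 == 0;
}

// Mirrors the select_lhs_rhs_info(img_pair, buf_pair, n, k, ...) call shape:
// prefer the image2d configuration when the device/shape allow it.
InfoPair select_info(const InfoPair &img, const InfoPair &buf, unsigned int n, unsigned int k)
{
    return image2d_supported(n, k) ? img : buf;
}

int main()
{
    const InfoPair img{{4, true}, {4, true}};
    const InfoPair buf{{4, false}, {8, false}};

    const InfoPair chosen = select_info(img, buf, 6, 2); // 6 * 2 = 12, divisible by 4
    std::cout << (chosen.first.export_to_cl_image ? "image2d" : "buffer") << '\n';
}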
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
index 1c32f13..83928b3 100644
--- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h
@@ -50,7 +50,7 @@
      */
     static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
     {
-        switch(get_arch_from_target(gpu))
+        switch (get_arch_from_target(gpu))
         {
             case GPUTarget::MIDGARD:
             case GPUTarget::BIFROST:
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
index 9c23d9c..c4825bf 100644
--- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
@@ -29,7 +29,9 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
 #include <utility>
 
 namespace arm_compute
@@ -47,33 +49,39 @@
 {
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
 {
-    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k,
-                                             unsigned int b);
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51(
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G31(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G31(
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8);
 
     ConfigurationFunctionExecutorPtr func = nullptr;
-    switch(_target)
+    switch (_target)
     {
         case GPUTarget::G76:
             func = configs_G76.get_function(data_type);
@@ -96,14 +104,15 @@
     return (this->*func)(m, n, k, b);
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(m == 1)
+    if (m == 1)
     {
-        if(n <= 2548)
+        if (n <= 2548)
         {
             return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false);
         }
@@ -118,12 +127,13 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(m == 1)
+    if (m == 1)
     {
         const unsigned int h0 = std::max(n / 2, 1U);
         return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1);
@@ -131,7 +141,7 @@
     else
     {
         const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
-        if(m >= 28)
+        if (m >= 28)
         {
             return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1);
         }
@@ -142,7 +152,8 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
@@ -154,9 +165,9 @@
 
     const bool is_workload_big = ((m * n * b) / 16) >= 2048;
 
-    if(m == 1)
+    if (m == 1)
     {
-        if(n >= 8192)
+        if (n >= 8192)
         {
             const unsigned int h0 = std::max(n / 4, 1U);
             return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false);
@@ -164,7 +175,7 @@
         else
         {
             const unsigned int h0 = std::max(n / 2, 1U);
-            if(n <= 204)
+            if (n <= 204)
             {
                 return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false);
             }
@@ -177,25 +188,29 @@
     else
     {
         const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
-        if(is_workload_big)
+        if (is_workload_big)
         {
-            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true);
+            std::tie(lhs_info_buf, rhs_info_buf) =
+                configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true);
         }
         else
         {
-            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true);
+            std::tie(lhs_info_buf, rhs_info_buf) =
+                configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true);
         }
     }
 
     // Get lhs_info/rhs_info in case of OpenCL image
     const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1));
-    if(is_workload_big)
+    if (is_workload_big)
     {
-        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
+        std::tie(lhs_info_img, rhs_info_img) =
+            configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true);
     }
     else
     {
-        std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true);
+        std::tie(lhs_info_img, rhs_info_img) =
+            configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true);
     }
 
     const TensorInfo  tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32);
@@ -205,7 +220,7 @@
     // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d
     const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? false : true;
 
-    if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
+    if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d)
     {
         return std::make_pair(lhs_info_img, rhs_info_img);
     }
@@ -215,7 +230,8 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
     const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
@@ -225,46 +241,49 @@
     GEMMLHSMatrixInfo lhs_info_img;
     GEMMRHSMatrixInfo rhs_info_img;
 
-    if(m == 1)
+    if (m == 1)
     {
-        if(r_nk <= 0.4664f)
+        if (r_nk <= 0.4664f)
         {
             return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false);
         }
         else
         {
-            std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true);
-            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false);
+            std::tie(lhs_info_img, rhs_info_img) =
+                configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true);
+            std::tie(lhs_info_buf, rhs_info_buf) =
+                configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false);
 
             return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                       std::make_pair(lhs_info_buf, rhs_info_buf),
-                                       n, k, b, DataType::F32);
+                                       std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
         }
     }
     else
     {
-        if(workload <= 274.4000f)
+        if (workload <= 274.4000f)
         {
             return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false);
         }
         else
         {
-            std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true);
-            std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false);
+            std::tie(lhs_info_img, rhs_info_img) =
+                configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true);
+            std::tie(lhs_info_buf, rhs_info_buf) =
+                configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false);
 
             return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                       std::make_pair(lhs_info_buf, rhs_info_buf),
-                                       n, k, b, DataType::F32);
+                                       std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
         }
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(m == 1)
+    if (m == 1)
     {
         const unsigned int n0 = n < 1280 ? 2 : 4;
         const unsigned int h0 = std::max(n / n0, 1U);
@@ -276,14 +295,15 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(m == 1)
+    if (m == 1)
     {
-        if(n > 2048)
+        if (n > 2048)
         {
             const unsigned int h0 = std::max(n / 4, 1U);
             return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true);
@@ -300,7 +320,8 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
     const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
@@ -312,57 +333,59 @@
     GEMMLHSMatrixInfo lhs_info_img;
     GEMMRHSMatrixInfo rhs_info_img;
 
-    if(m == 1)
+    if (m == 1)
     {
-        std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false);
+        std::tie(lhs_info_buf, rhs_info_buf) =
+            configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false);
 
-        if(r_mk <= 0.0026f)
+        if (r_mk <= 0.0026f)
         {
-            if(r_nk <= 0.4664f)
+            if (r_nk <= 0.4664f)
             {
                 return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
             }
             else
             {
-                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
                 return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                           std::make_pair(lhs_info_buf, rhs_info_buf),
-                                           n, k, b, DataType::F16);
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
             }
         }
         else
         {
-            if(r_mk <= 0.0148f)
+            if (r_mk <= 0.0148f)
             {
                 return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
             }
             else
             {
-                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true);
                 return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                           std::make_pair(lhs_info_buf, rhs_info_buf),
-                                           n, k, b, DataType::F16);
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
             }
         }
     }
     else
     {
-        std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false);
+        std::tie(lhs_info_buf, rhs_info_buf) =
+            configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false);
 
-        if(workload <= 362.6000f)
+        if (workload <= 362.6000f)
         {
             return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false);
         }
         else
         {
-            if(r_mn <= 22.6067f)
+            if (r_mn <= 22.6067f)
             {
-                if(workload <= 708.8000f)
+                if (workload <= 708.8000f)
                 {
-                    std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
+                    std::tie(lhs_info_img, rhs_info_img) =
+                        configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
                     return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                               std::make_pair(lhs_info_buf, rhs_info_buf),
-                                               n, k, b, DataType::F16);
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
                 }
                 else
                 {
@@ -371,27 +394,28 @@
             }
             else
             {
-                if(r_nk <= 0.0917f)
+                if (r_nk <= 0.0917f)
                 {
                     return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false);
                 }
                 else
                 {
-                    std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
+                    std::tie(lhs_info_img, rhs_info_img) =
+                        configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true);
                     return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                               std::make_pair(lhs_info_buf, rhs_info_buf),
-                                               n, k, b, DataType::F16);
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
                 }
             }
         }
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
 
-    if(m == 1)
+    if (m == 1)
     {
         return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false);
     }
@@ -400,15 +424,15 @@
         const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
         const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
 
-        if(workload <= 7449.60f)
+        if (workload <= 7449.60f)
         {
-            if(workload <= 691.60f)
+            if (workload <= 691.60f)
             {
                 return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false);
             }
             else
             {
-                if(workload <= 4155.20f)
+                if (workload <= 4155.20f)
                 {
                     return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
                 }
@@ -420,21 +444,22 @@
         }
         else
         {
-            if(workload <= 16300.80f)
+            if (workload <= 16300.80f)
             {
-                if(r_mn <= 44.56f)
+                if (r_mn <= 44.56f)
                 {
                     GEMMLHSMatrixInfo lhs_info_buf;
                     GEMMRHSMatrixInfo rhs_info_buf;
                     GEMMLHSMatrixInfo lhs_info_img;
                     GEMMRHSMatrixInfo rhs_info_img;
 
-                    std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true);
-                    std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
+                    std::tie(lhs_info_img, rhs_info_img) =
+                        configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true);
+                    std::tie(lhs_info_buf, rhs_info_buf) =
+                        configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
 
                     return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                               std::make_pair(lhs_info_buf, rhs_info_buf),
-                                               n, k, b, DataType::F16);
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
                 }
                 else
                 {
@@ -448,23 +473,25 @@
                 GEMMLHSMatrixInfo lhs_info_img;
                 GEMMRHSMatrixInfo rhs_info_img;
 
-                std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true);
-                std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
+                std::tie(lhs_info_img, rhs_info_img) =
+                    configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true);
+                std::tie(lhs_info_buf, rhs_info_buf) =
+                    configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false);
 
                 return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                           std::make_pair(lhs_info_buf, rhs_info_buf),
-                                           n, k, b, DataType::F16);
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16);
             }
         }
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(m == 1)
+    if (m == 1)
     {
         const unsigned int n0 = n < 1280 ? 2 : 4;
         const unsigned int h0 = std::max(n / n0, 1U);
@@ -476,14 +503,15 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(dot8_supported(CLKernelLibrary::get().get_device()))
+    if (dot8_supported(CLKernelLibrary::get().get_device()))
     {
-        if(m == 1)
+        if (m == 1)
         {
             const unsigned int h0 = std::max(n / 2, 1U);
             return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
@@ -497,7 +525,7 @@
     else
     {
         const int h0 = std::max(std::min(static_cast<int>(n / 2), static_cast<int>(128)), static_cast<int>(1));
-        if(m == 1)
+        if (m == 1)
         {
             return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true);
         }
@@ -508,12 +536,13 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(m == 1)
+    if (m == 1)
     {
         const unsigned int h0 = std::max(n / 2, 1U);
         return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true);
@@ -524,12 +553,13 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(m == 1)
+    if (m == 1)
     {
         const unsigned int h0 = std::max(n / 2, 1U);
         return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true);
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
index 321cbb5..77c0c8d 100644
--- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h
@@ -45,21 +45,34 @@
     ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu);
 
     // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
 
 private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 };
 } // namespace gemm
 } // namespace kernels
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
index d08bf84..da3e2ec 100644
--- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp
@@ -50,30 +50,35 @@
 {
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
 {
-    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k,
-                                             unsigned int b);
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (
+        ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16,
-                                                                    &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
-                                                                     &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16,
-                                                                     &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
 
-    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G715(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32,
-                                                                     &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16,
-                                                                     &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G715(
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16,
+        &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8);
 
     ConfigurationFunctionExecutorPtr func = nullptr;
 
-    switch(_target)
+    switch (_target)
     {
         case GPUTarget::G78:
             func = configs_G78.get_function(data_type);
@@ -96,29 +101,29 @@
     return (this->*func)(m, n, k, b);
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
-    if(m == 1)
+    if (m == 1)
     {
         const float r_mn = static_cast<float>(m) / static_cast<float>(n);
         const float r_mk = static_cast<float>(m) / static_cast<float>(k);
 
-        if(r_mk <= 0.0064484127797186375)
+        if (r_mk <= 0.0064484127797186375)
         {
-            if(r_mn <= 0.0028273810748942196)
+            if (r_mn <= 0.0028273810748942196)
             {
                 GEMMLHSMatrixInfo lhs_info_buf;
                 GEMMRHSMatrixInfo rhs_info_buf;
                 GEMMLHSMatrixInfo lhs_info_img;
                 GEMMRHSMatrixInfo rhs_info_img;
 
-                const unsigned int h0 = std::max(n / 4, 1U);
+                const unsigned int h0                = std::max(n / 4, 1U);
                 std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1);
                 std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0);
 
                 return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                           std::make_pair(lhs_info_buf, rhs_info_buf),
-                                           n, k, b, DataType::F32);
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
             }
             else
             {
@@ -127,7 +132,7 @@
         }
         else
         {
-            if(r_mk <= 0.020312500186264515)
+            if (r_mk <= 0.020312500186264515)
             {
                 return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0);
             }
@@ -143,9 +148,9 @@
         const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
         const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
 
-        if(workload <= 1999.2000122070312)
+        if (workload <= 1999.2000122070312)
         {
-            if(workload <= 747.1999816894531)
+            if (workload <= 747.1999816894531)
             {
                 return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
             }
@@ -159,15 +164,14 @@
                 std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
 
                 return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                           std::make_pair(lhs_info_buf, rhs_info_buf),
-                                           n, k, b, DataType::F32);
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
             }
         }
         else
         {
-            if(r_mn <= 0.03348214365541935)
+            if (r_mn <= 0.03348214365541935)
             {
-                if(r_mk <= 0.028125000186264515)
+                if (r_mk <= 0.028125000186264515)
                 {
                     return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
                 }
@@ -181,8 +185,7 @@
                     std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0);
 
                     return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                               std::make_pair(lhs_info_buf, rhs_info_buf),
-                                               n, k, b, DataType::F32);
+                                               std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
                 }
             }
             else
@@ -195,168 +198,112 @@
                 std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0);
 
                 return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img),
-                                           std::make_pair(lhs_info_buf, rhs_info_buf),
-                                           n, k, b, DataType::F32);
+                                           std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32);
             }
         }
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
-    const GeMMConfigsMatrix configs_1nkb_best =
-    {
-        { 1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0 },
-        { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 },
-        { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 },
-        { 1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0 },
-        { 1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0 },
-        { 1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0 },
-        { 1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 },
-        { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }
+    const GeMMConfigsMatrix configs_1nkb_best = {
+        {1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0},    {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0},
+        {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0},    {1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0},
+        {1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0},    {1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0},
+        {1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0},
+                                                         {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1},
+                                                         {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1},
+                                                         {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1}};
+
+    const GeMMConfigsMatrix configs_mnkb_n_small_fallback = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0},
+                                                             {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0},
+                                                             {16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0},
+                                                             {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = {
+        {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},     {25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1},
+        {369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1},   {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+        {23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},   {90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+        {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},    {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+        {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},  {12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+        {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0},   {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0},
+        {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = {
+        {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},     {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0},
+        {369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0},  {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+        {23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},   {90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},
+        {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0},    {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0},
+        {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0},  {12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0},
+        {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0},   {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0},
+        {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = {
+        {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0},
+        {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1},
+        {49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
     };
 
-    const GeMMConfigsMatrix configs_mnkb_n_small_best =
-    {
-        { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 },
-        { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 },
-        { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 },
-        { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1 }
+    const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = {
+        {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0},
+        {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+        {49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0},
     };
 
-    const GeMMConfigsMatrix configs_mnkb_n_small_fallback =
-    {
-        { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 },
-        { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 },
-        { 16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0 },
-        { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 }
-    };
+    const GeMMConfigsMatrix configs_mnkb_squared_best = {
+        {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0},   {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0},
+        {180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},  {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+        {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1},
+        {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0},    {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}};
 
-    const GeMMConfigsMatrix configs_mnkb_m_gt_n_best =
-    {
-        { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 },
-        { 25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 },
-        { 369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 },
-        { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
-        { 23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
-        { 90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
-        { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
-        { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
-        { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
-        { 12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
-        { 29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0 },
-        { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 },
-        { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 },
-        { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }
-    };
+    const GeMMConfigsMatrix configs_mnkb_squared_fallback = {
+        {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0},   {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0},
+        {180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+        {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0},
+        {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0},    {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}};
 
-    const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback =
-    {
-        { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 },
-        { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0 },
-        { 369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 },
-        { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
-        { 23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 },
-        { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
-        { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 },
-        { 12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 },
-        { 29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0 },
-        { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 },
-        { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 },
-        { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }
-    };
+    const GeMMConfigsMatrix configs_mnkb_best_batched = {
+        {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},  {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1},
+        {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},   {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+        {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},
+        {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},  {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}};
 
-    const GeMMConfigsMatrix configs_mnkb_n_gt_m_best =
-    {
-        { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 },
-        { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1 },
-        { 49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
-    };
-
-    const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback =
-    {
-        { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 },
-        { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
-        { 49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 },
-    };
-
-    const GeMMConfigsMatrix configs_mnkb_squared_best =
-    {
-        { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 },
-        { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 },
-        { 180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
-        { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
-        { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 },
-        { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 },
-        { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }
-    };
-
-    const GeMMConfigsMatrix configs_mnkb_squared_fallback =
-    {
-        { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 },
-        { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 },
-        { 180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
-        { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
-        { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 },
-        { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 },
-        { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }
-    };
-
-    const GeMMConfigsMatrix configs_mnkb_best_batched =
-    {
-        { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 },
-        { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
-        { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }
-    };
-
-    const GeMMConfigsMatrix configs_mnkb_fallback_batched =
-    {
-        { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
-        { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 },
-        { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 },
-        { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }
-    };
+    const GeMMConfigsMatrix configs_mnkb_fallback_batched = {
+        {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},  {4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+        {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},   {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0},
+        {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},
+        {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0},  {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}};
 
     const GeMMConfigsMatrix *configs_best_to_use     = nullptr;
     const GeMMConfigsMatrix *configs_fallback_to_use = nullptr;
 
-    if(b == 1)
+    if (b == 1)
     {
         constexpr float        ratio_m_gt_n = 10.f;
         constexpr float        ratio_n_gt_m = 0.1f;
         constexpr unsigned int n_small_thr  = 4;
         const float            ratio        = static_cast<float>(m) / static_cast<float>(n);
 
-        if(m == 1)
+        if (m == 1)
         {
             // We do not need fallback in this case, as we never use cl_image for the rhs tensor
             configs_best_to_use     = &configs_1nkb_best;
             configs_fallback_to_use = &configs_1nkb_best;
         }
-        else if(n <= n_small_thr && ratio > ratio_m_gt_n)
+        else if (n <= n_small_thr && ratio > ratio_m_gt_n)
         {
             configs_best_to_use     = &configs_mnkb_n_small_best;
             configs_fallback_to_use = &configs_mnkb_n_small_fallback;
         }
-        else if(ratio > ratio_m_gt_n)
+        else if (ratio > ratio_m_gt_n)
         {
             configs_best_to_use     = &configs_mnkb_m_gt_n_best;
             configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback;
         }
-        else if(ratio < ratio_n_gt_m)
+        else if (ratio < ratio_n_gt_m)
         {
             configs_best_to_use     = &configs_mnkb_n_gt_m_best;
             configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback;
@@ -381,17 +328,17 @@
     std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b);
     std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b);
 
-    return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0),
-                               std::make_pair(lhs_info1, rhs_info1),
-                               n, k, b, DataType::F16);
+    return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b,
+                               DataType::F16);
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     ARM_COMPUTE_UNUSED(k);
     ARM_COMPUTE_UNUSED(b);
 
-    if(m == 1)
+    if (m == 1)
     {
         const unsigned int h0 = std::max(n / 2, 1U);
         return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1);
@@ -399,7 +346,7 @@
     else
     {
         const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1));
-        if(m >= 28)
+        if (m >= 28)
         {
             return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1);
         }
@@ -410,30 +357,31 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
     const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
     const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
     const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
 
-    if(m == 1)
+    if (m == 1)
     {
-        if(workload <= 278.7000f)
+        if (workload <= 278.7000f)
         {
-            if(workload <= 7.5000f)
+            if (workload <= 7.5000f)
             {
                 return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
             }
             else
             {
-                if(r_mn <= 0.0031f)
+                if (r_mn <= 0.0031f)
                 {
-                    if(workload <= 256.6000f)
+                    if (workload <= 256.6000f)
                     {
-                        if(workload <= 16.7500f)
+                        if (workload <= 16.7500f)
                         {
-                            if(r_nk <= 1.6671f)
+                            if (r_nk <= 1.6671f)
                             {
                                 return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
                             }
@@ -454,15 +402,15 @@
                 }
                 else
                 {
-                    if(r_mk <= 0.0027f)
+                    if (r_mk <= 0.0027f)
                     {
-                        if(r_mk <= 0.0014f)
+                        if (r_mk <= 0.0014f)
                         {
                             return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
                         }
                         else
                         {
-                            if(workload <= 8.9500f)
+                            if (workload <= 8.9500f)
                             {
                                 return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
                             }
@@ -474,13 +422,13 @@
                     }
                     else
                     {
-                        if(workload <= 14.1500f)
+                        if (workload <= 14.1500f)
                         {
                             return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0);
                         }
                         else
                         {
-                            if(r_mk <= 0.0041f)
+                            if (r_mk <= 0.0041f)
                             {
                                 return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0);
                             }
@@ -495,9 +443,9 @@
         }
         else
         {
-            if(workload <= 363.7000f)
+            if (workload <= 363.7000f)
             {
-                if(r_mk <= 0.0031f)
+                if (r_mk <= 0.0031f)
                 {
                     return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0);
                 }
@@ -514,9 +462,9 @@
     }
     else
     {
-        if(workload <= 1384.8000f)
+        if (workload <= 1384.8000f)
         {
-            if(workload <= 704.0000f)
+            if (workload <= 704.0000f)
             {
                 return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0);
             }
@@ -527,9 +475,9 @@
         }
         else
         {
-            if(workload <= 16761.6006f)
+            if (workload <= 16761.6006f)
             {
-                if(r_mn <= 187.1250f)
+                if (r_mn <= 187.1250f)
                 {
                     return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1);
                 }
@@ -540,7 +488,7 @@
             }
             else
             {
-                if(r_mk <= 432.4630f)
+                if (r_mk <= 432.4630f)
                 {
                     return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1);
                 }
@@ -553,42 +501,37 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f;
     const float r_mn     = static_cast<float>(m) / static_cast<float>(n);
     const float r_mk     = static_cast<float>(m) / static_cast<float>(k);
     const float r_nk     = static_cast<float>(n) / static_cast<float>(k);
 
-    if(m == 1)
+    if (m == 1)
     {
-        const GeMMConfigsMatrix configs_mnkb_best =
-        {
-            { 1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0 },
-            { 1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
-            { 1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
-            { 1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0 },
-            { 1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0 },
-            { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
-            { 1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
-            { 1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }
-        };
+        const GeMMConfigsMatrix configs_mnkb_best = {
+            {1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0},   {1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+            {1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},   {1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0},
+            {1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0},   {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+            {1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}};
 
         return find_lhs_rhs_info(configs_mnkb_best, m, n, k, b);
     }
     else
     {
-        if(workload <= 1384.8000f)
+        if (workload <= 1384.8000f)
         {
-            if(r_nk <= 0.8333f)
+            if (r_nk <= 0.8333f)
             {
-                if(r_mk <= 0.9119f)
+                if (r_mk <= 0.9119f)
                 {
                     return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 4, 0, 1, 0, 1, 1);
                 }
                 else
                 {
-                    if(r_nk <= 0.1181f)
+                    if (r_nk <= 0.1181f)
                     {
                         return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0);
                     }
@@ -600,7 +543,7 @@
             }
             else
             {
-                if(r_mk <= 1.0013f)
+                if (r_mk <= 1.0013f)
                 {
                     return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 1);
                 }
@@ -612,11 +555,11 @@
         }
         else
         {
-            if(workload <= 11404.7998f)
+            if (workload <= 11404.7998f)
             {
-                if(r_mk <= 2.2884f)
+                if (r_mk <= 2.2884f)
                 {
-                    if(r_nk <= 0.9286f)
+                    if (r_nk <= 0.9286f)
                     {
                         return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 4, 0, 1, 1, 0, 1);
                     }
@@ -632,9 +575,9 @@
             }
             else
             {
-                if(r_nk <= 1.1926f)
+                if (r_nk <= 1.1926f)
                 {
-                    if(r_mn <= 1385.7917f)
+                    if (r_mn <= 1385.7917f)
                     {
                         return configure_lhs_rhs_info(m, n, 6, 4, 8, 1, 4, 0, 1, 1, 0, 1);
                     }
@@ -652,12 +595,13 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     unsigned int best_m0;
     unsigned int best_n0;
 
-    if(is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0))
+    if (is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0))
     {
         return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true);
     }
@@ -667,153 +611,101 @@
     }
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
-    const GeMMConfigsMatrix configs_1nkb_best =
-    {
-        { 1, 8984, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0 },
-        { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 },
-        { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 },
-        { 1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
-        { 1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
-        { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 },
-        { 1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0 },
-        { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }
+    const GeMMConfigsMatrix configs_1nkb_best = {
+        {1, 8984, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0},   {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0},
+        {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0},   {1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+        {1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},   {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0},
+        {1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0},
+                                                         {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0},
+                                                         {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0},
+                                                         {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = {
+        {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0},    {25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1},
+        {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},
+        {23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},   {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},    {2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1},   {16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},   {3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0},   {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0},
     };
 
-    const GeMMConfigsMatrix configs_mnkb_n_small_best =
-    {
-        { 102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 },
-        { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 },
-        { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 },
-        { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }
+    const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = {
+        {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0},    {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0},
+        {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},
+        {23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0},  {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0},
+        {8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0},   {2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0},
+        {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0},
+        {12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0},   {3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0},
+        {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0},   {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0},
     };
 
-    const GeMMConfigsMatrix configs_mnkb_m_gt_n_best =
-    {
-        { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 },
-        { 25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1 },
-        { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
-        { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 },
-        { 23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
-        { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
-        { 8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
-        { 2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
-        { 50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 },
-        { 16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
-        { 12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
-        { 3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
-        { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 },
-        { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 },
+    const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0},
+                                                        {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0},
+                                                        {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0},
+                                                            {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0},
+                                                            {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}};
+
+    const GeMMConfigsMatrix configs_mnkb_squared_best = {
+        {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0},    {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0},
+        {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0},   {268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0},
+        {272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1},
     };
 
-    const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback =
-    {
-        { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 },
-        { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 },
-        { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
-        { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 },
-        { 23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 },
-        { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0 },
-        { 8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 },
-        { 2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 },
-        { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 },
-        { 16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 },
-        { 12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0 },
-        { 3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0 },
-        { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 },
-        { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 },
+    const GeMMConfigsMatrix configs_mnkb_squared_fallback = {
+        {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0},    {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0},
+        {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0},   {268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},
+        {180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0},  {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0},
+        {272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0},
     };
 
-    const GeMMConfigsMatrix configs_mnkb_n_gt_m_best =
-    {
-        { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 },
-        { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 },
-        { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 }
-    };
+    const GeMMConfigsMatrix configs_mnkb_best_batched = {
+        {3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1}, {4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1},  {24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1},
+        {1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1},  {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}};
 
-    const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback =
-    {
-        { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 },
-        { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 },
-        { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 }
-    };
-
-    const GeMMConfigsMatrix configs_mnkb_squared_best =
-    {
-        { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 },
-        { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 },
-        { 72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 },
-        { 268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
-        { 180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 },
-        { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 },
-        { 272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
-        { 196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1 },
-    };
-
-    const GeMMConfigsMatrix configs_mnkb_squared_fallback =
-    {
-        { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 },
-        { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 },
-        { 72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 },
-        { 268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 },
-        { 180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 },
-        { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 },
-        { 272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0 },
-        { 196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 },
-    };
-
-    const GeMMConfigsMatrix configs_mnkb_best_batched =
-    {
-        { 3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1 },
-        { 4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
-        { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1 },
-        { 24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
-        { 112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
-        { 5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 },
-        { 1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 },
-        { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 }
-    };
-
-    const GeMMConfigsMatrix configs_mnkb_fallback_batched =
-    {
-        { 3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 },
-        { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 },
-        { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 },
-        { 24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0 },
-        { 112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 },
-        { 5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0 },
-        { 1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 },
-        { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }
-    };
+    const GeMMConfigsMatrix configs_mnkb_fallback_batched = {
+        {3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0},  {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0},
+        {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0},  {24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0},
+        {112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0},
+        {1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}};
 
     const GeMMConfigsMatrix *configs_best_to_use     = nullptr;
     const GeMMConfigsMatrix *configs_fallback_to_use = nullptr;
 
-    if(b == 1)
+    if (b == 1)
     {
         constexpr float        ratio_m_gt_n = 10.f;
         constexpr float        ratio_n_gt_m = 0.1f;
         constexpr unsigned int n_small_thr  = 4;
         const float            ratio        = static_cast<float>(m) / static_cast<float>(n);
 
-        if(m == 1)
+        if (m == 1)
         {
             // We do not need fallback in this case, as we never use cl_image for the rhs tensor
             configs_best_to_use     = &configs_1nkb_best;
             configs_fallback_to_use = &configs_1nkb_best;
         }
-        else if(n <= n_small_thr && ratio > ratio_m_gt_n)
+        else if (n <= n_small_thr && ratio > ratio_m_gt_n)
         {
             configs_best_to_use     = &configs_mnkb_n_small_best;
             configs_fallback_to_use = &configs_mnkb_n_small_best;
         }
-        else if(ratio > ratio_m_gt_n)
+        else if (ratio > ratio_m_gt_n)
         {
             configs_best_to_use     = &configs_mnkb_m_gt_n_best;
             configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback;
         }
-        else if(ratio < ratio_n_gt_m)
+        else if (ratio < ratio_n_gt_m)
         {
             configs_best_to_use     = &configs_mnkb_n_gt_m_best;
             configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback;
@@ -838,17 +730,17 @@
     std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b);
     std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b);
 
-    return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0),
-                               std::make_pair(lhs_info1, rhs_info1),
-                               n, k, b, DataType::F16);
+    return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b,
+                               DataType::F16);
 }
 
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b)
 {
     unsigned int best_m0;
     unsigned int best_n0;
 
-    if(is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0))
+    if (is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0))
     {
         return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true);
     }
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
index f2952a3..a0ea337 100644
--- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h
@@ -45,17 +45,26 @@
     ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu);
 
     // Inherited overridden method
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override;
 
 private:
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
-    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+    std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+    configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b);
 };
 } // namespace gemm
 } // namespace kernels
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
index 1503e74..e07ad99 100644
--- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
@@ -50,7 +50,7 @@
      */
     static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
     {
-        switch(get_arch_from_target(gpu))
+        switch (get_arch_from_target(gpu))
         {
             case GPUTarget::MIDGARD:
             case GPUTarget::BIFROST:
diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp
index 2407c6c..689a743 100644
--- a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp
+++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp
@@ -36,7 +36,9 @@
 {
 namespace kernels
 {
-Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info)
+Status validate_matmul_input_shapes(const TensorShape      &lhs_shape,
+                                    const TensorShape      &rhs_shape,
+                                    const MatMulKernelInfo &matmul_kernel_info)
 {
     const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x();
     const size_t rhs_k = matmul_kernel_info.adj_rhs ? rhs_shape.x() : rhs_shape.y();
@@ -46,7 +48,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty");
 
     constexpr size_t batch_dim_start = 2;
-    for(size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i)
+    for (size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported");
     }
@@ -54,9 +56,12 @@
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs,
-                                                                         const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
-                                                                         int mmul_m0, int mmul_n0)
+std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo      *lhs,
+                                                                         const ITensorInfo      *rhs,
+                                                                         const ITensorInfo      *dst,
+                                                                         const MatMulKernelInfo &matmul_kernel_info,
+                                                                         int                     mmul_m0,
+                                                                         int                     mmul_n0)
 {
     ARM_COMPUTE_UNUSED(lhs, rhs);
 
diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h
index 210f22b..c2ae2a6 100644
--- a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h
+++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h
@@ -44,7 +44,8 @@
  *
  * @return true if the shapes and matmul kernel info match
  */
-Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape,
+Status validate_matmul_input_shapes(const TensorShape      &lhs_shape,
+                                    const TensorShape      &rhs_shape,
                                     const MatMulKernelInfo &matmul_kernel_info);
 
 /** Validate and configure window for Matmul MMUL kernels
@@ -58,9 +59,12 @@
  *
  * @return a pair of Status and Window object
  */
-std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs,
-                                                                         const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info,
-                                                                         int mmul_m0, int mmul_n0);
+std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo      *lhs,
+                                                                         const ITensorInfo      *rhs,
+                                                                         const ITensorInfo      *dst,
+                                                                         const MatMulKernelInfo &matmul_kernel_info,
+                                                                         int                     mmul_m0,
+                                                                         int                     mmul_n0);
 } // namespace kernels
 } // namespace opencl
 } // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp
index 74a818d..66877eb 100644
--- a/src/gpu/cl/operators/ClActivation.cpp
+++ b/src/gpu/cl/operators/ClActivation.cpp
@@ -23,19 +23,21 @@
  */
 #include "src/gpu/cl/operators/ClActivation.h"
 
-#include "src/gpu/cl/ClCompileContext.h"
-#include "src/gpu/cl/kernels/ClActivationKernel.h"
-
 #include "src/common/IOperator.h"
 #include "src/common/utils/LegacySupport.h"
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/ClContext.h"
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
 
 namespace arm_compute
 {
 namespace opencl
 {
-void ClActivation::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClActivation::configure(const ClCompileContext    &compile_context,
+                             ITensorInfo               *src,
+                             ITensorInfo               *dst,
+                             const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(src, dst, act_info);
     auto k = std::make_unique<kernels::ClActivationKernel>();
@@ -53,13 +55,17 @@
 {
 namespace opencl
 {
-std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate)
+std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor     &src,
+                                                                 const AclTensorDescriptor     &dst,
+                                                                 const AclActivationDescriptor &act,
+                                                                 bool                           is_validate)
 {
     TensorInfo src_info = detail::convert_to_legacy_tensor_info(src);
     TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst);
     auto       info     = detail::convert_to_activation_info(act);
 
-    if(is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info)))
+    if (is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false),
+                                                                         &dst_info.set_is_resizable(false), info)))
     {
         return std::make_tuple(nullptr, StatusCode::UnsupportedConfig);
     }
@@ -68,7 +74,7 @@
     act_op->configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info);
 
     auto op = new arm_compute::IOperator(static_cast<IContext *>(this));
-    if(op == nullptr)
+    if (op == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources");
         return std::make_tuple(nullptr, StatusCode::OutOfMemory);
diff --git a/src/gpu/cl/operators/ClActivation.h b/src/gpu/cl/operators/ClActivation.h
index 348dc27..4f25bb5 100644
--- a/src/gpu/cl/operators/ClActivation.h
+++ b/src/gpu/cl/operators/ClActivation.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_ACTIVATION_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
 
@@ -43,7 +44,10 @@
      * @param[out] dst             Destination tensor info. Data type supported: same as @p src
      * @param[in]  activation_info Activation layer parameters.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &activation_info);
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &activation_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClActivation::configure()
diff --git a/src/gpu/cl/operators/ClAdd.cpp b/src/gpu/cl/operators/ClAdd.cpp
index b9bf505..b58d0df 100644
--- a/src/gpu/cl/operators/ClAdd.cpp
+++ b/src/gpu/cl/operators/ClAdd.cpp
@@ -23,17 +23,20 @@
  */
 #include "src/gpu/cl/operators/ClAdd.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
 {
-void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
-                      ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void ClAdd::configure(const ClCompileContext    &compile_context,
+                      ITensorInfo               *src1,
+                      ITensorInfo               *src2,
+                      ITensorInfo               *dst,
+                      ConvertPolicy              policy,
+                      const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info);
     auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
@@ -41,8 +44,11 @@
     _kernel = std::move(k);
 }
 
-Status ClAdd::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst,
-                       ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status ClAdd::validate(const ITensorInfo         *src1,
+                       const ITensorInfo         *src2,
+                       const ITensorInfo         *dst,
+                       ConvertPolicy              policy,
+                       const ActivationLayerInfo &act_info)
 {
     return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info);
 }
diff --git a/src/gpu/cl/operators/ClAdd.h b/src/gpu/cl/operators/ClAdd.h
index a17ce7b..7aed902 100644
--- a/src/gpu/cl/operators/ClAdd.h
+++ b/src/gpu/cl/operators/ClAdd.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_ADD_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
 
@@ -65,7 +66,11 @@
      * @param[in]      policy          Policy to use to handle overflow.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy,
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   ConvertPolicy              policy,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
@@ -73,7 +78,10 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy,
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           ConvertPolicy              policy,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
 };
 } // namespace opencl
diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp
index 05ea21b..8f26ef0 100644
--- a/src/gpu/cl/operators/ClCast.cpp
+++ b/src/gpu/cl/operators/ClCast.cpp
@@ -23,16 +23,18 @@
  */
 #include "src/gpu/cl/operators/ClCast.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClCastKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
 {
-void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
+void ClCast::configure(const ClCompileContext &compile_context,
+                       const ITensorInfo      *src,
+                       ITensorInfo            *dst,
+                       ConvertPolicy           policy)
 {
     ARM_COMPUTE_LOG_PARAMS(src, dst, policy);
     auto k = std::make_unique<kernels::ClCastKernel>();
diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h
index 1b67ff7..25d2293 100644
--- a/src/gpu/cl/operators/ClCast.h
+++ b/src/gpu/cl/operators/ClCast.h
@@ -58,7 +58,8 @@
      * @param[out] dst             The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32.
      * @param[in]  policy          Conversion policy.
      */
-    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
+    void
+    configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClCast::configure()
diff --git a/src/gpu/cl/operators/ClConcatenate.cpp b/src/gpu/cl/operators/ClConcatenate.cpp
index a27fc37..31018b9 100644
--- a/src/gpu/cl/operators/ClConcatenate.cpp
+++ b/src/gpu/cl/operators/ClConcatenate.cpp
@@ -23,9 +23,14 @@
  */
 #include "src/gpu/cl/operators/ClConcatenate.h"
 
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
 #include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h"
 #include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h"
 #include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h"
@@ -33,42 +38,39 @@
 #include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h"
 #include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h"
 
-#include "arm_compute/core/Error.h"
-#include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Types.h"
-
-#include "src/common/utils/Log.h"
-#include "src/core/helpers/AutoConfiguration.h"
-
 namespace arm_compute
 {
 namespace opencl
 {
-void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis)
+void ClConcatenate::configure(const CLCompileContext           &compile_context,
+                              const std::vector<ITensorInfo *> &src_vector,
+                              ITensorInfo                      *dst,
+                              size_t                            axis)
 {
     ARM_COMPUTE_ERROR_ON(dst == nullptr);
     ARM_COMPUTE_LOG_PARAMS(src_vector, dst, axis);
     _axis       = axis;
     _num_inputs = src_vector.size();
 
-    TensorShape                      dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis);
+    TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis);
     std::vector<const ITensorInfo *> const_src_vector(src_vector.size());
-    std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(t);
-        return t;
-    });
+    std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(),
+                   [](ITensorInfo *t)
+                   {
+                       ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+                       return t;
+                   });
 
     // dst auto initialization if not yet initialized
     auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type());
     ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis));
 
     unsigned int offset = 0;
-    switch(_axis)
+    switch (_axis)
     {
         case Window::DimX:
         {
-            switch(_num_inputs)
+            switch (_num_inputs)
             {
                 case 2:
                 {
@@ -82,14 +84,15 @@
                 {
                     // Configure WidthConcatenate4Tensors kernel
                     auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>();
-                    kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst);
+                    kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2),
+                                      src_vector.at(3), dst);
                     _concat_kernels.emplace_back(std::move(kernel));
                     break;
                 }
                 default:
                 {
                     // Configure generic case WidthConcatenate kernels
-                    for(unsigned int i = 0; i < _num_inputs; ++i)
+                    for (unsigned int i = 0; i < _num_inputs; ++i)
                     {
                         auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>();
                         kernel->configure(compile_context, src_vector.at(i), offset, dst);
@@ -103,7 +106,7 @@
         }
         case Window::DimY:
         {
-            for(unsigned int i = 0; i < _num_inputs; ++i)
+            for (unsigned int i = 0; i < _num_inputs; ++i)
             {
                 auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>();
                 kernel->configure(compile_context, src_vector.at(i), offset, dst);
@@ -114,7 +117,7 @@
         }
         case Window::DimZ:
         {
-            for(unsigned int i = 0; i < _num_inputs; ++i)
+            for (unsigned int i = 0; i < _num_inputs; ++i)
             {
                 auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>();
                 kernel->configure(compile_context, src_vector.at(i), offset, dst);
@@ -125,7 +128,7 @@
         }
         case 3:
         {
-            for(unsigned int i = 0; i < _num_inputs; ++i)
+            for (unsigned int i = 0; i < _num_inputs; ++i)
             {
                 auto kernel = std::make_unique<kernels::ClBatchConcatenateKernel>();
                 kernel->configure(compile_context, src_vector.at(i), offset, dst);
@@ -148,25 +151,27 @@
     ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2);
 
     unsigned int offset = 0;
-    switch(axis)
+    switch (axis)
     {
         case Window::DimX:
         {
-            switch(num_inputs)
+            switch (num_inputs)
             {
                 case 2:
                     // Validate WidthConcatenate2Tensors kernels if there are 2 inputs
                     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]);
-                    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst));
+                    ARM_COMPUTE_RETURN_ON_ERROR(
+                        kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst));
                     break;
                 case 4:
                     // Validate WidthConcatenate4Tensors kernels if there are 4 inputs
                     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]);
-                    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst));
+                    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(
+                        src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst));
                     break;
                 default:
                     // Validate generic case of WidthConcatenate kernel
-                    for(const auto &src : src_vector)
+                    for (const auto &src : src_vector)
                     {
                         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
                         ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst));
@@ -178,7 +183,7 @@
         }
         case Window::DimY:
         {
-            for(const auto &src : src_vector)
+            for (const auto &src : src_vector)
             {
                 ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst));
                 offset += src->dimension(axis);
@@ -187,7 +192,7 @@
         }
         case Window::DimZ:
         {
-            for(const auto &src : src_vector)
+            for (const auto &src : src_vector)
             {
                 ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst));
                 offset += src->dimension(axis);
@@ -196,7 +201,7 @@
         }
         case 3:
         {
-            for(const auto &src : src_vector)
+            for (const auto &src : src_vector)
             {
                 ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst));
                 offset += src->dimension(axis);
@@ -207,7 +212,7 @@
             ARM_COMPUTE_ERROR("Axis not supported");
     }
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis);
         ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size());
@@ -218,17 +223,17 @@
 
 void ClConcatenate::run(ITensorPack &tensors)
 {
-    if(tensors.empty())
+    if (tensors.empty())
     {
         ARM_COMPUTE_ERROR("No inputs provided");
     }
 
-    if(static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs))
+    if (static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs))
     {
         ARM_COMPUTE_ERROR("Configured with different number of inputs");
     }
 
-    if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4))
+    if (_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4))
     {
         ARM_COMPUTE_ERROR_ON(_concat_kernels.empty());
         CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true);
@@ -236,7 +241,7 @@
     else
     {
         int i = 0;
-        for(auto &k : _concat_kernels)
+        for (auto &k : _concat_kernels)
         {
             ITensorPack pack;
             pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i));
diff --git a/src/gpu/cl/operators/ClConcatenate.h b/src/gpu/cl/operators/ClConcatenate.h
index de0cf84..d8ce9d2 100644
--- a/src/gpu/cl/operators/ClConcatenate.h
+++ b/src/gpu/cl/operators/ClConcatenate.h
@@ -57,7 +57,10 @@
      * @param[out]    dst             Destination tensor info. Data types supported: same as @p src_vector.
      * @param[in]     axis            Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3.
      */
-    void configure(const ClCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis);
+    void configure(const ClCompileContext           &compile_context,
+                   const std::vector<ITensorInfo *> &src_vector,
+                   ITensorInfo                      *dst,
+                   size_t                            axis);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClConcatenate::configure()
@@ -71,8 +74,8 @@
 
 private:
     std::vector<std::unique_ptr<IClKernel>> _concat_kernels{};
-    unsigned int                            _num_inputs{ 0 };
-    unsigned int                            _axis{ 0 };
+    unsigned int                            _num_inputs{0};
+    unsigned int                            _axis{0};
 };
 } // namespace opencl
 } // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp
index eb9475c..2c3b021 100644
--- a/src/gpu/cl/operators/ClConv2d.cpp
+++ b/src/gpu/cl/operators/ClConv2d.cpp
@@ -23,17 +23,17 @@
  */
 #include "src/gpu/cl/operators/ClConv2d.h"
 
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
+
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/operators/ClDirectConv2d.h"
 #include "src/gpu/cl/operators/ClGemmConv2d.h"
 #include "src/gpu/cl/operators/ClIndirectConv2d.h"
 #include "src/gpu/cl/operators/ClWinogradConv2d.h"
 
-#include "src/common/utils/Log.h"
-
 #include <memory>
 
 namespace
@@ -48,7 +48,7 @@
  */
 size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target)
 {
-    switch(gpu_target)
+    switch (gpu_target)
     {
         case arm_compute::GPUTarget::G76:
         case arm_compute::GPUTarget::G77:
@@ -71,27 +71,33 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 
-ClConv2d::ClConv2d()
-    : _operator()
+ClConv2d::ClConv2d() : _operator()
 {
 }
 
 ClConv2d::~ClConv2d() = default;
 
-void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info,
-                         const WeightsInfo &weights_info)
+void ClConv2d::configure(const CLCompileContext &compile_context,
+                         ITensorInfo            *src,
+                         ITensorInfo            *weights,
+                         ITensorInfo            *biases,
+                         ITensorInfo            *dst,
+                         const Conv2dInfo       &conv2d_info,
+                         const WeightsInfo      &weights_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info));
     ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info);
 
-    switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target()))
+    switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target()))
     {
         case ConvolutionMethod::WINOGRAD:
         {
             ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1);
             auto f = std::make_unique<ClWinogradConv2d>();
-            f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math);
+            f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info,
+                         conv2d_info.enable_fast_math);
             _operator = std::move(f);
             break;
         }
@@ -125,35 +131,46 @@
     _aux_mem = _operator->workspace();
 }
 
-Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+Status ClConv2d::validate(const ITensorInfo *src,
+                          const ITensorInfo *weights,
+                          const ITensorInfo *biases,
+                          const ITensorInfo *dst,
+                          const Conv2dInfo  &conv2d_info,
                           const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW),
+                                    "Grouping (num_groups != 1) with NHWC data layout is not supported");
 
     const GPUTarget gpu_target = CLScheduler::get().target();
 
-    switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target))
+    switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target))
     {
         case ConvolutionMethod::WINOGRAD:
         {
             //Validate Winograd
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported");
-            ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math));
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1,
+                                            "Grouping (num_groups != 1) with ClWinogradConv2d is not supported");
+            ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info,
+                                                                   conv2d_info.act_info, conv2d_info.enable_fast_math));
             break;
         }
         case ConvolutionMethod::DIRECT:
         {
             // Validate direct convolution layer
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported");
-            ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1,
+                                            "Grouping (num_groups != 1) with ClDirectConv2d is not supported");
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
             break;
         }
         case ConvolutionMethod::INDIRECT:
         {
             // Validate indirect convolution layer
-            ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClIndirectConv2d is not supported");
-            ARM_COMPUTE_RETURN_ON_ERROR(ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
+            ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1,
+                                            "Grouping (num_groups != 1) with ClIndirectConv2d is not supported");
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info));
             break;
         }
         case ConvolutionMethod::GEMM:
@@ -170,8 +187,12 @@
     return Status{};
 }
 
-ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
-                                                   const WeightsInfo &weights_info, const GPUTarget gpu_target)
+ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src,
+                                                   const ITensorInfo *weights,
+                                                   const ITensorInfo *dst,
+                                                   const Conv2dInfo  &conv2d_info,
+                                                   const WeightsInfo &weights_info,
+                                                   const GPUTarget    gpu_target)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_ERROR_ON_NULLPTR(dst);
@@ -191,20 +212,35 @@
     using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>;
     using ConfigurationMethod      = std::pair<ConvolutionConfiguration, ConvolutionMethod>;
 
-    const std::vector<ConfigurationMethod> known_configs =
-    {
+    const std::vector<ConfigurationMethod> known_configs = {
         // Alexnet
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U),
+                                                     PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW),
+                            ConvolutionMethod::DIRECT),
         // VGG16 / VGG19
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT),
+        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U),
+                                                     PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW),
+                            ConvolutionMethod::DIRECT),
         // Mobilenet 224
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
+        ConfigurationMethod(ConvolutionConfiguration(
+                                Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U),
+                                PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW),
+                            ConvolutionMethod::GEMM),
         // Mobilenet 160
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM),
+        ConfigurationMethod(ConvolutionConfiguration(
+                                Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U),
+                                PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW),
+                            ConvolutionMethod::GEMM),
         // Mobilenet 224
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
+        ConfigurationMethod(ConvolutionConfiguration(
+                                Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U),
+                                PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC),
+                            ConvolutionMethod::GEMM),
         // Mobilenet 160
-        ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM),
+        ConfigurationMethod(ConvolutionConfiguration(
+                                Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U),
+                                PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC),
+                            ConvolutionMethod::GEMM),
     };
 
     const auto find_config = [&](ConfigurationMethod c)
@@ -213,76 +249,89 @@
         const PadStrideInfo            info        = std::get<3>(config);
         const DataLayout               data_layout = std::get<4>(config);
 
-        return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h))
-               && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right()
-               && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == src->data_layout());
+        return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) &&
+               std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) &&
+               std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) &&
+               info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() &&
+               info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() &&
+               info.stride() == conv_info.stride() && (data_layout == src->data_layout());
     };
 
     std::vector<ConfigurationMethod>::const_iterator found;
-    if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
+    if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end())
     {
         return (*found).second;
     }
 
-    if(dilation != Size2D(1U, 1U))
+    if (dilation != Size2D(1U, 1U))
     {
         return ConvolutionMethod::GEMM;
     }
     else
     {
-        if(src->data_layout() == DataLayout::NCHW)
+        if (src->data_layout() == DataLayout::NCHW)
         {
             // SRGAN
-            if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3)
-               && (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)))
+            if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) &&
+                (conv_info.pad_top() < 3) &&
+                (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)))
             {
                 return ConvolutionMethod::DIRECT;
             }
-            if((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)))
+            if ((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) &&
+                (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)))
             {
                 return ConvolutionMethod::FFT;
             }
-            if(src->dimension(idx_c) < 16)
+            if (src->dimension(idx_c) < 16)
             {
                 return ConvolutionMethod::GEMM;
             }
-            return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
+            return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))
+                       ? ConvolutionMethod::WINOGRAD
+                       : ConvolutionMethod::GEMM;
         }
         else
         {
-            const bool   is_direct_valid           = bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
-            const bool   is_wino_valid             = bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math));
+            const bool is_direct_valid =
+                bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
+            const bool is_wino_valid =
+                bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math));
             const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target);
 
             // SRGAN case
-            if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3)
-               && is_direct_valid)
+            if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) &&
+                (conv_info.pad_top() < 3) && is_direct_valid)
             {
                 return ConvolutionMethod::DIRECT;
             }
 
             // Floating-point case: GeMM/Direct/Winograd
-            if(is_data_type_float(src->data_type()))
+            if (is_data_type_float(src->data_type()))
             {
                 // Get dst shape
-                TensorShape   output_shape       = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
-                const bool    is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
-                const bool    is_ifm_ge_8        = src->dimension(idx_c) >= 8;
-                const bool    is_ifm_ge_16       = src->dimension(idx_c) >= 16;
-                const bool    is_ofm_lte_8       = weights->dimension(3U) <= 8;
-                const bool    is_ofm_lt_64       = weights->dimension(3U) < 64;
-                const bool    workload_gte_8192  = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
-                const bool    is_ifm_gt_ofm      = src->dimension(idx_c) > weights->dimension(3U);
-                const bool    is_m_one           = output_shape[1] * output_shape[2] == 1;
-                const bool    is_unit_stride     = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1);
-                const int32_t kernel_sz          = weights->dimension(idx_w) * weights->dimension(idx_h);
+                TensorShape output_shape =
+                    misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+                const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) &&
+                                                (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr);
+                const bool is_ifm_ge_8       = src->dimension(idx_c) >= 8;
+                const bool is_ifm_ge_16      = src->dimension(idx_c) >= 16;
+                const bool is_ofm_lte_8      = weights->dimension(3U) <= 8;
+                const bool is_ofm_lt_64      = weights->dimension(3U) < 64;
+                const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192;
+                const bool is_ifm_gt_ofm     = src->dimension(idx_c) > weights->dimension(3U);
+                const bool is_m_one          = output_shape[1] * output_shape[2] == 1;
+                const bool is_unit_stride =
+                    (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1);
+                const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h);
 
                 // Run Winograd if valid and IFM >= 8
-                if(is_wino_valid && is_ifm_ge_8)
+                if (is_wino_valid && is_ifm_ge_8)
                 {
-                    if(is_ofm_lte_8)
+                    if (is_ofm_lte_8)
                     {
-                        if(gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)
+                        if (gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 ||
+                            get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)
                         {
                             return ConvolutionMethod::WINOGRAD;
                         }
@@ -294,18 +343,19 @@
                 }
 
                 // Direct convolution case
-                if(is_direct_valid)
+                if (is_direct_valid)
                 {
-                    if((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD))
+                    if ((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 ||
+                         get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD))
                     {
-                        if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm)
+                        if (is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm)
                         {
                             return ConvolutionMethod::DIRECT;
                         }
                     }
-                    else if(gpu_target == arm_compute::GPUTarget::G76)
+                    else if (gpu_target == arm_compute::GPUTarget::G76)
                     {
-                        if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16))
+                        if ((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16))
                         {
                             return ConvolutionMethod::DIRECT;
                         }
@@ -314,21 +364,24 @@
                     {
                         ConvolutionMethod preferred_conv_method = ConvolutionMethod::DIRECT;
 
-                        const bool is_indirect_valid = bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
+                        const bool is_indirect_valid =
+                            bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info));
 
                         // indirect conv2d should be called when:
                         // 1- When the kernel size is greater than 1x1 and less than or equal to 9x9 (81)
                         // 2- When the kernel size is odd
                         // 3- When the Gpu target is Arm Mali-G77
-                        if(is_indirect_valid)
+                        if (is_indirect_valid)
                         {
                             const bool is_kernel_sz_odd = kernel_sz % 2;
                             const bool is_g77           = gpu_target == GPUTarget::G77;
-                            preferred_conv_method       = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 ? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT;
+                            preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77
+                                                        ? ConvolutionMethod::INDIRECT
+                                                        : ConvolutionMethod::DIRECT;
                         }
 
                         // Direct/indirect convolution used for the first layer of the network
-                        if(workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64)
+                        if (workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64)
                         {
                             // In general, the question we should ask for the first convolution layer of a model is:
                             // when is the execution time of im2col + gemm less than direct? Since im2col does not depend on the OFM, it means that
@@ -337,13 +390,13 @@
                             return preferred_conv_method;
                         }
 
-                        if((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16)
+                        if ((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16)
                         {
                             return preferred_conv_method;
                         }
 
                         // Direct convolution used for the last layer of the network
-                        if(is_ofm_lte_8)
+                        if (is_ofm_lte_8)
                         {
                             return preferred_conv_method;
                         }
diff --git a/src/gpu/cl/operators/ClConv2d.h b/src/gpu/cl/operators/ClConv2d.h
index c6c366a..0cf3cbc 100644
--- a/src/gpu/cl/operators/ClConv2d.h
+++ b/src/gpu/cl/operators/ClConv2d.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
+
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
 #include "src/gpu/cl/IClOperator.h"
@@ -112,15 +113,24 @@
      * @param[in]  conv2d_info     Contains convolution 2d info described in @ref Conv2dInfo.
      * @param[in]  weights_info    Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p src.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info,
-                   const WeightsInfo &weights_info = WeightsInfo());
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *weights,
+                   ITensorInfo            *biases,
+                   ITensorInfo            *dst,
+                   const Conv2dInfo       &conv2d_info,
+                   const WeightsInfo      &weights_info = WeightsInfo());
     /** Static function to check if given info will lead to a valid configuration of @ref ClConv2d
      *
      * Similar to ClConv2d::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *weights,
+                           const ITensorInfo *biases,
+                           const ITensorInfo *dst,
+                           const Conv2dInfo  &conv2d_info,
                            const WeightsInfo &weights_info = WeightsInfo());
     /** Static function to check if given info will return the convolution called by @ref ClConv2d
      *
@@ -137,11 +147,15 @@
      *
      * @return the Convolution Method Hint
      */
-    static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
-                                                    const WeightsInfo &weights_info, const GPUTarget gpu_target);
+    static ConvolutionMethod get_convolution_method(const ITensorInfo *src,
+                                                    const ITensorInfo *weights,
+                                                    const ITensorInfo *dst,
+                                                    const Conv2dInfo  &conv2d_info,
+                                                    const WeightsInfo &weights_info,
+                                                    const GPUTarget    gpu_target);
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
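
Since get_convolution_method() is a public static hint, a caller can query it before configuring anything. A hedged usage sketch, assuming the tensor infos and Conv2dInfo are initialised elsewhere:

// Hypothetical helper: ask which kernel family ClConv2d would pick for a
// given problem, using the signature declared above.
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/gpu/cl/operators/ClConv2d.h"

arm_compute::ConvolutionMethod query_conv_hint(const arm_compute::ITensorInfo *src,
                                               const arm_compute::ITensorInfo *weights,
                                               const arm_compute::ITensorInfo *dst,
                                               const arm_compute::Conv2dInfo  &conv2d_info)
{
    const arm_compute::GPUTarget target = arm_compute::CLScheduler::get().target();
    return arm_compute::opencl::ClConv2d::get_convolution_method(src, weights, dst, conv2d_info,
                                                                 arm_compute::WeightsInfo(), target);
}
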
diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
index 08122b6..cf24c68 100644
--- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
+++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp
@@ -23,16 +23,19 @@
  */
 #include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
 {
-void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context,
+                                               const ITensorInfo      *src,
+                                               ITensorInfo            *dst,
+                                               const TensorShape      &original_src_shape,
+                                               DataLayout              data_layout)
 {
     ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout);
     auto k = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>();
@@ -40,9 +43,12 @@
     _kernel = std::move(k);
 }
 
-Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout)
+Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src,
+                                                const ITensorInfo *dst,
+                                                const TensorShape &original_src_shape,
+                                                DataLayout         data_layout)
 {
     return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout);
 }
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h
index 2794eb1..c461520 100644
--- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h
+++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h
@@ -43,14 +43,21 @@
      * @param[in] original_src_shape Shape of the original src tensor (the one entering the fully connected layer).
      * @param[in] data_layout        The data layout the weights have been trained in.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   ITensorInfo            *dst,
+                   const TensorShape      &original_src_shape,
+                   DataLayout              data_layout);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClConvertFullyConnectedWeights::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout);
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *dst,
+                           const TensorShape &original_src_shape,
+                           DataLayout         data_layout);
 };
 } // namespace opencl
 } // namespace arm_compute
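
A hedged usage sketch for the API above, with a hypothetical original shape of 7x7x512 entering the fully connected layer:

// Hypothetical check: would converting these FC weights (trained in NCHW) be valid?
#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"

arm_compute::Status check_weights_conversion(const arm_compute::ITensorInfo *src,
                                             const arm_compute::ITensorInfo *dst)
{
    const arm_compute::TensorShape original_src_shape(7U, 7U, 512U); // shape entering the FC layer
    return arm_compute::opencl::ClConvertFullyConnectedWeights::validate(src, dst, original_src_shape,
                                                                         arm_compute::DataLayout::NCHW);
}
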
diff --git a/src/gpu/cl/operators/ClCopy.cpp b/src/gpu/cl/operators/ClCopy.cpp
index d3b8304..e2be7ce 100644
--- a/src/gpu/cl/operators/ClCopy.cpp
+++ b/src/gpu/cl/operators/ClCopy.cpp
@@ -23,11 +23,10 @@
  */
 #include "src/gpu/cl/operators/ClCopy.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClCopyKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
@@ -45,4 +44,4 @@
     return kernels::ClCopyKernel::validate(src, dst, dst_window);
 }
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClCopy.h b/src/gpu/cl/operators/ClCopy.h
index 9b427f9..fe9b58c 100644
--- a/src/gpu/cl/operators/ClCopy.h
+++ b/src/gpu/cl/operators/ClCopy.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_COPY_H
 
 #include "arm_compute/core/Window.h"
+
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
 
@@ -44,7 +45,10 @@
      * @param[in]  dst_window      (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr.
      *
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr);
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   Window                 *dst_window = nullptr);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClCopy::configure()
diff --git a/src/gpu/cl/operators/ClCrop.cpp b/src/gpu/cl/operators/ClCrop.cpp
index cef9f14..6313e4f 100644
--- a/src/gpu/cl/operators/ClCrop.cpp
+++ b/src/gpu/cl/operators/ClCrop.cpp
@@ -23,17 +23,22 @@
  */
 #include "src/gpu/cl/operators/ClCrop.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClCropKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
 {
-void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
-                       Window *dst_window)
+void ClCrop::configure(const ClCompileContext &compile_context,
+                       const ITensorInfo      *src,
+                       ITensorInfo            *dst,
+                       Coordinates2D           start,
+                       Coordinates2D           end,
+                       uint32_t                batch_index,
+                       float                   extrapolation_value,
+                       Window                 *dst_window)
 {
     ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window);
     auto k = std::make_unique<kernels::ClCropKernel>();
@@ -41,9 +46,15 @@
     _kernel = std::move(k);
 }
 
-Status ClCrop::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
+Status ClCrop::validate(const ITensorInfo *src,
+                        const ITensorInfo *dst,
+                        Coordinates2D      start,
+                        Coordinates2D      end,
+                        uint32_t           batch_index,
+                        float              extrapolation_value,
+                        Window            *dst_window)
 {
     return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window);
 }
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClCrop.h b/src/gpu/cl/operators/ClCrop.h
index 1cf1c9b..e845cf3 100644
--- a/src/gpu/cl/operators/ClCrop.h
+++ b/src/gpu/cl/operators/ClCrop.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_CROP_H
 
 #include "arm_compute/core/Window.h"
+
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
 
@@ -49,16 +50,27 @@
      * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
      * @param[in]  dst_window          Output window to be used in case cropped image is being copied into a tensor. Default is nullptr.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
-                   Window *dst_window = nullptr);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src,
+                   ITensorInfo            *dst,
+                   Coordinates2D           start,
+                   Coordinates2D           end,
+                   uint32_t                batch_index,
+                   float                   extrapolation_value = 0,
+                   Window                 *dst_window          = nullptr);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClCrop::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
-                           Window *dst_window = nullptr);
+    static Status validate(const ITensorInfo *src,
+                           const ITensorInfo *dst,
+                           Coordinates2D      start,
+                           Coordinates2D      end,
+                           uint32_t           batch_index,
+                           float              extrapolation_value = 0,
+                           Window            *dst_window          = nullptr);
 };
 } // namespace opencl
 } // namespace arm_compute
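
A hedged usage sketch for the defaults documented above: validating a crop of batch element 0 between two hypothetical corners, relying on the default extrapolation value (0) and the default full-tensor output window (nullptr):

// Hypothetical crop check using the declaration above.
#include "src/gpu/cl/operators/ClCrop.h"

arm_compute::Status check_crop(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst)
{
    const arm_compute::Coordinates2D start{4, 4};  // hypothetical top-left corner
    const arm_compute::Coordinates2D end{68, 68};  // hypothetical bottom-right corner
    return arm_compute::opencl::ClCrop::validate(src, dst, start, end, 0U /* batch_index */);
}
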
diff --git a/src/gpu/cl/operators/ClDequantize.cpp b/src/gpu/cl/operators/ClDequantize.cpp
index 0fccab6..eb6f9e7 100644
--- a/src/gpu/cl/operators/ClDequantize.cpp
+++ b/src/gpu/cl/operators/ClDequantize.cpp
@@ -25,10 +25,10 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/gpu/cl/ClCompileContext.h"
-#include "src/gpu/cl/kernels/ClDequantizeKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClDequantizeKernel.h"
 
 namespace arm_compute
 {
diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp
index 0215dba..17a196c 100644
--- a/src/gpu/cl/operators/ClDirectConv2d.cpp
+++ b/src/gpu/cl/operators/ClDirectConv2d.cpp
@@ -26,6 +26,8 @@
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/gpu/cl/kernels/ClActivationKernel.h"
@@ -35,8 +37,6 @@
 #include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h"
 #include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h"
 
-#include "src/common/utils/Log.h"
-
 using namespace arm_compute::cl_direct_conv;
 
 namespace arm_compute
@@ -53,7 +53,8 @@
     return pack;
 }
 
-DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo
+config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
 {
     // Get GPU target
     GPUTarget gpu_target = CLScheduler::get().target();
@@ -65,8 +66,13 @@
 
 } // namespace
 
-void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
-                               const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void ClDirectConv2d::configure(const CLCompileContext    &compile_context,
+                               ITensorInfo               *src,
+                               ITensorInfo               *weights,
+                               ITensorInfo               *biases,
+                               ITensorInfo               *dst,
+                               const PadStrideInfo       &conv_info,
+                               const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info);
@@ -75,15 +81,17 @@
     const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info);
 
     // Configure direct convolution kernel
-    const ActivationLayerInfo conv2d_act_info = (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info : ActivationLayerInfo();
-    auto                      k               = std::make_unique<kernels::ClDirectConv2dKernel>();
+    const ActivationLayerInfo conv2d_act_info =
+        (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info
+                                                                                         : ActivationLayerInfo();
+    auto k = std::make_unique<kernels::ClDirectConv2dKernel>();
     k->set_target(CLScheduler::get().target());
     k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info, desc);
     _direct_conv_kernel = std::move(k);
 
     // Configure border handler
     PixelValue zero_value(0.f);
-    if(is_data_type_quantized_asymmetric(src->data_type()))
+    if (is_data_type_quantized_asymmetric(src->data_type()))
     {
         zero_value = PixelValue(0, src->data_type(), src->quantization_info());
     }
@@ -92,7 +100,7 @@
     _src_border_handler = std::move(b);
 
     // Fused activation is currently supported for NHWC and floating point types
-    if(act_info.enabled() && !conv2d_act_info.enabled())
+    if (act_info.enabled() && !conv2d_act_info.enabled())
     {
         auto a = std::make_unique<kernels::ClActivationKernel>();
         a->configure(compile_context, dst, dst, act_info);
@@ -103,14 +111,19 @@
     CLScheduler::get().tune_kernel_static(*_direct_conv_kernel);
 }
 
-Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+Status ClDirectConv2d::validate(const ITensorInfo         *src,
+                                const ITensorInfo         *weights,
+                                const ITensorInfo         *biases,
+                                const ITensorInfo         *dst,
+                                const PadStrideInfo       &conv_info,
+                                const ActivationLayerInfo &act_info)
 {
     // Initialize the direct convolution descriptor
     const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc));
-    if(act_info.enabled())
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc));
+    if (act_info.enabled())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info));
     }
@@ -124,7 +137,7 @@
     // Run direct convolution
     CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false);
     // Run activation kernel
-    if(_activation_kernel)
+    if (_activation_kernel)
     {
         auto act_pack = select_activation_src_dst(tensors);
         CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false);
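
The fusion rule in configure() above is worth restating: the activation folds into the direct convolution kernel only for NHWC floating-point tensors, and otherwise a separate ClActivationKernel is enqueued after the convolution, as run() shows. A minimal sketch of that predicate, using F16/F32 as a stand-in for the library's floating-point check:

// Hedged restatement of the fusion condition used in configure() above.
#include "arm_compute/core/Types.h"

bool can_fuse_activation(arm_compute::DataLayout layout, arm_compute::DataType type)
{
    const bool is_float = (type == arm_compute::DataType::F32) || (type == arm_compute::DataType::F16);
    return layout == arm_compute::DataLayout::NHWC && is_float;
}
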
diff --git a/src/gpu/cl/operators/ClDirectConv2d.h b/src/gpu/cl/operators/ClDirectConv2d.h
index fedb9e9..0f18490 100644
--- a/src/gpu/cl/operators/ClDirectConv2d.h
+++ b/src/gpu/cl/operators/ClDirectConv2d.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_DIRECT_CONV2D_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClKernel.h"
 #include "src/gpu/cl/IClOperator.h"
@@ -59,7 +60,12 @@
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      *
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src,
+                   ITensorInfo               *weights,
+                   ITensorInfo               *biases,
+                   ITensorInfo               *dst,
+                   const PadStrideInfo       &conv_info,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
@@ -67,16 +73,20 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *dst,
+                           const PadStrideInfo       &conv_info,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited method overridden
     void run(ITensorPack &tensors) override;
 
 private:
-    std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr };
-    std::unique_ptr<IClKernel> _src_border_handler{ nullptr };
-    std::unique_ptr<IClKernel> _activation_kernel{ nullptr };
+    std::unique_ptr<IClKernel> _direct_conv_kernel{nullptr};
+    std::unique_ptr<IClKernel> _src_border_handler{nullptr};
+    std::unique_ptr<IClKernel> _activation_kernel{nullptr};
 };
 } // namespace opencl
 } // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClDirectConv3d.cpp b/src/gpu/cl/operators/ClDirectConv3d.cpp
index 5d37f07..b083479 100644
--- a/src/gpu/cl/operators/ClDirectConv3d.cpp
+++ b/src/gpu/cl/operators/ClDirectConv3d.cpp
@@ -24,13 +24,19 @@
 #include "src/gpu/cl/operators/ClDirectConv3d.h"
 
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include "src/gpu/cl/kernels/ClDirectConv3dKernel.h"
 
 namespace arm_compute
 {
 namespace opencl
 {
-void ClDirectConv3d::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info)
+void ClDirectConv3d::configure(const CLCompileContext &compile_context,
+                               const ITensorInfo      *src0,
+                               const ITensorInfo      *src1,
+                               const ITensorInfo      *src2,
+                               ITensorInfo            *dst,
+                               const Conv3dInfo       &conv3d_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0);
 
@@ -40,7 +46,11 @@
     _direct_conv3d_kernel = std::move(k);
 }
 
-Status ClDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info)
+Status ClDirectConv3d::validate(const ITensorInfo *src0,
+                                const ITensorInfo *src1,
+                                const ITensorInfo *src2,
+                                const ITensorInfo *dst,
+                                const Conv3dInfo  &conv3d_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv3dKernel::validate(src0, src1, src2, dst, conv3d_info));
     return Status{};
diff --git a/src/gpu/cl/operators/ClDirectConv3d.h b/src/gpu/cl/operators/ClDirectConv3d.h
index fa58b5a..5fb3246 100644
--- a/src/gpu/cl/operators/ClDirectConv3d.h
+++ b/src/gpu/cl/operators/ClDirectConv3d.h
@@ -67,7 +67,12 @@
      * @param[in]  conv3d_info     Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused.
      *
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *src0,
+                   const ITensorInfo      *src1,
+                   const ITensorInfo      *src2,
+                   ITensorInfo            *dst,
+                   const Conv3dInfo       &conv3d_info);
 
     /** Static function to check if given info will lead to a valid configuration
      *
@@ -75,14 +80,18 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info);
+    static Status validate(const ITensorInfo *src0,
+                           const ITensorInfo *src1,
+                           const ITensorInfo *src2,
+                           const ITensorInfo *dst,
+                           const Conv3dInfo  &conv3d_info);
 
     // Inherited method overridden
     void run(ITensorPack &tensors) override;
 
 private:
-    std::unique_ptr<IClKernel> _direct_conv3d_kernel{ nullptr };
+    std::unique_ptr<IClKernel> _direct_conv3d_kernel{nullptr};
 };
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */
diff --git a/src/gpu/cl/operators/ClElementwiseOperations.cpp b/src/gpu/cl/operators/ClElementwiseOperations.cpp
index 32d2b88..1325371 100644
--- a/src/gpu/cl/operators/ClElementwiseOperations.cpp
+++ b/src/gpu/cl/operators/ClElementwiseOperations.cpp
@@ -23,15 +23,18 @@
  */
 #include "src/gpu/cl/operators/ClElementwiseOperations.h"
 
-#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
 namespace arm_compute
 {
 namespace opencl
 {
-void ClElementwiseDivision::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClElementwiseDivision::configure(const ClCompileContext    &compile_context,
+                                      ITensorInfo               *src1,
+                                      ITensorInfo               *src2,
+                                      ITensorInfo               *dst,
+                                      const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
     auto k = std::make_unique<kernels::ClArithmeticKernel>();
@@ -39,12 +42,19 @@
     _kernel = std::move(k);
 }
 
-Status ClElementwiseDivision::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClElementwiseDivision::validate(const ITensorInfo         *src1,
+                                       const ITensorInfo         *src2,
+                                       const ITensorInfo         *dst,
+                                       const ActivationLayerInfo &act_info)
 {
     return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info);
 }
 
-void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClElementwiseMax::configure(const ClCompileContext    &compile_context,
+                                 ITensorInfo               *src1,
+                                 ITensorInfo               *src2,
+                                 ITensorInfo               *dst,
+                                 const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
     auto k = std::make_unique<kernels::ClArithmeticKernel>();
@@ -52,12 +62,19 @@
     _kernel = std::move(k);
 }
 
-Status ClElementwiseMax::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClElementwiseMax::validate(const ITensorInfo         *src1,
+                                  const ITensorInfo         *src2,
+                                  const ITensorInfo         *dst,
+                                  const ActivationLayerInfo &act_info)
 {
     return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info);
 }
 
-void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClElementwiseMin::configure(const ClCompileContext    &compile_context,
+                                 ITensorInfo               *src1,
+                                 ITensorInfo               *src2,
+                                 ITensorInfo               *dst,
+                                 const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
     auto k = std::make_unique<kernels::ClArithmeticKernel>();
@@ -65,12 +82,19 @@
     _kernel = std::move(k);
 }
 
-Status ClElementwiseMin::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClElementwiseMin::validate(const ITensorInfo         *src1,
+                                  const ITensorInfo         *src2,
+                                  const ITensorInfo         *dst,
+                                  const ActivationLayerInfo &act_info)
 {
     return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, act_info);
 }
 
-void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClElementwiseSquaredDiff::configure(const ClCompileContext    &compile_context,
+                                         ITensorInfo               *src1,
+                                         ITensorInfo               *src2,
+                                         ITensorInfo               *dst,
+                                         const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
     auto k = std::make_unique<kernels::ClArithmeticKernel>();
@@ -78,12 +102,19 @@
     _kernel = std::move(k);
 }
 
-Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClElementwiseSquaredDiff::validate(const ITensorInfo         *src1,
+                                          const ITensorInfo         *src2,
+                                          const ITensorInfo         *dst,
+                                          const ActivationLayerInfo &act_info)
 {
     return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
 }
 
-void ClElementwisePower::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClElementwisePower::configure(const ClCompileContext    &compile_context,
+                                   ITensorInfo               *src1,
+                                   ITensorInfo               *src2,
+                                   ITensorInfo               *dst,
+                                   const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info);
     auto k = std::make_unique<kernels::ClArithmeticKernel>();
@@ -91,7 +122,10 @@
     _kernel = std::move(k);
 }
 
-Status ClElementwisePower::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClElementwisePower::validate(const ITensorInfo         *src1,
+                                    const ITensorInfo         *src2,
+                                    const ITensorInfo         *dst,
+                                    const ActivationLayerInfo &act_info)
 {
     return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info);
 }
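
All five wrappers in this file share one shape: each forwards to ClArithmeticKernel with a different ArithmeticOperation tag. A hedged usage sketch for one of them, relying on the default disabled activation:

// Hypothetical validation call for the element-wise max wrapper above.
#include "src/gpu/cl/operators/ClElementwiseOperations.h"

arm_compute::Status check_max(const arm_compute::ITensorInfo *a,
                              const arm_compute::ITensorInfo *b,
                              const arm_compute::ITensorInfo *out)
{
    return arm_compute::opencl::ClElementwiseMax::validate(a, b, out); // act_info defaults to disabled
}
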
diff --git a/src/gpu/cl/operators/ClElementwiseOperations.h b/src/gpu/cl/operators/ClElementwiseOperations.h
index 120049c..de7c018 100644
--- a/src/gpu/cl/operators/ClElementwiseOperations.h
+++ b/src/gpu/cl/operators/ClElementwiseOperations.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
 
@@ -48,14 +49,21 @@
      * @param[out] dst             Destination tensor info. Data types supported: same as @p src1.
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClElementwiseDivision::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 };
 
 /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max
@@ -74,14 +82,21 @@
      * @param[out] dst             Destination tensor info. Data types supported: same as @p src1.
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClElementwiseMax::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 };
 
 /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min
@@ -100,14 +115,21 @@
      * @param[out] dst             Destination tensor info. Data types supported: same as @p src1.
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClElementwiseMin::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 };
 
 /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference
@@ -126,14 +148,21 @@
      * @param[out] dst             Destination tensor info. Data types supported: same as @p src1.
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClElementwiseSquaredDiff::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 };
 
 /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power
@@ -152,14 +181,21 @@
      * @param[out] dst             Destination tensor info. Data types supported: F16/F32.
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClElementwisePower::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 };
 } // namespace opencl
 } // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClElementwiseUnary.cpp b/src/gpu/cl/operators/ClElementwiseUnary.cpp
index f94d402..9146211 100644
--- a/src/gpu/cl/operators/ClElementwiseUnary.cpp
+++ b/src/gpu/cl/operators/ClElementwiseUnary.cpp
@@ -23,9 +23,8 @@
  */
 #include "src/gpu/cl/operators/ClElementwiseUnary.h"
 
-#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
 
 namespace arm_compute
 {
diff --git a/src/gpu/cl/operators/ClFill.cpp b/src/gpu/cl/operators/ClFill.cpp
index ad22b15..817b15a 100644
--- a/src/gpu/cl/operators/ClFill.cpp
+++ b/src/gpu/cl/operators/ClFill.cpp
@@ -23,16 +23,18 @@
  */
 #include "src/gpu/cl/operators/ClFill.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClFillKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
 {
-void ClFill::configure(const ClCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window)
+void ClFill::configure(const ClCompileContext &compile_context,
+                       ITensorInfo            *tensor,
+                       const PixelValue       &constant_value,
+                       Window                 *dst_window)
 {
     ARM_COMPUTE_LOG_PARAMS(tensor, constant_value, dst_window);
     auto k = std::make_unique<kernels::ClFillKernel>();
@@ -45,4 +47,4 @@
     return kernels::ClFillKernel::validate(tensor, constant_value, dst_window);
 }
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClFill.h b/src/gpu/cl/operators/ClFill.h
index 3bbe27e..e13862a 100644
--- a/src/gpu/cl/operators/ClFill.h
+++ b/src/gpu/cl/operators/ClFill.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
 
@@ -44,7 +45,10 @@
      * @param[in]     constant_value  The value used to fill the planes of the tensor
      * @param[in]     window          Window to be used in case setting only part of a tensor. Default is nullptr.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *tensor,
+                   const PixelValue       &constant_value,
+                   Window                 *window = nullptr);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClFill::configure()
diff --git a/src/gpu/cl/operators/ClFlatten.cpp b/src/gpu/cl/operators/ClFlatten.cpp
index e277c0d..7532532 100644
--- a/src/gpu/cl/operators/ClFlatten.cpp
+++ b/src/gpu/cl/operators/ClFlatten.cpp
@@ -23,11 +23,10 @@
  */
 #include "src/gpu/cl/operators/ClFlatten.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClReshapeKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
diff --git a/src/gpu/cl/operators/ClFloor.cpp b/src/gpu/cl/operators/ClFloor.cpp
index 84f685e..6790160 100644
--- a/src/gpu/cl/operators/ClFloor.cpp
+++ b/src/gpu/cl/operators/ClFloor.cpp
@@ -23,11 +23,10 @@
  */
 #include "src/gpu/cl/operators/ClFloor.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClFloorKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp
index 5845bbc..6969ac8 100644
--- a/src/gpu/cl/operators/ClFullyConnected.cpp
+++ b/src/gpu/cl/operators/ClFullyConnected.cpp
@@ -24,12 +24,13 @@
 #include "src/gpu/cl/operators/ClFullyConnected.h"
 
 #include "arm_compute/core/Size2D.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
+#include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
 #include "src/gpu/cl/operators/ClFlatten.h"
@@ -38,11 +39,8 @@
 #include "src/gpu/cl/operators/ClMatMul.h"
 #include "src/gpu/cl/operators/ClTranspose.h"
 #include "src/gpu/cl/utils/ClAuxTensorHandler.h"
-
 #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h"
 #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h"
-
-#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 
 #include <algorithm>
@@ -62,8 +60,11 @@
     return TensorShape(src.x(), 1, src.y(), src.collapsed_from(2).z()); // Return value optimisation
 }
 
-Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo &dst,
-                                       GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info)
+Status construct_gemmlowp_output_stage(const ITensorInfo       &src,
+                                       const ITensorInfo       &weights,
+                                       const ITensorInfo       &dst,
+                                       GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+                                       ActivationLayerInfo      activation_info)
 {
     gemmlowp_output_stage.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
     gemmlowp_output_stage.gemmlowp_offset     = 0;
@@ -73,7 +74,7 @@
     const auto data_type = src.data_type();
 
     // Configure output stage for quantized case
-    if(is_data_type_quantized_asymmetric(data_type))
+    if (is_data_type_quantized_asymmetric(data_type))
     {
         const QuantizationInfo        oq_info = dst.quantization_info();
         const UniformQuantizationInfo iq_unif = src.quantization_info().uniform();
@@ -85,15 +86,17 @@
         const float multiplier        = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale;
         int         output_multiplier = 0;
         int         output_shift      = 0;
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
 
         PixelValue type_min{};
         PixelValue type_max{};
         std::tie(type_min, type_max) = get_min_max(data_type);
 
-        if(activation_info.enabled())
+        if (activation_info.enabled())
         {
-            std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info);
+            std::tie(type_min, type_max) =
+                get_quantized_activation_min_max(activation_info, data_type, output_quant_info);
         }
 
         // Set the GEMMLowp output stage info
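
The arithmetic above computes multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale and hands it to calculate_quantized_multiplier(), which splits the real value into an integer multiplier and a shift. A simplified sketch of that decomposition, assuming 0 < multiplier < 1 (the library version also handles the remaining edge cases):

// Split a real multiplier into a Q0.31 integer and a right shift, so that
// multiplier ~= q_fixed * 2^-31 * 2^-right_shift.
#include <cmath>
#include <cstdint>

void decompose_multiplier(double multiplier, int32_t &q_fixed_out, int &right_shift)
{
    int    exponent = 0;
    double q        = std::frexp(multiplier, &exponent); // multiplier = q * 2^exponent, q in [0.5, 1)
    auto   q_fixed  = static_cast<int64_t>(std::llround(q * (1ll << 31)));
    if (q_fixed == (1ll << 31)) // rounding pushed q up to exactly 1.0
    {
        q_fixed /= 2;
        ++exponent;
    }
    q_fixed_out = static_cast<int32_t>(q_fixed);
    right_shift = -exponent; // e.g. multiplier = 0.0072 -> q ~= 0.9216, right_shift = 7
}
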
@@ -109,31 +112,41 @@
     return Status{};
 }
 
-Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &dst, const FullyConnectedLayerInfo &fc_info, bool use_matmul)
+Status validate_mm(const ITensorInfo             &src,
+                   const ITensorInfo             &weights,
+                   const ITensorInfo             *bias,
+                   const ITensorInfo             &dst,
+                   const FullyConnectedLayerInfo &fc_info,
+                   bool                           use_matmul)
 {
     // Note: If input is dynamic and data is not batched, use matmul, else use gemm
     const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
-    const bool use_dynamic_gemm  = !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul
-    const bool is_quantized      = is_data_type_quantized_asymmetric(src.data_type());
+    const bool use_dynamic_gemm =
+        !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul
+    const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type());
 
-    if(use_matmul)
+    if (use_matmul)
     {
         const MatMulInfo m_info = MatMulInfo().adj_rhs(transpose_weights);
 
         // Note: LHS is reshaped here to match ClMatMul expectations of batch index - From [M, B0, B1] to [M, 1, B0, B1]
         TensorInfo lhs_to_use = src.clone()->set_tensor_shape(get_reshaped_matmul_tensor(src.tensor_shape()));
 
-        const GPUTarget                                         gpu_target  = CLScheduler::get().target();
-        std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> t           = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
-        const MatMulKernelInfo                                  kernel_info = t->configure(&lhs_to_use, &weights, m_info);
+        const GPUTarget                                         gpu_target = CLScheduler::get().target();
+        std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> t =
+            cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+        const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info);
 
-        return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info) :
-               kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info);
+        return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst,
+                                                                          kernel_info, fc_info.activation_info)
+                            : kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info,
+                                                                      fc_info.activation_info);
     }
     else
     {
         GEMMLowpOutputStageInfo gemmlowp_output_stage;
-        ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info));
 
         const GEMMInfo &gemm_info = GEMMInfo(false,                           // is_a_reshaped
                                              false,                           // is_b_reshaped
@@ -147,7 +160,7 @@
                                              true,                            // broadcast_bias
                                              ActivationLayerInfo());          // activation_info
 
-        if(is_quantized)
+        if (is_quantized)
         {
             const UniformQuantizationInfo iq_info = src.quantization_info().uniform();
             const UniformQuantizationInfo wq_info = weights.quantization_info().uniform();
@@ -158,11 +171,9 @@
             const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset);
 
             // Validate gemmlowp function
-            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(&src.clone()->set_quantization_info(src_quantization_info),
-                                                                               &weights.clone()->set_quantization_info(weights_quantization_info),
-                                                                               bias,
-                                                                               &dst,
-                                                                               gemm_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(
+                &src.clone()->set_quantization_info(src_quantization_info),
+                &weights.clone()->set_quantization_info(weights_quantization_info), bias, &dst, gemm_info));
         }
         else
         {
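
The quantized branch above clones src and weights with negated offsets before handing them to ClGemmLowpMatrixMultiplyCore. The transformation itself is tiny; a hedged illustration with hypothetical numbers:

// With asymmetric quantization x_f = scale * (x_q - offset), the GEMMLowp core
// expects the offsets pre-negated, which is what the clones above carry.
#include "arm_compute/core/QuantizationInfo.h"

arm_compute::QuantizationInfo negate_offset(const arm_compute::QuantizationInfo &q)
{
    const arm_compute::UniformQuantizationInfo u = q.uniform(); // e.g. scale = 0.05f, offset = 10
    return arm_compute::QuantizationInfo(u.scale, -u.offset);   // offset becomes -10
}
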
@@ -188,11 +199,15 @@
 
 ClFullyConnected::~ClFullyConnected() = default;
 
-void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
+void ClFullyConnected::configure_mm(const CLCompileContext        &compile_context,
+                                    ITensorInfo                   *src,
+                                    ITensorInfo                   *weights,
+                                    ITensorInfo                   *bias,
+                                    ITensorInfo                   *dst,
                                     const FullyConnectedLayerInfo &fc_info)
 {
     // If weights are dynamic and matmul is supported use matmul, else use gemm
-    if(_use_matmul)
+    if (_use_matmul)
     {
         // Specify whether transpose weights is necessary in matmul info
         const MatMulInfo mat_info = MatMulInfo().adj_rhs(_transpose_weights);
@@ -202,22 +217,25 @@
         _lhs_to_use = src->clone()->set_tensor_shape(get_reshaped_matmul_tensor(_lhs_to_use.tensor_shape()));
 
         // 2. Use heuristics to get kernel info object
-        const GPUTarget                                         gpu_target    = CLScheduler::get().target();
-        std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
-        MatMulKernelInfo                                        kernel_info   = kernel_config->configure(src, weights, mat_info);
+        const GPUTarget                                         gpu_target = CLScheduler::get().target();
+        std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config =
+            cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target);
+        MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info);
 
         // 3. Configure relevant matmul kernel
-        if(_is_quantized)
+        if (_is_quantized)
         {
             _matmul_lowp_native_kernel = std::make_unique<kernels::ClMatMulLowpNativeKernel>();
             _matmul_lowp_native_kernel->set_target(gpu_target);
-            _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info);
+            _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info,
+                                                  fc_info.activation_info);
         }
         else
         {
             _matmul_native_kernel = std::make_unique<kernels::ClMatMulNativeKernel>();
             _matmul_native_kernel->set_target(gpu_target);
-            _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info);
+            _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info,
+                                             fc_info.activation_info);
         }
     }
     else
@@ -238,7 +256,7 @@
                                              true,                            // broadcast_bias
                                              fc_info.activation_info);        // activation_info
 
-        if(_is_quantized)
+        if (_is_quantized)
         {
             // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
             // Extract and negate input and weights offset
@@ -248,8 +266,10 @@
             TensorInfo src_info     = src->clone()->set_quantization_info(src_quantization_info);
             TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info);
 
-            src_info.set_quantization_info(QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset));
-            weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+            src_info.set_quantization_info(
+                QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset));
+            weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale,
+                                                                -weights_quantization_info.uniform().offset));
 
             // Configure gemmlowp function
             _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
@@ -264,16 +284,25 @@
     }
 }
 
-void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
+void ClFullyConnected::configure_conv_fc(const CLCompileContext        &compile_context,
+                                         ITensorInfo                   *src,
+                                         ITensorInfo                   *weights,
+                                         ITensorInfo                   *bias,
+                                         ITensorInfo                   *dst,
                                          const FullyConnectedLayerInfo &fc_info)
 {
     // MatMul fuses the transpose operation, so we use the first dimension for comparison where appropriate.
-    ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+    ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) !=
+                          (src->dimension(0) * src->dimension(1) * src->dimension(2))));
 
     // If the fully connected layer is called after a convolution layer, the input tensor must be linearized
 
     // Initialize output tensor for flatten
-    _flattened_src = src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW);
+    _flattened_src = src->clone()
+                         ->set_is_resizable(true)
+                         .reset_padding()
+                         .set_tensor_shape(compute_flatten_shape(src))
+                         .set_data_layout(DataLayout::NCHW);
 
     // Configure flatten kernel
     _flatten = std::make_unique<ClFlatten>();
@@ -284,7 +313,11 @@
     configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info);
 }
 
-void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst,
+void ClFullyConnected::configure_fc_fc(const CLCompileContext        &compile_context,
+                                       ITensorInfo                   *src,
+                                       ITensorInfo                   *weights,
+                                       ITensorInfo                   *bias,
+                                       ITensorInfo                   *dst,
                                        const FullyConnectedLayerInfo &fc_info)
 {
     // MatMul fuses the transpose operation, so we use the first dimension for comparison where appropriate.
@@ -294,7 +327,11 @@
     configure_mm(compile_context, src, weights, bias, dst, fc_info);
 }
 
-void ClFullyConnected::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+void ClFullyConnected::configure(const CLCompileContext &compile_context,
+                                 ITensorInfo            *src,
+                                 ITensorInfo            *weights,
+                                 ITensorInfo            *biases,
+                                 ITensorInfo            *dst,
                                  FullyConnectedLayerInfo fc_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
@@ -317,8 +354,9 @@
     // 2. MatMul does not support broadcasting the batch dimension, and is therefore disabled if the fc is batched.
     // 3. When FC is after convolution and the src tensor data layout does not match the weights' trained data layout (a weights conversion kernel is required)
     const bool is_batched_fc_layer = dst->dimension(1) > 1;
-    _use_matmul                    = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout));
-    _dynamic_gemm                  = !weights->are_values_constant() && _transpose_weights && !_use_matmul;
+    _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer &&
+                  !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout));
+    _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul;
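+    // _dynamic_gemm marks the dynamic-weights GEMM path; when set, prepare() re-runs on every execution (see prepare())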
 
     // With the Fully Connected layer we can have 4 different cases:
     //  1) Convolution layer -> Fully Connected layer without batches
@@ -327,11 +365,11 @@
     //  4) Fully Connected layer -> Fully Connected layer with batches
 
     // Check if we have a fully connected layer with batches
-    if(is_batched_fc_layer)
+    if (is_batched_fc_layer)
     {
-        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
-                                                                                  src->tensor_shape().cend(),
-                                                                                  dst->tensor_shape().cbegin() + 1));
+        _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                            (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+                                        dst->tensor_shape().cbegin() + 1));
     }
     else
     {
@@ -341,7 +379,7 @@
     ITensorInfo *weights_used = weights;
 
     // Reshape weights if needed - not needed when matmul is in use, as matmul fuses the transpose op.
-    if(_transpose_weights && !_use_matmul)
+    if (_transpose_weights && !_use_matmul)
     {
         // Reshape the weights
         _reshape_weights = std::make_unique<ClTranspose>();
@@ -351,14 +389,11 @@
     }
 
     // Convert weights if needed
-    if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+    if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
     {
         // Convert weights
         _convert_weights = std::make_unique<ClConvertFullyConnectedWeights>();
-        _convert_weights->configure(compile_context,
-                                    weights_used,
-                                    &_converted_weights,
-                                    src->tensor_shape(),
+        _convert_weights->configure(compile_context, weights_used, &_converted_weights, src->tensor_shape(),
                                     fc_info.weights_trained_layout);
 
         weights_used         = &_converted_weights;
@@ -366,7 +401,7 @@
         _run_convert_weights = true;
     }
 
-    if(_is_fc_after_conv)
+    if (_is_fc_after_conv)
     {
         // Fully Connected layer after a Convolution Layer without batches
         configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info);
@@ -379,60 +414,69 @@
     // Update TensorInfo of final weights used (needs to be done at the end due to padding expansion)
     _weights_to_use = *weights_used;
 
-    if(_use_matmul)
+    if (_use_matmul)
     {
         // Note: MatMul does not use transpose and does not need auxiliary memory, so only converted weights are added to aux_mem
-        _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size());
+        _aux_mem[ConvertedWeights] =
+            MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size());
     }
     else
     {
         // Set auxiliary memory requirements for gemm operators
         auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace();
-        for(unsigned int i = 0; i < gemm_mem_req.size(); ++i)
+        for (unsigned int i = 0; i < gemm_mem_req.size(); ++i)
         {
             _aux_mem[i] = gemm_mem_req[i];
         }
-        if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs
+        if (_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs
         {
             // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
             // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time
             _aux_mem[TransposedWeights] = MemoryInfo(
-                                              offset_int_vec(TransposedWeights),
-                                              _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
-                                              _reshaped_weights.total_size());
-            _aux_mem[ConvertedWeights] = MemoryInfo(
-                                             offset_int_vec(ConvertedWeights),
-                                             _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
-                                             _converted_weights.total_size());
+                offset_int_vec(TransposedWeights), _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+                _reshaped_weights.total_size());
+            _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights),
+                                                    _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare,
+                                                    _converted_weights.total_size());
         }
         else
         {
             // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch
-            const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
-            const auto converted_wei_lft  = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare;
+            const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights))
+                                                ? MemoryLifetime::Persistent
+                                                : MemoryLifetime::Prepare;
+            const auto converted_wei_lft  = (_weights_to_use_idx == offset_int_vec(ConvertedWeights))
+                                                ? MemoryLifetime::Persistent
+                                                : MemoryLifetime::Prepare;
 
-            _aux_mem[TransposedWeights] = MemoryInfo(
-                                              offset_int_vec(TransposedWeights),
-                                              _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft,
-                                              _reshaped_weights.total_size());
-            _aux_mem[ConvertedWeights] = MemoryInfo(
-                                             offset_int_vec(ConvertedWeights),
-                                             _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft,
-                                             _converted_weights.total_size());
+            _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights),
+                                                     _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft,
+                                                     _reshaped_weights.total_size());
+            _aux_mem[ConvertedWeights]  = MemoryInfo(offset_int_vec(ConvertedWeights),
+                                                     _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft,
+                                                     _converted_weights.total_size());
         }
     }
-    _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
+    _aux_mem[FlattenedSrc] =
+        MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size());
 }
 
-Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+Status ClFullyConnected::validate(const ITensorInfo      *src,
+                                  const ITensorInfo      *weights,
+                                  const ITensorInfo      *biases,
+                                  const ITensorInfo      *dst,
                                   FullyConnectedLayerInfo fc_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU
-                                && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) &&
+        fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU &&
+        fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU &&
+        fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
     const GPUTarget gpu_target = get_arch_from_target(CLScheduler::get().target());
 
     const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false;
@@ -441,11 +485,20 @@
     // When using dynamic weights, use matmul kernels.
     // Note: MatMul does not support broadcasting, so fall back for batched cases.
     const bool is_batched_fc_layer = dst->dimension(1) > 1;
-    const bool use_matmul          = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout));
+    const bool use_matmul          = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() &&
+                            !is_batched_fc_layer &&
+                            !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout));
 
-    const ITensorInfo &flatten_src       = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW));
-    const ITensorInfo &reshaped_weights  = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
-    const ITensorInfo &converted_weights = (transpose_weights && !use_matmul) ? TensorInfo(*reshaped_weights.clone()) : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding());
+    const ITensorInfo &flatten_src      = TensorInfo(src->clone()
+                                                         ->set_is_resizable(true)
+                                                         .reset_padding()
+                                                         .set_tensor_shape(compute_flatten_shape(src))
+                                                         .set_data_layout(DataLayout::NCHW));
+    const ITensorInfo &reshaped_weights = TensorInfo(
+        weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights)));
+    const ITensorInfo &converted_weights = (transpose_weights && !use_matmul)
+                                               ? TensorInfo(*reshaped_weights.clone())
+                                               : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding());
 
     // With the Fully Connected layer we can have 4 different cases:
     //  1) Convolution layer -> Fully Connected layer without batches
@@ -456,10 +509,10 @@
     const ITensorInfo *src_to_use     = src;
     const ITensorInfo *weights_to_use = weights;
 
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
-        if(is_data_type_quantized(src->data_type()))
+        if (is_data_type_quantized(src->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
@@ -470,11 +523,11 @@
     }
 
     // Check if FC is after conv (the flatten kernel is run when FC is after conv)
-    if(is_batched_fc_layer)
+    if (is_batched_fc_layer)
     {
-        is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3,
-                                                                                 src->tensor_shape().cend(),
-                                                                                 dst->tensor_shape().cbegin() + 1));
+        is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) &&
+                           (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(),
+                                       dst->tensor_shape().cbegin() + 1));
     }
     else
     {
@@ -482,29 +535,28 @@
     }
 
     // Transpose kernel does not run when matmul is supported, as matmul fuses the transpose op.
-    if(transpose_weights && !use_matmul)
+    if (transpose_weights && !use_matmul)
     {
         // Validate reshape weights kernel
         ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights));
         weights_to_use = &reshaped_weights;
     }
 
-    if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
+    if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout))
     {
         // Validate convert weights kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(weights_to_use,
-                                                                             &converted_weights,
-                                                                             src->tensor_shape(),
-                                                                             fc_info.weights_trained_layout));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(
+            weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout));
         weights_to_use = &converted_weights;
     }
 
-    if(is_fc_after_conv)
+    if (is_fc_after_conv)
     {
         // Fully Connected layer after a Convolution Layer without batches
         // K index of the matrix multiplication. MatMul performs the transpose in the kernel, so the index is 0 when matmul and transpose are enabled
         const int weight_idx = (use_matmul && transpose_weights) ? 0 : 1;
-        ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
+        ARM_COMPUTE_RETURN_ERROR_ON(
+            (weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2))));
 
         // Validate flatten kernel
         ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src));
@@ -539,24 +591,24 @@
     CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false);
 
     // Linearize input if it comes from a convolutional layer
-    if(_is_fc_after_conv)
+    if (_is_fc_after_conv)
     {
-        ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } };
+        ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}};
         _flatten->run(flatten_pack);
     }
 
     ITensorPack gemm_pack = tensors;
     gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src);
-    if(_weights_to_use_idx != ACL_SRC_1)
+    if (_weights_to_use_idx != ACL_SRC_1)
     {
         gemm_pack.add_const_tensor(ACL_SRC_1, weights.get());
     }
 
     // Run MatMul Op
-    if(_use_matmul)
+    if (_use_matmul)
     {
         // Run matmul kernels for matrix multiplication
-        if(_is_quantized)
+        if (_is_quantized)
         {
             CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, gemm_pack, true);
         }
@@ -568,7 +620,7 @@
     else
     {
         // Run matrix multiply
-        if(_is_quantized)
+        if (_is_quantized)
         {
             _mm_gemmlowp->run(gemm_pack);
         }
@@ -582,7 +634,7 @@
 void ClFullyConnected::prepare(ITensorPack &tensors)
 {
     // Note: Running prepare() on every run when _use_matmul is true is unnecessary unless weights conversion is needed.
-    if(!_is_prepared || _dynamic_gemm)
+    if (!_is_prepared || _dynamic_gemm)
     {
 #ifdef ARM_COMPUTE_ASSERTS_ENABLED
         ++_asrt_prepare_count;
@@ -598,10 +650,10 @@
         const ITensor *cur_weights = weights;
 
         // Reshape weights if needed. Disabled when matmul kernels are enabled as matmul fuses transpose.
-        if(_transpose_weights && !_use_matmul)
+        if (_transpose_weights && !_use_matmul)
         {
             // Run reshape weights kernel and mark weights as unused
-            ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } };
+            ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}};
             _reshape_weights->run(transpose_pack);
 
             cur_weights->mark_as_unused();
@@ -609,9 +661,9 @@
         }
 
         // Convert weights if needed
-        if(_run_convert_weights)
+        if (_run_convert_weights)
         {
-            ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } };
+            ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}};
             _convert_weights->run(convert_pack);
 
             cur_weights->mark_as_unused();
@@ -622,9 +674,9 @@
         gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights);
 
         // Run GEMM prepare and release unused weights
-        if(_dynamic_gemm || !_use_matmul)
+        if (_dynamic_gemm || !_use_matmul)
         {
-            if(!_is_quantized)
+            if (!_is_quantized)
             {
                 _mm_gemm->prepare(gemm_pack);
             }
diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h
index d975859..0621238 100644
--- a/src/gpu/cl/operators/ClFullyConnected.h
+++ b/src/gpu/cl/operators/ClFullyConnected.h
@@ -47,7 +47,7 @@
 {
 class ClMatMulNativeKernel;
 class ClMatMulLowpNativeKernel;
-}
+} // namespace kernels
 /** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels:
  *
  *  -# @ref opencl::kernels::ClIm2ColKernel (called when the input comes from a convolutional layer)
@@ -88,7 +88,11 @@
      *                             Data type supported: Same as @p src.
      * @param[in]  fc_info         (Optional) Fully connected layer additional info
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *weights,
+                   ITensorInfo            *biases,
+                   ITensorInfo            *dst,
                    FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
@@ -96,18 +100,36 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+    static Status validate(const ITensorInfo      *src,
+                           const ITensorInfo      *weights,
+                           const ITensorInfo      *biases,
+                           const ITensorInfo      *dst,
                            FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo());
 
     // Inherited methods overridden
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
-    void configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info);
-    void configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info);
-    void configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info);
+    void configure_fc_fc(const CLCompileContext        &compile_context,
+                         ITensorInfo                   *src,
+                         ITensorInfo                   *weights,
+                         ITensorInfo                   *bias,
+                         ITensorInfo                   *dst,
+                         const FullyConnectedLayerInfo &fc_info);
+    void configure_conv_fc(const CLCompileContext        &compile_context,
+                           ITensorInfo                   *src,
+                           ITensorInfo                   *weights,
+                           ITensorInfo                   *bias,
+                           ITensorInfo                   *dst,
+                           const FullyConnectedLayerInfo &fc_info);
+    void configure_mm(const CLCompileContext        &compile_context,
+                      ITensorInfo                   *src,
+                      ITensorInfo                   *weights,
+                      ITensorInfo                   *bias,
+                      ITensorInfo                   *dst,
+                      const FullyConnectedLayerInfo &fc_info);
 
 private:
     enum AuxTensorIdx
@@ -134,19 +156,19 @@
     TensorInfo _reshaped_weights{};
     TensorInfo _lhs_to_use{};
     TensorInfo _weights_to_use{};
-    int        _weights_to_use_idx{ ACL_SRC_1 };
+    int        _weights_to_use_idx{ACL_SRC_1};
 
-    bool _run_convert_weights{ false };
-    bool _transpose_weights{ false };
-    bool _dynamic_gemm{ false };
-    bool _use_matmul{ false };
+    bool _run_convert_weights{false};
+    bool _transpose_weights{false};
+    bool _dynamic_gemm{false};
+    bool _use_matmul{false};
 
-    bool _is_fc_after_conv{ true };
-    bool _is_quantized{ false };
-    bool _is_prepared{ false };
+    bool _is_fc_after_conv{true};
+    bool _is_quantized{false};
+    bool _is_prepared{false};
 
 #ifdef ARM_COMPUTE_ASSERTS_ENABLED
-    int _asrt_run_count {};
+    int _asrt_run_count{};
     int _asrt_prepare_count{};
 #endif // ARM_COMPUTE_ASSERTS_ENABLED
 };
diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp
index 7e331a8..815c254 100644
--- a/src/gpu/cl/operators/ClGemm.cpp
+++ b/src/gpu/cl/operators/ClGemm.cpp
@@ -33,11 +33,12 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/ITensorAllocator.h"
 
+#include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/core/utils/helpers/float_ops.h"
@@ -45,8 +46,6 @@
 #include "src/gpu/cl/utils/ClAuxTensorHandler.h"
 #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h"
 #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h"
-
-#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 #include "utils/TypePrinter.h"
 
@@ -67,35 +66,43 @@
     return kernel_type == CLGEMMKernelType::NATIVE ? false : true;
 }
 // Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type
-inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights)
+inline CLGEMMKernelType
+auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights)
 {
-    if(!constant_weights)
+    if (!constant_weights)
     {
         return CLGEMMKernelType::NATIVE;
     }
 
     auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
-    if(bool(gemm_kernel))
+    if (bool(gemm_kernel))
     {
-        if(validate_gemm_kernel(gemm_kernel.gemm_type))
+        if (validate_gemm_kernel(gemm_kernel.gemm_type))
         {
-            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.",
+                                                      to_string(gemm_kernel.gemm_type).c_str());
             return gemm_kernel.gemm_type;
         }
     }
     gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
-    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.",
+                                              to_string(gemm_kernel.gemm_type).c_str());
     return gemm_kernel.gemm_type;
 }
 // Validate lhs_info and rhs_info for reshaped only rhs kernel
-inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
-                                                    const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info)
+inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info,
+                                                    const GEMMRHSMatrixInfo &rhs_info,
+                                                    const ITensorInfo       *a,
+                                                    const ITensorInfo       *b,
+                                                    const ITensorInfo       *c,
+                                                    const ITensorInfo       *output,
+                                                    GEMMKernelInfo           gemm_kernel_info)
 {
     // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
     TensorInfo tmp_b_info{};
     // Validate reshape RHS kernel
     auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
-    if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+    if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
     {
         return false;
     }
@@ -103,12 +110,14 @@
     gemm_kernel_info.lhs_info  = lhs_info;
     gemm_kernel_info.rhs_info  = rhs_info;
     gemm_kernel_info.has_pad_y = false;
-    if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
+    if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info,
+                                                                  rhs_info, gemm_kernel_info)))
     {
         return false;
     }
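     // Validate the mm kernel again with has_pad_y = true, so the chosen lhs/rhs configuration is valid with or without y padding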
     gemm_kernel_info.has_pad_y = true;
-    if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
+    if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info,
+                                                                  rhs_info, gemm_kernel_info)))
     {
         return false;
     }
@@ -116,49 +125,65 @@
 }
 
 // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
-inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a,
-                                                                                                 const ITensorInfo *b,
-                                                                                                 const ITensorInfo *c, const ITensorInfo *output)
+inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query,
+                                          GEMMKernelInfo               kernel_info,
+                                          const ITensorInfo           *a,
+                                          const ITensorInfo           *b,
+                                          const ITensorInfo           *c,
+                                          const ITensorInfo           *output)
 {
     auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
-    if(config)
+    if (config)
     {
-        if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))
+        if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info))
         {
-            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
-            return { config.lhs_info, config.rhs_info };
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+                "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+                to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+            return {config.lhs_info, config.rhs_info};
         }
     }
     config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
-    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
-    return { config.lhs_info, config.rhs_info };
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+        "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ",
+        to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+    return {config.lhs_info, config.rhs_info};
 }
 
 // Validate lhs_info and rhs_info for reshaped kernel
-inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c,
-                                           const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d)
+inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info,
+                                           const GEMMRHSMatrixInfo &rhs_info,
+                                           const ITensorInfo       *a,
+                                           const ITensorInfo       *b,
+                                           const ITensorInfo       *c,
+                                           const ITensorInfo       *output,
+                                           GEMMKernelInfo           gemm_kernel_info,
+                                           bool                     reinterpret_input_as_3d)
 {
     // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel
     TensorInfo tmp_a_info{};
     TensorInfo tmp_b_info{};
 
     // Validate reshape LHS kernel
-    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
-    if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d)))
+    auto_init_if_empty(tmp_a_info,
+                       a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d)));
+    if (!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d)))
     {
         return false;
     }
 
     // Validate reshape RHS kernel
     auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
-    if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+    if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
     {
         return false;
     }
     // Validate mm kernel
     gemm_kernel_info.lhs_info = lhs_info;
     gemm_kernel_info.rhs_info = rhs_info;
-    if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info)))
+    if (!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info,
+                                                           rhs_info, gemm_kernel_info)))
     {
         return false;
     }
@@ -166,21 +191,32 @@
 }
 
 // Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs
-inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b,
-                                                                                        const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d)
+inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query,
+                                 GEMMKernelInfo               kernel_info,
+                                 const ITensorInfo           *a,
+                                 const ITensorInfo           *b,
+                                 const ITensorInfo           *c,
+                                 const ITensorInfo           *output,
+                                 bool                         reinterpret_input_as_3d)
 {
     auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query);
-    if(config)
+    if (config)
     {
-        if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d))
+        if (validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info,
+                                           reinterpret_input_as_3d))
         {
-            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
-            return { config.lhs_info, config.rhs_info };
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+                "Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+                to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+            return {config.lhs_info, config.rhs_info};
         }
     }
     config = auto_heuristics::select_default_gemm_config_reshaped(query);
-    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
-    return { config.lhs_info, config.rhs_info };
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+        "Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(),
+        to_string(config.rhs_info).c_str());
+    return {config.lhs_info, config.rhs_info};
 }
 } // namespace
 
@@ -200,18 +236,24 @@
 {
 }
 
-void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
-                              const GEMMInfo &gemm_info)
+void ClGemm::configure_native(const CLCompileContext &compile_context,
+                              ITensorInfo            *a,
+                              ITensorInfo            *b,
+                              ITensorInfo            *c,
+                              ITensorInfo            *output,
+                              float                   alpha,
+                              float                   beta,
+                              const GEMMInfo         &gemm_info)
 {
     DataType           data_type               = a->data_type();
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const unsigned int n                       = b->dimension(0);
-    const unsigned int k                       = a->dimension(0);
-    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
-    const GPUTarget    gpu_target              = CLScheduler::get().target();
-    bool               broadcast_bias          = gemm_info.broadcast_bias();
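+    // With reinterpret_input_as_3d, M collapses dimensions 1 and 2 of the LHS and the batch size comes from dimension 3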
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target          = CLScheduler::get().target();
+    bool               broadcast_bias      = gemm_info.broadcast_bias();
 
     GEMMKernelInfo kernel_info;
     kernel_info.m                       = m;
@@ -225,24 +267,32 @@
     // Set the target for the kernels
     _mm_native_kernel->set_target(gpu_target);
 
-    auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
+    auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(
+        auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
 
     // Configure and tune matrix multiply kernel
-    _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info);
+    _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info,
+                                 kernel_info);
 }
 
-void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
-                                const GEMMInfo &gemm_info)
+void ClGemm::configure_reshaped(const CLCompileContext &compile_context,
+                                ITensorInfo            *a,
+                                ITensorInfo            *b,
+                                ITensorInfo            *c,
+                                ITensorInfo            *output,
+                                float                   alpha,
+                                float                   beta,
+                                const GEMMInfo         &gemm_info)
 {
     DataType           data_type               = a->data_type();
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const unsigned int n                       = b->dimension(0);
-    const unsigned int k                       = a->dimension(0);
-    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
-    const GPUTarget    gpu_target              = CLScheduler::get().target();
-    bool               broadcast_bias          = gemm_info.broadcast_bias();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target          = CLScheduler::get().target();
+    bool               broadcast_bias      = gemm_info.broadcast_bias();
 
     GEMMKernelInfo kernel_info;
     kernel_info.m                       = m;
@@ -261,32 +311,42 @@
     GEMMRHSMatrixInfo rhs_info{};
 
     // Pick up the GEMM configuration
-    std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b,
-                                                                    c, output, gemm_info.reinterpret_input_as_3d());
+    std::tie(lhs_info, rhs_info) =
+        auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size},
+                                         kernel_info, a, b, c, output, gemm_info.reinterpret_input_as_3d());
 
     _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d());
     _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
 
     // Configure and tune matrix multiply kernel
-    _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+    _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info,
+                                   kernel_info);
 
     // Request memory for LHS and RHS reshape matrix
     _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size());
-    _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+    _aux_mem[RhsReshape] = MemoryInfo(
+        offset_int_vec(RhsReshape),
+        _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
 }
 
-void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
-                                         const GEMMInfo &gemm_info)
+void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context,
+                                         ITensorInfo            *a,
+                                         ITensorInfo            *b,
+                                         ITensorInfo            *c,
+                                         ITensorInfo            *output,
+                                         float                   alpha,
+                                         float                   beta,
+                                         const GEMMInfo         &gemm_info)
 {
     DataType           data_type               = a->data_type();
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const unsigned int n                       = b->dimension(0);
-    const unsigned int k                       = a->dimension(0);
-    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
-    const GPUTarget    gpu_target              = CLScheduler::get().target();
-    bool               broadcast_bias          = gemm_info.broadcast_bias();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target          = CLScheduler::get().target();
+    bool               broadcast_bias      = gemm_info.broadcast_bias();
 
     GEMMKernelInfo kernel_info;
     kernel_info.m                       = m;
@@ -304,7 +364,8 @@
     GEMMRHSMatrixInfo rhs_info{};
 
     // Pick up the GEMM configuration
-    std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output);
+    std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(
+        auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, kernel_info, a, b, c, output);
 
     // Transpose matrix
     _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info);
@@ -315,24 +376,33 @@
 
     // Configure matrix multiply kernel with no y padding support
     kernel_info.has_pad_y = false;
-    _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+    _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info,
+                                            kernel_info);
 
     // Request memory for RHS reshape matrix
-    _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+    _aux_mem[RhsReshape] = MemoryInfo(
+        offset_int_vec(RhsReshape),
+        _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
 }
 
-void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta,
-                                              const GEMMInfo &gemm_info)
+void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context,
+                                              ITensorInfo            *a,
+                                              ITensorInfo            *b,
+                                              ITensorInfo            *c,
+                                              ITensorInfo            *output,
+                                              float                   alpha,
+                                              float                   beta,
+                                              const GEMMInfo         &gemm_info)
 {
     DataType           data_type               = a->data_type();
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const unsigned int n                       = b->dimension(0);
-    const unsigned int k                       = a->dimension(0);
-    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
-    const GPUTarget    gpu_target              = CLScheduler::get().target();
-    bool               broadcast_bias          = gemm_info.broadcast_bias();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const GPUTarget    gpu_target          = CLScheduler::get().target();
+    bool               broadcast_bias      = gemm_info.broadcast_bias();
 
     GEMMKernelInfo kernel_info;
     kernel_info.m                       = m;
@@ -350,9 +420,10 @@
     GEMMRHSMatrixInfo rhs_info{};
 
     // Pick up the GEMM configuration
-    auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
-    lhs_info         = gemm_config.lhs_info;
-    rhs_info         = gemm_config.rhs_info;
+    auto gemm_config = select_default_gemm_config_reshaped_only_rhs(
+        auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+    lhs_info = gemm_config.lhs_info;
+    rhs_info = gemm_config.rhs_info;
     // Force H0 to 4 in order to use the MMUL extension
     rhs_info.h0 = 4;
 
@@ -361,13 +432,22 @@
 
     // Configure matrix multiply kernel with no y padding support
     kernel_info.has_pad_y = false;
-    _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info);
+    _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info,
+                                                 rhs_info, kernel_info);
 
     // Request memory for RHS reshape matrix
-    _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+    _aux_mem[RhsReshape] = MemoryInfo(
+        offset_int_vec(RhsReshape),
+        _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
 }
 
-Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status ClGemm::validate_native(const ITensorInfo *a,
+                               const ITensorInfo *b,
+                               const ITensorInfo *c,
+                               const ITensorInfo *output,
+                               float              alpha,
+                               float              beta,
+                               const GEMMInfo    &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
     ARM_COMPUTE_UNUSED(output);
@@ -376,12 +456,12 @@
     const GPUTarget    gpu_target              = CLScheduler::get().target();
     DataType           data_type               = a->data_type();
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const unsigned int n                       = b->dimension(0);
-    const unsigned int k                       = a->dimension(0);
-    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
-    const bool         broadcast_bias          = gemm_info.broadcast_bias();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const bool         broadcast_bias      = gemm_info.broadcast_bias();
 
     GEMMKernelInfo kernel_info;
     kernel_info.m                       = m;
@@ -392,15 +472,23 @@
     kernel_info.broadcast_bias          = broadcast_bias;
     kernel_info.activation_info         = gemm_info.activation_info();
 
-    auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
+    auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(
+        auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
 
     // Validate matrix multiply
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate(a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate(
+        a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info));
 
     return Status{};
 }
 
-Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status ClGemm::validate_reshaped(const ITensorInfo *a,
+                                 const ITensorInfo *b,
+                                 const ITensorInfo *c,
+                                 const ITensorInfo *output,
+                                 float              alpha,
+                                 float              beta,
+                                 const GEMMInfo    &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
     ARM_COMPUTE_UNUSED(output);
@@ -412,12 +500,12 @@
     const GPUTarget    gpu_target              = CLScheduler::get().target();
     DataType           data_type               = a->data_type();
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const unsigned int n                       = b->dimension(0);
-    const unsigned int k                       = a->dimension(0);
-    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
-    const bool         broadcast_bias          = gemm_info.broadcast_bias();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const bool         broadcast_bias      = gemm_info.broadcast_bias();
 
     GEMMKernelInfo kernel_info;
     kernel_info.m                       = m;
@@ -433,23 +521,33 @@
 
     // Pick up the GEMM configuration
     // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
-    const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
-    lhs_info               = gemm_config.lhs_info;
-    rhs_info               = gemm_config.rhs_info;
+    const auto gemm_config =
+        select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+    lhs_info = gemm_config.lhs_info;
+    rhs_info = gemm_config.rhs_info;
 
-    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
+    auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(
+                                       compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d())));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d()));
 
     auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
     ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
 
     // Validate matrix multiply
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha,
+                                                                             beta, lhs_info, rhs_info, kernel_info));
 
     return Status{};
 }
 
-Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a,
+                                          const ITensorInfo *b,
+                                          const ITensorInfo *c,
+                                          const ITensorInfo *output,
+                                          float              alpha,
+                                          float              beta,
+                                          const GEMMInfo    &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
     ARM_COMPUTE_UNUSED(output);
@@ -460,12 +558,12 @@
     const GPUTarget    gpu_target              = CLScheduler::get().target();
     const DataType     data_type               = a->data_type();
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const unsigned int n                       = b->dimension(0);
-    const unsigned int k                       = a->dimension(0);
-    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
-    const bool         broadcast_bias          = gemm_info.broadcast_bias();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const bool         broadcast_bias      = gemm_info.broadcast_bias();
 
     GEMMKernelInfo kernel_info;
     kernel_info.m                       = m;
@@ -481,24 +579,33 @@
 
     // Pick up the GEMM configuration
     // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
-    const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
-    lhs_info               = gemm_config.lhs_info;
-    rhs_info               = gemm_config.rhs_info;
+    const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(
+        auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+    lhs_info = gemm_config.lhs_info;
+    rhs_info = gemm_config.rhs_info;
 
     auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
     ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info));
 
     // Validate matrix multiply
     kernel_info.has_pad_y = false;
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(
+        a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
 
     kernel_info.has_pad_y = true;
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(
+        a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
 
     return Status{};
 }
 
-Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a,
+                                               const ITensorInfo *b,
+                                               const ITensorInfo *c,
+                                               const ITensorInfo *output,
+                                               float              alpha,
+                                               float              beta,
+                                               const GEMMInfo    &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha);
     ARM_COMPUTE_UNUSED(output);
@@ -508,12 +615,12 @@
     const GPUTarget    gpu_target              = CLScheduler::get().target();
     const DataType     data_type               = a->data_type();
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const unsigned int n                       = b->dimension(0);
-    const unsigned int k                       = a->dimension(0);
-    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
-    const bool         broadcast_bias          = gemm_info.broadcast_bias();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
+    const bool         broadcast_bias      = gemm_info.broadcast_bias();
 
     GEMMKernelInfo kernel_info;
     kernel_info.m                       = m;
@@ -529,9 +636,10 @@
 
     // Pick up the GEMM configuration
     // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
-    const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size });
-    lhs_info               = gemm_config.lhs_info;
-    rhs_info               = gemm_config.rhs_info;
+    const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(
+        auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size});
+    lhs_info = gemm_config.lhs_info;
+    rhs_info = gemm_config.rhs_info;
     // Force H0 to 4 in order to use the MMUL extension
     rhs_info.h0 = 4;
 
@@ -540,12 +648,20 @@
 
     // Validate matrix multiply
     kernel_info.has_pad_y = false;
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(
+        a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info));
 
     return Status{};
 }
 
-void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void ClGemm::configure(const CLCompileContext &compile_context,
+                       ITensorInfo            *a,
+                       ITensorInfo            *b,
+                       ITensorInfo            *c,
+                       ITensorInfo            *output,
+                       float                   alpha,
+                       float                   beta,
+                       const GEMMInfo         &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
 
@@ -558,20 +674,21 @@
     _is_prepared                 = gemm_info.retain_internal_weights();
 
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const unsigned int n                       = b->dimension(0);
-    const unsigned int k                       = a->dimension(0);
-    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
 
     // Select GEMMType
-    _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run,
-                                                b->are_values_constant());
+    _gemm_kernel_type = auto_select_gemm_kernel(
+        auto_heuristics::CommonQuery{CLScheduler::get().target(), a->data_type(), m, n, k, batch_size},
+        _reshape_b_only_on_first_run, b->are_values_constant());
 
     const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
 
     ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
 
-    switch(_gemm_kernel_type)
+    switch (_gemm_kernel_type)
     {
         case CLGEMMKernelType::NATIVE:
         {
@@ -600,35 +717,41 @@
     }
 }
 
-Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status ClGemm::validate(const ITensorInfo *a,
+                        const ITensorInfo *b,
+                        const ITensorInfo *c,
+                        const ITensorInfo *output,
+                        float              alpha,
+                        float              beta,
+                        const GEMMInfo    &gemm_info)
 {
     // Get the GPU target
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const unsigned int n                       = b->dimension(0);
-    const unsigned int k                       = a->dimension(0);
-    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
 
     // Check data type early because the auto_select_gemm_kernel has assertions on supported data types
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16);
 
     // Select GEMMType
-    CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery
-    {
-        CLScheduler::get().target(),
-        a->data_type(),
-        m,
-        n,
-        k,
-        batch_size,
-    },
-    gemm_info.reshape_b_only_on_first_run(), b->are_values_constant());
+    CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(
+        auto_heuristics::CommonQuery{
+            CLScheduler::get().target(),
+            a->data_type(),
+            m,
+            n,
+            k,
+            batch_size,
+        },
+        gemm_info.reshape_b_only_on_first_run(), b->are_values_constant());
 
     const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr);
 
     const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr;
 
-    switch(gemm_kernel_type)
+    switch (gemm_kernel_type)
     {
         case CLGEMMKernelType::NATIVE:
         {
@@ -647,7 +770,8 @@
         }
         case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info));
             break;
         }
         default:
@@ -674,7 +798,7 @@
     prepare(tensors);
 
     // Run matrix multiply kernel
-    switch(_gemm_kernel_type)
+    switch (_gemm_kernel_type)
     {
         case CLGEMMKernelType::NATIVE:
         {
@@ -684,13 +808,13 @@
         case CLGEMMKernelType::RESHAPED:
         {
             // Run interleave kernel
-            ITensorPack reshape_lhs_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } };
+            ITensorPack reshape_lhs_pack{{ACL_SRC, lhs}, {ACL_DST, lhs_reshaped.get()}};
             CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false);
 
-            if(!_reshape_b_only_on_first_run)
+            if (!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
+                ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}};
                 CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
             }
             // Copy original tensor pack and overwrite lhs and rhs with reshaped counterparts
@@ -698,7 +822,7 @@
             gemm_reshaped_pack.add_const_tensor(ACL_SRC_0, lhs_reshaped.get());
             gemm_reshaped_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
 
-            if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED)
+            if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED)
             {
                 CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true);
             }
@@ -706,10 +830,10 @@
         }
         case CLGEMMKernelType::RESHAPED_ONLY_RHS:
         {
-            if(!_reshape_b_only_on_first_run)
+            if (!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
+                ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}};
                 CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
             }
             // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement
@@ -722,7 +846,7 @@
             ITensorPack gemm_reshaped_onlyrhs_pack(tensors);
             gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
 
-            if(has_pad_y)
+            if (has_pad_y)
             {
                 ARM_COMPUTE_ERROR_ON(has_pad_y);
             }
@@ -734,10 +858,10 @@
         }
         case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL:
         {
-            if(!_reshape_b_only_on_first_run)
+            if (!_reshape_b_only_on_first_run)
             {
                 // Run transpose kernel
-                ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } };
+                ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}};
                 CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false);
             }
             // In case of RESHAPED_ONLY_RHS_MMUL, we need to check the padding requirement
@@ -750,7 +874,7 @@
             ITensorPack gemm_reshaped_onlyrhs_pack(tensors);
             gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get());
 
-            if(has_pad_y)
+            if (has_pad_y)
             {
                 ARM_COMPUTE_ERROR_ON(has_pad_y);
             }
@@ -769,20 +893,22 @@
 
 void ClGemm::prepare(ITensorPack &constants)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
-        const ITensor *src1    = constants.get_const_tensor(ACL_SRC_1);
-        ICLTensor     *rhs_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape)));
+        const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1);
+        ICLTensor     *rhs_aux =
+            utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape)));
 
         // If memory for RHS is persistent and src1 is provided, re-transform; otherwise assume that RHS is already transformed
-        if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr) && rhs_aux)
+        if ((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) &&
+            (src1 != nullptr && rhs_aux != nullptr) && rhs_aux)
         {
             ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!");
 
             CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux);
             ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr);
 
-            ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } };
+            ITensorPack reshape_rhs_pack{{ACL_SRC, src1}, {ACL_DST, rhs_reshaped.get()}};
             CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true);
         }
         _is_prepared = true;
diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h
index 11f9f2b..85dc1d6 100644
--- a/src/gpu/cl/operators/ClGemm.h
+++ b/src/gpu/cl/operators/ClGemm.h
@@ -90,30 +90,95 @@
      *                             if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping
      *                             in case matrix A and matrix B have already been transformed.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *a,
+                   ITensorInfo            *b,
+                   ITensorInfo            *c,
+                   ITensorInfo            *output,
+                   float                   alpha,
+                   float                   beta,
+                   const GEMMInfo         &gemm_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClGemm::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *output,
+                           float              alpha,
+                           float              beta,
+                           const GEMMInfo    &gemm_info);
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &constants) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &constants) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
-    void configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-    void configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-    void configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-    void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    void configure_native(const CLCompileContext &compile_context,
+                          ITensorInfo            *a,
+                          ITensorInfo            *b,
+                          ITensorInfo            *c,
+                          ITensorInfo            *output,
+                          float                   alpha,
+                          float                   beta,
+                          const GEMMInfo         &gemm_info);
+    void configure_reshaped(const CLCompileContext &compile_context,
+                            ITensorInfo            *a,
+                            ITensorInfo            *b,
+                            ITensorInfo            *c,
+                            ITensorInfo            *output,
+                            float                   alpha,
+                            float                   beta,
+                            const GEMMInfo         &gemm_info);
+    void configure_reshaped_only_rhs(const CLCompileContext &compile_context,
+                                     ITensorInfo            *a,
+                                     ITensorInfo            *b,
+                                     ITensorInfo            *c,
+                                     ITensorInfo            *output,
+                                     float                   alpha,
+                                     float                   beta,
+                                     const GEMMInfo         &gemm_info);
+    void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context,
+                                          ITensorInfo            *a,
+                                          ITensorInfo            *b,
+                                          ITensorInfo            *c,
+                                          ITensorInfo            *output,
+                                          float                   alpha,
+                                          float                   beta,
+                                          const GEMMInfo         &gemm_info);
 
-    static Status validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-    static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-    static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
-    static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info);
+    static Status validate_native(const ITensorInfo *a,
+                                  const ITensorInfo *b,
+                                  const ITensorInfo *c,
+                                  const ITensorInfo *output,
+                                  float              alpha,
+                                  float              beta,
+                                  const GEMMInfo    &gemm_info);
+    static Status validate_reshaped(const ITensorInfo *a,
+                                    const ITensorInfo *b,
+                                    const ITensorInfo *c,
+                                    const ITensorInfo *output,
+                                    float              alpha,
+                                    float              beta,
+                                    const GEMMInfo    &gemm_info);
+    static Status validate_reshaped_only_rhs(const ITensorInfo *a,
+                                             const ITensorInfo *b,
+                                             const ITensorInfo *c,
+                                             const ITensorInfo *output,
+                                             float              alpha,
+                                             float              beta,
+                                             const GEMMInfo    &gemm_info);
+    static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a,
+                                                  const ITensorInfo *b,
+                                                  const ITensorInfo *c,
+                                                  const ITensorInfo *output,
+                                                  float              alpha,
+                                                  float              beta,
+                                                  const GEMMInfo    &gemm_info);
 
 private:
     enum AuxTensorIdx
diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp
index 5620471..55d815a 100644
--- a/src/gpu/cl/operators/ClGemmConv2d.cpp
+++ b/src/gpu/cl/operators/ClGemmConv2d.cpp
@@ -28,10 +28,12 @@
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/kernels/ClActivationKernel.h"
@@ -41,8 +43,6 @@
 #include "src/gpu/cl/operators/ClGemm.h"
 #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
 #include "src/gpu/cl/utils/ClAuxTensorHandler.h"
-
-#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 
 namespace arm_compute
@@ -53,18 +53,38 @@
 namespace opencl
 {
 ClGemmConv2d::ClGemmConv2d()
-    : _weights_reshape_kernel(nullptr), _im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(),
-      _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
+    : _weights_reshape_kernel(nullptr),
+      _im2col_kernel(nullptr),
+      _mm_gemm(nullptr),
+      _mm_gemmlowp(nullptr),
+      _col2im_kernel(nullptr),
+      _activation_kernel(nullptr),
+      _im2col_output(),
+      _weights_reshaped(),
+      _gemm_output(),
+      _skip_im2col(false),
+      _skip_col2im(false),
+      _is_quantized(false),
+      _fuse_activation(true),
+      _append_bias(false),
+      _is_prepared(false),
+      _aux_mem(AuxTensorIdx::Count)
 {
 }
 ClGemmConv2d::~ClGemmConv2d() = default;
 
-void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+void ClGemmConv2d::configure_mm(const ClCompileContext        &compile_context,
+                                const ITensorInfo             *src,
+                                ITensorInfo                   *weights,
+                                ITensorInfo                   *biases,
+                                ITensorInfo                   *dst,
                                 const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
-                                int gemm_3d_depth, const ActivationLayerInfo &act_info)
+                                int                            gemm_3d_depth,
+                                const ActivationLayerInfo     &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
 
     const GEMMInfo &gemm_info = GEMMInfo(false,                 // is_a_reshaped
                                          false,                 // is_b_reshaped
@@ -77,18 +97,20 @@
                                          false,                 // fp_mixed_precision
                                          true,                  // broadcast_bias
                                          act_info               // activation_info
-                                        );
+    );
 
-    TensorInfo tmp_src{ *src };
-    if(_is_quantized)
+    TensorInfo tmp_src{*src};
+    if (_is_quantized)
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
         // Extract and negate input and weights offset
         const QuantizationInfo input_quantization_info   = src->quantization_info();
         const QuantizationInfo weights_quantization_info = weights->quantization_info();
 
-        tmp_src.set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
-        weights->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+        tmp_src.set_quantization_info(
+            QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+        weights->set_quantization_info(
+            QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
 
         _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
         _mm_gemmlowp->configure(compile_context, &tmp_src, weights, biases, dst, gemm_info);
@@ -97,7 +119,7 @@
         weights->set_quantization_info(weights_quantization_info);
 
         auto mm_mem_req = _mm_gemmlowp->workspace();
-        for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+        for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
         {
             _aux_mem[cont] = mm_mem_req[cont];
         }
@@ -108,15 +130,21 @@
         _mm_gemm = std::make_unique<ClGemm>();
         _mm_gemm->configure(compile_context, &tmp_src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
         auto mm_mem_req = _mm_gemm->workspace();
-        for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+        for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
         {
             _aux_mem[cont] = mm_mem_req[cont];
         }
     }
 }
 
-Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                 const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info)
+Status ClGemmConv2d::validate_mm(const ITensorInfo             *src,
+                                 const ITensorInfo             *weights,
+                                 const ITensorInfo             *biases,
+                                 const ITensorInfo             *dst,
+                                 const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+                                 int                            gemm_3d_depth,
+                                 bool                           skip_im2col,
+                                 const ActivationLayerInfo     &act_info)
 {
     const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type());
 
@@ -131,9 +159,9 @@
                                          false,                 // fp_mixed_precision
                                          true,                  // broadcast_bias
                                          act_info               // activation_info
-                                        );
+    );
 
-    if(is_quantized)
+    if (is_quantized)
     {
         // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
         // Extract and negate input and weights offset
@@ -142,8 +170,10 @@
 
         std::unique_ptr<ITensorInfo> src_qa     = src->clone();
         std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
-        src_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
-        weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+        src_qa->set_quantization_info(
+            QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+        weights_qa->set_quantization_info(
+            QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
 
         // Perform validation step on GEMMLowp
         return ClGemmLowpMatrixMultiplyCore::validate(src_qa.get(), weights_qa.get(), biases, dst, gemm_info);
@@ -155,14 +185,17 @@
     }
 }
 
-void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
-                             const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info)
+void ClGemmConv2d::configure(const CLCompileContext &compile_context,
+                             ITensorInfo            *src,
+                             ITensorInfo            *weights,
+                             ITensorInfo            *biases,
+                             ITensorInfo            *dst,
+                             const Conv2dInfo       &conv2d_info,
+                             const WeightsInfo      &weights_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
 
-    ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst,
-                                                      conv2d_info,
-                                                      weights_info));
+    ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info));
     ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info);
 
     const DataType   data_type   = src->data_type();
@@ -180,7 +213,8 @@
 
     _is_prepared  = weights_info.retain_internal_weights();
     _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
-    _skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
+    _skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+                    conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
     _skip_col2im  = data_layout == DataLayout::NHWC;
 
     // Only for quantize there are few cases where we cannot fuse the activation function in GEMM
@@ -197,12 +231,8 @@
     // Get convolved dimensions
     unsigned int conv_w      = 0;
     unsigned int conv_h      = 0;
-    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
-                                                 src->dimension(idx_height),
-                                                 kernel_width,
-                                                 kernel_height,
-                                                 conv2d_info.conv_info,
-                                                 conv2d_info.dilation);
+    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+                                                 kernel_height, conv2d_info.conv_info, conv2d_info.dilation);
 
     unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
 
@@ -210,28 +240,31 @@
     _append_bias               = false;
 
     _weights_reshape_kernel = std::make_unique<kernels::ClWeightsReshapeKernel>();
-    if(conv2d_info.num_groups != 1 && biases != nullptr)
+    if (conv2d_info.num_groups != 1 && biases != nullptr)
     {
         // num_groups != 1 can only be for NCHW
         // Since a utility function to reshape the biases is missing, we append the biases to the weights tensor
         biases_to_use = nullptr;
         _append_bias  = true;
-        _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, conv2d_info.num_groups);
+        _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped,
+                                           conv2d_info.num_groups);
     }
     else
     {
-        _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, conv2d_info.num_groups);
+        _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped,
+                                           conv2d_info.num_groups);
     }
 
     // Create tensor to store im2col reshaped inputs
-    if(!_skip_im2col)
+    if (!_skip_im2col)
     {
         // Configure and tune im2col. im2col output shape is auto-initialized
         _im2col_kernel = std::make_unique<opencl::kernels::ClIm2ColKernel>();
 
         // Set the GPU target for im2col
         _im2col_kernel->set_target(CLScheduler::get().target());
-        _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups);
+        _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height),
+                                  conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups);
 
         // Set quantization info
         _im2col_output.set_quantization_info(src->quantization_info());
@@ -242,7 +275,7 @@
     }
 
     // Create GEMM output tensor
-    if(!_skip_col2im)
+    if (!_skip_col2im)
     {
         TensorShape shape_gemm;
 
@@ -263,7 +296,7 @@
     gemmlowp_output_stage.gemmlowp_offset = 0;
 
     // Configure output stage for quantized case
-    if(_is_quantized)
+    if (_is_quantized)
     {
         const auto         output_quant_info        = (dst->total_size() == 0) ? iq_info : oq_info;
         const bool         is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
@@ -286,16 +319,16 @@
         auto min_activation = min_val.get<int32_t>();
         auto max_activation = max_val.get<int32_t>();
 
-        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                                                                 };
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+            ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+            ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
 
-        if(conv2d_info.act_info.enabled())
+        if (conv2d_info.act_info.enabled())
         {
-            if(supported_acts.count(conv2d_info.act_info.activation()) != 0)
+            if (supported_acts.count(conv2d_info.act_info.activation()) != 0)
             {
-                std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
+                std::tie(min_activation, max_activation) =
+                    get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
             }
             else
             {
@@ -313,48 +346,60 @@
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
 
-    configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info);
+    configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use,
+                 gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info);
 
-    if(!_skip_col2im)
+    if (!_skip_col2im)
     {
         // Set the GPU target for col2im
         _col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>();
         _col2im_kernel->set_target(CLScheduler::get().target());
         // Configure and tune Col2Im
-        _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups);
+        _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h),
+                                  conv2d_info.num_groups);
         CLScheduler::get().tune_kernel_static(*_col2im_kernel.get());
     }
 
     ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
                              "Output shape does not match the expected one");
 
-    if(!_fuse_activation)
+    if (!_fuse_activation)
     {
         _activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>();
         _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info);
     }
 
-    _aux_mem[Im2ColOutput]    = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
-    _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size());
-    _aux_mem[GemmOutput]      = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
+    _aux_mem[Im2ColOutput] =
+        MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
+    _aux_mem[WeightsReshaped] =
+        MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size());
+    _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
 }
 
-Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+Status ClGemmConv2d::validate(const ITensorInfo *src,
+                              const ITensorInfo *weights,
+                              const ITensorInfo *biases,
+                              const ITensorInfo *dst,
+                              const Conv2dInfo  &conv2d_info,
                               const WeightsInfo &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type());
 
-    if(!is_quantized_per_channel)
+    if (!is_quantized_per_channel)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights);
     }
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8");
-    ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && (src->data_layout() == DataLayout::NCHW));
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW),
+                                    "Grouping (num_groups != 1) with NHWC data layout is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8),
+                                    "Grouping (num_groups != 1) is not supported with QASYMM8");
+    ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) &&
+                                (src->data_layout() == DataLayout::NCHW));
 
     const DataLayout data_layout = src->data_layout();
     const DataType   data_type   = src->data_type();
@@ -374,18 +419,19 @@
     const ITensorInfo *gemm_output_to_use = dst;
     const ITensorInfo *weights_to_use     = weights;
     const bool         is_quantized       = is_data_type_quantized_asymmetric(data_type);
-    const bool         skip_im2col        = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1
-                                             && conv2d_info.conv_info.stride().second == 1);
-    const bool         skip_col2im        = data_layout == DataLayout::NHWC;
-    bool               fuse_activation    = true;
+    const bool         skip_im2col     = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 &&
+                              conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
+    const bool         skip_col2im     = data_layout == DataLayout::NHWC;
+    bool               fuse_activation = true;
 
-    ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel));
+    ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) !=
+                                src->dimension(idx_channel));
     ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
 
     // Validate biases
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
-        if(is_quantized)
+        if (is_quantized)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
@@ -397,7 +443,7 @@
         ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
     }
 
-    if(conv2d_info.act_info.enabled())
+    if (conv2d_info.act_info.enabled())
     {
         ARM_COMPUTE_ERROR_ON(conv2d_info.act_info.b() > conv2d_info.act_info.a());
     }
@@ -406,48 +452,50 @@
     unsigned int conv_w = 0;
     unsigned int conv_h = 0;
 
-    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
-                                                 src->dimension(idx_height),
-                                                 kernel_width,
-                                                 kernel_height,
-                                                 conv2d_info.conv_info,
-                                                 conv2d_info.dilation);
+    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width,
+                                                 kernel_height, conv2d_info.conv_info, conv2d_info.dilation);
 
     unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
 
     const ITensorInfo *biases_to_use = biases;
     bool               append_bias   = false;
 
-    if(conv2d_info.num_groups != 1 && biases != nullptr)
+    if (conv2d_info.num_groups != 1 && biases != nullptr)
     {
         // num_groups != 1 can only be for NCHW
         // Since a utility function to reshape the biases is missing, we append the biases to the weights tensor
-        biases_to_use         = nullptr;
-        append_bias           = true;
-        weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type);
+        biases_to_use = nullptr;
+        append_bias   = true;
+        weights_reshaped_info =
+            TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type);
     }
     else
     {
-        weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type);
+        weights_reshaped_info =
+            TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type);
     }
 
     weights_to_use = &weights_reshaped_info;
 
-    if(!skip_im2col)
+    if (!skip_im2col)
     {
         const Size2D kernel_dims(kernel_width, kernel_height);
 
         // Output tensor auto initialization if not yet initialized
-        TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups == 1, conv2d_info.num_groups);
+        TensorShape expected_output_shape =
+            compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation,
+                                      conv2d_info.num_groups == 1, conv2d_info.num_groups);
 
         auto_init_if_empty(im2col_reshaped_info, src->clone()->set_tensor_shape(expected_output_shape));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info,
+                                                      append_bias, conv2d_info.dilation, conv2d_info.num_groups));
         gemm_input_to_use = &im2col_reshaped_info;
     }
 
     // Create GEMM output tensor
-    if(!skip_col2im)
+    if (!skip_col2im)
     {
         TensorShape shape_gemm;
 
@@ -465,7 +513,7 @@
     gemmlowp_output_stage.gemmlowp_offset          = 0;
     gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
 
-    if(is_quantized)
+    if (is_quantized)
     {
         const UniformQuantizationInfo iq_info           = src->quantization_info().uniform();
         const UniformQuantizationInfo oq_info           = dst->quantization_info().uniform();
@@ -483,16 +531,16 @@
         int min_activation = 0;
         int max_activation = 0;
 
-        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
-                                                                                   ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
-                                                                                 };
+        const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
+            ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+            ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU};
 
-        if(conv2d_info.act_info.enabled())
+        if (conv2d_info.act_info.enabled())
         {
-            if(supported_acts.count(conv2d_info.act_info.activation()) != 0)
+            if (supported_acts.count(conv2d_info.act_info.activation()) != 0)
             {
-                std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
+                std::tie(min_activation, max_activation) =
+                    get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info);
             }
             else
             {
@@ -509,16 +557,18 @@
     // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
     const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use,
+                                            gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info));
 
     // Validate Col2Im
-    if(!skip_col2im)
+    if (!skip_col2im)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups));
     }
 
     // Validate Activation Layer
-    if(!fuse_activation)
+    if (!fuse_activation)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info));
     }
@@ -541,30 +591,26 @@
     CLAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
 
     // Run im2col
-    if(!_skip_im2col)
+    if (!_skip_im2col)
     {
-        ITensorPack pack =
-        {
-            { TensorType::ACL_SRC, src },
-            { TensorType::ACL_DST, im2col_output.get() }
-        };
+        ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}};
         CLScheduler::get().enqueue_op(*_im2col_kernel, pack, false);
         gemm_input_to_use = im2col_output.get();
     }
-    if(!_skip_col2im)
+    if (!_skip_col2im)
     {
         gemm_output_to_use = gemm_output.get();
     }
     ITensorPack pack_mm = tensors;
     pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
     pack_mm.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
-    if(!_append_bias)
+    if (!_append_bias)
     {
         pack_mm.add_const_tensor(TensorType::ACL_SRC_2, biases);
     }
     pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
     // Runs ClGemm or ClGemmLowpMatrixMultiplyCore functions
-    if(_is_quantized)
+    if (_is_quantized)
     {
         // Run gemmlowp
         _mm_gemmlowp->run(pack_mm);
@@ -576,43 +622,32 @@
     }
 
     // Reshape output matrix
-    if(!_skip_col2im)
+    if (!_skip_col2im)
     {
-        ITensorPack pack =
-        {
-            { TensorType::ACL_SRC, gemm_output_to_use },
-            { TensorType::ACL_DST, dst }
-        };
+        ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}};
         CLScheduler::get().enqueue_op(*_col2im_kernel.get(), pack, false);
     }
 
     // Run Activation Layer if we cannot fuse it into GEMM
-    if(!_fuse_activation)
+    if (!_fuse_activation)
     {
-        ITensorPack pack =
-        {
-            { TensorType::ACL_SRC, dst },
-            { TensorType::ACL_DST, dst }
-        };
+        ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}};
         CLScheduler::get().enqueue_op(*_activation_kernel.get(), pack, false);
     }
 }
 
 void ClGemmConv2d::prepare(ITensorPack &tensors)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         // Run weights reshaping and mark original weights tensor as unused
-        ICLTensor         *weights_reshaped_p = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped)));
+        ICLTensor *weights_reshaped_p =
+            utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped)));
         CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p);
         auto               weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
-        ITensorPack        pack    =
-        {
-            { TensorType::ACL_SRC, weights },
-            { TensorType::ACL_DST, weights_reshaped.get() }
-        };
+        ITensorPack        pack    = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}};
 
-        if(_append_bias)
+        if (_append_bias)
         {
             const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2);
             pack.add_const_tensor(TensorType::ACL_BIAS, biases);
diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h
index 8a46ee2..e8f3147 100644
--- a/src/gpu/cl/operators/ClGemmConv2d.h
+++ b/src/gpu/cl/operators/ClGemmConv2d.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/FunctionDescriptors.h"
+
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
 
@@ -100,15 +101,24 @@
      * @param[in]  weights_info    Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer, the weights
      *                             tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info,
-                   const WeightsInfo &weights_info = WeightsInfo());
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *weights,
+                   ITensorInfo            *biases,
+                   ITensorInfo            *dst,
+                   const Conv2dInfo       &conv2d_info,
+                   const WeightsInfo      &weights_info = WeightsInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClGemmConvolution::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &conv2d_info,
+    static Status validate(const ITensorInfo *input,
+                           const ITensorInfo *weights,
+                           const ITensorInfo *biases,
+                           const ITensorInfo *output,
+                           const Conv2dInfo  &conv2d_info,
                            const WeightsInfo &weights_info = WeightsInfo());
 
     // Inherited methods overridden:
@@ -130,9 +140,14 @@
      * @param[in]      gemm_3d_depth         Depth of GEMM 3D
      * @param[in]      act_info              Activation to apply after the matrix multiplication
      */
-    void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+    void configure_mm(const CLCompileContext        &compile_context,
+                      const ITensorInfo             *src,
+                      ITensorInfo                   *weights,
+                      ITensorInfo                   *biases,
+                      ITensorInfo                   *dst,
                       const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
-                      int gemm_3d_depth, const ActivationLayerInfo &act_info);
+                      int                            gemm_3d_depth,
+                      const ActivationLayerInfo     &act_info);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines
      *
      * @param[in] src                   Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
@@ -148,8 +163,14 @@
      *
      * @return a status
      */
-    static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
-                              int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info);
+    static Status validate_mm(const ITensorInfo             *src,
+                              const ITensorInfo             *weights,
+                              const ITensorInfo             *biases,
+                              const ITensorInfo             *dst,
+                              const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+                              int                            gemm_3d_depth,
+                              bool                           skip_im2col,
+                              const ActivationLayerInfo     &act_info);
 
     enum AuxTensorIdx
     {
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
index 2622274..71c247d 100644
--- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp
@@ -52,7 +52,7 @@
 {
 inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type)
 {
-    switch(kernel_type)
+    switch (kernel_type)
     {
         case CLGEMMKernelType::NATIVE:
         case CLGEMMKernelType::RESHAPED_ONLY_RHS:
@@ -71,32 +71,41 @@
 inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run)
 {
     auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run);
-    if(bool(gemm_kernel))
+    if (bool(gemm_kernel))
     {
-        if(validate_gemm_kernel(gemm_kernel.gemm_type))
+        if (validate_gemm_kernel(gemm_kernel.gemm_type))
         {
-            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.",
+                                                      to_string(gemm_kernel.gemm_type).c_str());
             return gemm_kernel.gemm_type;
         }
     }
     gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run);
-    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str());
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.",
+                                              to_string(gemm_kernel.gemm_type).c_str());
     return gemm_kernel.gemm_type;
 }
 
 // Validate lhs_info and rhs_info for native kernel
-inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
+inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info,
+                                         const GEMMRHSMatrixInfo &rhs_info,
+                                         const ITensorInfo       *a,
+                                         const ITensorInfo       *b,
+                                         const GEMMReshapeInfo   &reshape_info)
 {
     // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for the native kernel
     TensorInfo mm_result_s32_info{};
     // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32));
+    auto_init_if_empty(
+        mm_result_s32_info,
+        a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32));
     // Validate mm kernel
     // NOTE: Ignore all other parameters (e.g. output stage) and only validate lhs and rhs info
     // NOTE: This assumes:
     //  1. lhs and rhs info's validity does not depend on these other parameters and vice versa (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments).
     //  2. lhs and rhs info do not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window).
-    if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)))
+    if (!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info,
+                                                             reshape_info)))
     {
         return false;
     }
@@ -104,31 +113,45 @@
 }
 
 // Automatically select between mlgo (prioritized) and default heuristics for native kernel configs
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query,
+                                                                               const ITensorInfo           *a,
+                                                                               const ITensorInfo           *b,
+                                                                               const GEMMReshapeInfo &reshape_info)
 {
     auto config = auto_heuristics::select_mlgo_gemm_config_native(query);
-    if(config)
+    if (config)
     {
-        if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info))
+        if (validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info))
         {
-            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
-            return { config.lhs_info, config.rhs_info };
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+                "Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+                to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+            return {config.lhs_info, config.rhs_info};
         }
     }
     config = auto_heuristics::select_default_gemm_config_native(query);
-    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
-    return { config.lhs_info, config.rhs_info };
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ",
+                                              to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+    return {config.lhs_info, config.rhs_info};
 }
 
 // Validate lhs_info and rhs_info for reshaped only rhs kernel
-inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,
-                                                    unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)
+inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info,
+                                                    const GEMMRHSMatrixInfo &rhs_info,
+                                                    const ITensorInfo       *a,
+                                                    const ITensorInfo       *b,
+                                                    const ITensorInfo       *output,
+                                                    unsigned int             m,
+                                                    unsigned int             n,
+                                                    unsigned int             k,
+                                                    bool                     reinterpret_input_as_3d,
+                                                    int                      depth_output_gemm3d)
 {
     // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel
     TensorInfo tmp_b_info{};
     // Validate reshape RHS kernel
     auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
-    if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+    if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
     {
         return false;
     }
@@ -148,7 +171,8 @@
     // Since we ignore the output stage, output data type has to be S32 to pass the validation
     TensorInfo output_info_copy(*output);
     output_info_copy.set_data_type(DataType::S32);
-    if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
+    if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy,
+                                                                      gemm_kernel_info)))
     {
         return false;
     }
@@ -156,14 +180,22 @@
 }
 
 // Validate lhs_info and rhs_info for reshaped only rhs MMUL kernel
-inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output,
-                                                         unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d)
+inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info,
+                                                         const GEMMRHSMatrixInfo &rhs_info,
+                                                         const ITensorInfo       *a,
+                                                         const ITensorInfo       *b,
+                                                         const ITensorInfo       *output,
+                                                         unsigned int             m,
+                                                         unsigned int             n,
+                                                         unsigned int             k,
+                                                         bool                     reinterpret_input_as_3d,
+                                                         int                      depth_output_gemm3d)
 {
     // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs MMUL kernel
     TensorInfo tmp_b_info{};
     // Validate reshape RHS kernel
     auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info)));
-    if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
+    if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)))
     {
         return false;
     }
@@ -183,7 +215,8 @@
     // Since we ignore the output stage, output data type has to be S32 to pass the validation
     TensorInfo output_info_copy(*output);
     output_info_copy.set_data_type(DataType::S32);
-    if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info)))
+    if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy,
+                                                                          gemm_kernel_info)))
     {
         return false;
     }
@@ -191,40 +224,55 @@
 }
 
 // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
-                                                                                          const ITensorInfo *a,
-                                                                                          const ITensorInfo *b, const ITensorInfo *output)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query,
+                                          bool                         reinterpret_input_as_3d,
+                                          int                          depth_output_gemm3d,
+                                          const ITensorInfo           *a,
+                                          const ITensorInfo           *b,
+                                          const ITensorInfo           *output)
 {
     auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query);
-    if(config)
+    if (config)
     {
-        if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d))
+        if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n,
+                                                    query.k, reinterpret_input_as_3d, depth_output_gemm3d))
         {
-            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
-            return { config.lhs_info, config.rhs_info };
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+                "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ",
+                to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+            return {config.lhs_info, config.rhs_info};
         }
     }
     config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
-    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
-    return { config.lhs_info, config.rhs_info };
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+        "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ",
+        to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+    return {config.lhs_info, config.rhs_info};
 }
 
 // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs
-std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d,
-                                                                                               const ITensorInfo *a,
-                                                                                               const ITensorInfo *b, const ITensorInfo *output)
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
+auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query,
+                                               bool                         reinterpret_input_as_3d,
+                                               int                          depth_output_gemm3d,
+                                               const ITensorInfo           *a,
+                                               const ITensorInfo           *b,
+                                               const ITensorInfo           *output)
 {
     ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d);
     auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query);
-    validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d);
-    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(),
-                                              to_string(config.rhs_info).c_str());
-    return { config.lhs_info, config.rhs_info };
+    validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n,
+                                                 query.k, reinterpret_input_as_3d, depth_output_gemm3d);
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+        "Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ",
+        to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str());
+    return {config.lhs_info, config.rhs_info};
 }
 
 inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type)
 {
-    switch(kernel_type)
+    switch (kernel_type)
     {
         case CLGEMMKernelType::NATIVE:
             return false;
@@ -254,8 +302,11 @@
 ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default;
 
 void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,
-                                             ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output,
-                                             const GEMMInfo &gemm_info)
+                                             ITensorInfo            *a,
+                                             ITensorInfo            *b,
+                                             ITensorInfo            *c,
+                                             ITensorInfo            *output,
+                                             const GEMMInfo         &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
     ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info));
@@ -263,8 +314,8 @@
 
     _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run();
     _a_offset                    = a->quantization_info().uniform().offset;
-    _convert_to_qasymm8          = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
-                                   && a->data_type() == DataType::QASYMM8;
+    _convert_to_qasymm8          = is_data_type_quantized_per_channel(b->data_type()) &&
+                          is_data_type_quantized_symmetric(b->data_type()) && a->data_type() == DataType::QASYMM8;
     _b_offset  = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset;
     _gemm_info = gemm_info;
 
@@ -282,17 +333,18 @@
     // Arguments used by GEMMReshapeInfo
     // in order to know how the matrices have been reshaped
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const unsigned int n                       = b->dimension(0);
-    const unsigned int k                       = a->dimension(0);
-    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
 
     const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
 
-    _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run);
+    _gemm_kernel_type = auto_select_gemm_kernel(
+        auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, _reshape_b_only_on_first_run);
 
-    if(_convert_to_qasymm8)
+    if (_convert_to_qasymm8)
     {
         // Set data type for converted weights
         _qasymm8_weights = *b;
@@ -301,47 +353,50 @@
     }
 
     ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b;
-    if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+    if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
     {
         matrix_b = &_tmp_b;
 
         // Pick up the GEMM configuration
         // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
-        std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
-                                                                                 depth_output_gemm3d,
-                                                                                 a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
+        std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(
+            auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d,
+            depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
 
         // Configure reshape RHS kernel
-        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
+        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b,
+                                         rhs_info);
     }
-    if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+    if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
     {
         matrix_b = &_tmp_b;
 
         // Pick up the GEMM configuration
         // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
-        std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d,
-                                                                                      depth_output_gemm3d,
-                                                                                      a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
+        std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(
+            auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d,
+            depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output);
 
         // Configure reshape RHS kernel
-        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info);
+        _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b,
+                                         rhs_info);
     }
 
     // Using default reduction info
-    const GEMMLowpReductionKernelInfo reduction_info {};
+    const GEMMLowpReductionKernelInfo reduction_info{};
 
     // Initialize matrix B reduction kernel only if _a_offset is not equal to 0
-    if(_a_offset != 0)
+    if (_a_offset != 0)
     {
         _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32);
 
         // Configure Matrix B reduction kernel
-        _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info);
+        _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b,
+                                           &_vector_sum_col, reduction_info);
     }
 
     // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0
-    if(_b_offset != 0)
+    if (_b_offset != 0)
     {
         _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
 
@@ -360,17 +415,19 @@
     gemm_kernel_info.a_offset                = _a_offset;
     gemm_kernel_info.b_offset                = _b_offset;
     // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage
-    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+    if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
     {
         // Configure offset contribution kernel
-        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel)
+                                       ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size()
+                                       : 1;
 
         _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
         _gemm_output_stage_shifts      = TensorInfo(TensorShape(num_filters), 1, DataType::S32);
 
         GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
         gemmlowp_output_stage.output_data_type        = a->data_type();
-        if(num_filters == 1)
+        if (num_filters == 1)
         {
             // Per-channel quantization with OFM == 1 is equivalent to uniform quantization.
             // Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts
@@ -379,55 +436,67 @@
 
         gemm_kernel_info.output_stage = gemmlowp_output_stage;
 
-        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS &&
+            gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
         {
             // Configure and tune matrix multiply kernel with fused output stage
-            _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
-                                                    _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+            _mm_reshaped_only_rhs_kernel->configure(
+                compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr,
+                &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
         }
-        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL &&
+                 gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
         {
             // Configure and tune matrix multiply kernel with fused output stage
-            _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
-                                                         _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+            _mm_reshaped_only_rhs_mmul_kernel->configure(
+                compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr,
+                &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
         }
         else
         {
             _run_output_stage = true;
 
-            if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+            if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
             {
-                _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
+                _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32,
+                                                        gemm_kernel_info);
             }
-            if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+            if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
             {
-                _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info);
+                _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32,
+                                                             gemm_kernel_info);
             }
             else
             {
                 // Pick up the GEMM configuration
                 // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
-                std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
-                                                                              a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);
+                std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(
+                    auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a,
+                    _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info);
 
                 // Configure matrix multiply kernel
-                _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info);
+                _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info,
+                                             reshape_info);
 
-                _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
-                                                                    c != nullptr ? c : nullptr, output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage,
-                                                                    &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts);
+                _offset_contribution_output_stage_kernel->configure(
+                    compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                    _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, output, a->dimension(0),
+                    _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers,
+                    &_gemm_output_stage_shifts);
             }
         }
     }
     else
     {
         _run_offset_contribution = true;
-        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+        if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
         {
             // Configure and tune matrix multiply kernel
             _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
         }
-        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+        else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
         {
             // Configure and tune matrix multiply kernel
             _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info);
@@ -436,44 +505,65 @@
         {
             // Pick up the GEMM configuration
             // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
-            std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size },
-                                                                          a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info);
+            std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(
+                auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a,
+                _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info);
 
             // Configure matrix multiply kernel
             _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info);
         }
 
         // Configure offset contribution kernel
-        _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row,
-                                               c != nullptr ? c : nullptr, a->dimension(0), _a_offset, _b_offset);
+        _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col,
+                                               _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr,
+                                               a->dimension(0), _a_offset, _b_offset);
     }
 
     // Request memory
-    _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
-    if(is_gemm_reshaped(_gemm_kernel_type))
+    _aux_mem[RhsQAsymm8] =
+        MemoryInfo(offset_int_vec(RhsQAsymm8),
+                   _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+                   _qasymm8_weights.total_size());
+    if (is_gemm_reshaped(_gemm_kernel_type))
     {
         // Overwrite the Rhs lifetime as Prepare if the GEMM is reshaped, since there will be a two-step transformation
-        _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size());
-        _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
+        _aux_mem[RhsQAsymm8] =
+            MemoryInfo(offset_int_vec(RhsQAsymm8),
+                       _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary,
+                       _qasymm8_weights.total_size());
+        _aux_mem[RhsReshape] = MemoryInfo(
+            offset_int_vec(RhsReshape),
+            _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size());
     }
-    if(_a_offset != 0)
+    if (_a_offset != 0)
     {
-        _aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size());
+        _aux_mem[VecSumCol] =
+            MemoryInfo(offset_int_vec(VecSumCol),
+                       _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary,
+                       _vector_sum_col.total_size());
     }
-    if(_b_offset != 0)
+    if (_b_offset != 0)
     {
-        _aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
+        _aux_mem[VecSumRow] =
+            MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size());
     }
-    _aux_mem[ResultS32]   = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
-    _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size());
-    _aux_mem[Shifts]      = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size());
+    _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size());
+    _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent,
+                                       _gemm_output_stage_multipliers.total_size());
+    _aux_mem[Shifts] =
+        MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size());
 }
 
-Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+                                              const ITensorInfo *b,
+                                              const ITensorInfo *c,
+                                              const ITensorInfo *output,
+                                              const GEMMInfo    &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL);
     ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
@@ -492,39 +582,44 @@
     const GPUTarget gpu_target = CLScheduler::get().target();
 
     bool               reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d();
-    const unsigned int m                       = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
-    const unsigned int n                       = b->dimension(0);
-    const unsigned int k                       = a->dimension(0);
-    const unsigned int batch_size              = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
-    const int          depth_output_gemm3d     = gemm_info.depth_output_gemm3d();
+    const unsigned int m          = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1);
+    const unsigned int n          = b->dimension(0);
+    const unsigned int k          = a->dimension(0);
+    const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2);
+    const int          depth_output_gemm3d = gemm_info.depth_output_gemm3d();
 
-    bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run()));
+    bool reshape_matrix_b = is_gemm_reshaped(
+        auto_select_gemm_kernel(auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size},
+                                gemm_info.reshape_b_only_on_first_run()));
 
     const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d);
 
-    bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type())
-                              && is_data_type_quantized_asymmetric(a->data_type());
+    bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) &&
+                              is_data_type_quantized_symmetric(b->data_type()) &&
+                              is_data_type_quantized_asymmetric(a->data_type());
     TensorInfo weights_info(*b);
-    if(convert_to_qasymm8)
+    if (convert_to_qasymm8)
     {
         b_offset = -128;
         weights_info.set_data_type(DataType::QASYMM8);
         ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP));
     }
     const ITensorInfo *matrix_b_info = &weights_info;
-    if(reshape_matrix_b)
+    if (reshape_matrix_b)
     {
         matrix_b_info = &tmp_b_info;
 
         // Pick up the GEMM configuration
         // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
         // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
-        const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
-        lhs_info       = res.lhs_info;
-        rhs_info       = res.rhs_info;
+        const auto res = select_default_gemm_config_reshaped_only_rhs(
+            auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size});
+        lhs_info = res.lhs_info;
+        rhs_info = res.rhs_info;
 
         // Validate reshape RHS kernel
-        auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
+        auto_init_if_empty(tmp_b_info,
+                           weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info)));
         ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info));
     }
 
@@ -533,21 +628,23 @@
 
     const GEMMLowpReductionKernelInfo reduction_info;
     // Validate matrix B reduction kernel only if _a_offset is not equal to 0
-    if(a_offset != 0)
+    if (a_offset != 0)
     {
         info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32);
 
         // Validate Matrix B reduction kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info));
     }
 
     // Validate Matrix A reduction kernel only if _b_offset is not equal to 0
-    if(b_offset != 0)
+    if (b_offset != 0)
     {
         info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32);
 
         // Validate Matrix A reduction kernel
-        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info));
     }
 
     GEMMKernelInfo gemm_kernel_info;
@@ -560,92 +657,99 @@
     gemm_kernel_info.rhs_info                = rhs_info;
     gemm_kernel_info.a_offset                = a_offset;
     gemm_kernel_info.b_offset                = b_offset;
-    if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
+    if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE)
     {
-        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+        const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel)
+                                       ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size()
+                                       : 1;
 
-        const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
+        const TensorInfo gemm_output_stage_multipliers_shifts_info(
+            TensorInfo(TensorShape(num_filters), 1, DataType::S32));
 
         GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage();
         gemmlowp_output_stage.output_data_type        = a->data_type();
 
         gemm_kernel_info.output_stage = gemmlowp_output_stage;
-        if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
+        if (reshape_matrix_b &&
+            gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info,
-                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
-                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
-                                                                                                c,
-                                                                                                &gemm_output_stage_multipliers_shifts_info,
-                                                                                                &gemm_output_stage_multipliers_shifts_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+                matrix_a_info, matrix_b_info, output, gemm_kernel_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+                b_offset == 0 ? nullptr : &info_vector_sum_row, c, &gemm_output_stage_multipliers_shifts_info,
+                &gemm_output_stage_multipliers_shifts_info));
         }
         else
         {
             TensorInfo mm_result_s32_info{};
 
-            if(reshape_matrix_b)
+            if (reshape_matrix_b)
             {
                 // Output tensor auto initialization if not yet initialized
-                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32));
+                auto_init_if_empty(mm_result_s32_info, a->clone()
+                                                           ->set_tensor_shape(compute_mm_shape(
+                                                               *matrix_a_info, *matrix_b_info, reshape_info))
+                                                           .set_data_type(DataType::S32));
 
                 // Validate matrix multiply
-                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
+                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+                    matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info));
             }
             else
             {
                // Output tensor auto initialization if not yet initialized
-                auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32));
+                auto_init_if_empty(mm_result_s32_info, a->clone()
+                                                           ->set_tensor_shape(compute_mm_shape(
+                                                               *matrix_a_info, *matrix_b_info, false, reshape_info))
+                                                           .set_data_type(DataType::S32));
 
                 // Pick up the GEMM configuration
                 // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails
                // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
-                const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
-                lhs_info       = res.lhs_info;
-                rhs_info       = res.rhs_info;
+                const auto res = select_default_gemm_config_native(
+                    auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size});
+                lhs_info = res.lhs_info;
+                rhs_info = res.rhs_info;
 
                 // Validate matrix multiply
-                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
+                ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(
+                    matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info));
             }
 
             // Validate offset contribution kernel
-            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info,
-                                                                                                a_offset == 0 ? nullptr : &info_vector_sum_col,
-                                                                                                b_offset == 0 ? nullptr : &info_vector_sum_row,
-                                                                                                c,
-                                                                                                output,
-                                                                                                a_offset, b_offset,
-                                                                                                gemmlowp_output_stage,
-                                                                                                &gemm_output_stage_multipliers_shifts_info,
-                                                                                                &gemm_output_stage_multipliers_shifts_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(
+                &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col,
+                b_offset == 0 ? nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, gemmlowp_output_stage,
+                &gemm_output_stage_multipliers_shifts_info, &gemm_output_stage_multipliers_shifts_info));
         }
     }
     else
     {
-        if(reshape_matrix_b)
+        if (reshape_matrix_b)
         {
             // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(
+                matrix_a_info, matrix_b_info, output, gemm_kernel_info));
         }
         else
         {
             // Pick up the GEMM configuration
            // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration
-            const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size });
-            lhs_info       = res.lhs_info;
-            rhs_info       = res.rhs_info;
+            const auto res = select_default_gemm_config_native(
+                auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size});
+            lhs_info = res.lhs_info;
+            rhs_info = res.rhs_info;
 
             // Validate matrix multiply
-            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(
+                matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info));
         }
 
-        if(output->total_size() != 0)
+        if (output->total_size() != 0)
         {
             // Validate offset contribution kernel
-            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output,
-                                                                                     a_offset == 0 ? nullptr : &info_vector_sum_col,
-                                                                                     b_offset == 0 ? nullptr : &info_vector_sum_row,
-                                                                                     c,
-                                                                                     a_offset, b_offset));
+            ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(
+                output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row,
+                c, a_offset, b_offset));
         }
     }
 
@@ -675,73 +779,61 @@
     const ITensor *matrix_a = a;
     const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b;
 
-    if(is_gemm_reshaped(_gemm_kernel_type))
+    if (is_gemm_reshaped(_gemm_kernel_type))
     {
         matrix_b = tmp_b.get();
-        if(!_reshape_b_only_on_first_run)
+        if (!_reshape_b_only_on_first_run)
         {
             // Run reshape matrix B
-            ITensorPack mtx_b_reshape_pack =
-            {
-                { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
-                { TensorType::ACL_DST, tmp_b.get() }
-            };
+            ITensorPack mtx_b_reshape_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+                                              {TensorType::ACL_DST, tmp_b.get()}};
             CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false);
         }
     }
 
     // Run matrix B reduction kernel only if _a_offset is not equal to 0
-    if(_a_offset != 0 && !_reshape_b_only_on_first_run)
+    if (_a_offset != 0 && !_reshape_b_only_on_first_run)
     {
-        ITensorPack mtx_b_red_pack =
-        {
-            { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
-            { TensorType::ACL_DST, vec_sum_col.get() }
-        };
+        ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+                                      {TensorType::ACL_DST, vec_sum_col.get()}};
         CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
     }
 
     // Run matrix A reduction kernel only if _b_offset is not equal to 0
-    if(_b_offset != 0)
+    if (_b_offset != 0)
     {
-        ITensorPack mtx_a_red_pack =
-        {
-            { TensorType::ACL_SRC, matrix_a },
-            { TensorType::ACL_DST, vec_sum_row.get() }
-        };
+        ITensorPack mtx_a_red_pack = {{TensorType::ACL_SRC, matrix_a}, {TensorType::ACL_DST, vec_sum_row.get()}};
         CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false);
     }
 
     // Run matrix multiply
-    if(is_gemm_reshaped(_gemm_kernel_type))
+    if (is_gemm_reshaped(_gemm_kernel_type))
     {
         ITensorPack gemm_reshaped_pack;
-        if(_run_offset_contribution)
+        if (_run_offset_contribution)
         {
-            gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, matrix_a },
-                { TensorType::ACL_SRC_1, matrix_b },
-                { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst }
-            });
+            gemm_reshaped_pack = ITensorPack({{TensorType::ACL_SRC_0, matrix_a},
+                                              {TensorType::ACL_SRC_1, matrix_b},
+                                              {TensorType::ACL_DST, _run_output_stage ? res32.get() : dst}});
         }
         else
         {
-            gemm_reshaped_pack = ITensorPack(
-            {
-                { TensorType::ACL_SRC, matrix_a },
-                { TensorType::ACL_SRC_1, matrix_b },
-                { TensorType::ACL_BIAS, c },
-                { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
-                { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
-                { TensorType::ACL_SHIFTS, shifts.get() },
-                { TensorType::ACL_MULTIPLIERS, multipliers.get() },
-                { TensorType::ACL_DST, dst },
+            gemm_reshaped_pack = ITensorPack({
+                {TensorType::ACL_SRC, matrix_a},
+                {TensorType::ACL_SRC_1, matrix_b},
+                {TensorType::ACL_BIAS, c},
+                {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()},
+                {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()},
+                {TensorType::ACL_SHIFTS, shifts.get()},
+                {TensorType::ACL_MULTIPLIERS, multipliers.get()},
+                {TensorType::ACL_DST, dst},
             });
         }
-        if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
+        if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS)
         {
             CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false);
         }
-        else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
+        else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL)
         {
             CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false);
         }
@@ -752,46 +844,39 @@
     }
     else
     {
-        ITensorPack gemm_native_pack =
-        {
-            { TensorType::ACL_SRC_0, matrix_a },
-            { TensorType::ACL_SRC_1, matrix_b },
-            { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() }
-        };
+        ITensorPack gemm_native_pack = {{TensorType::ACL_SRC_0, matrix_a},
+                                        {TensorType::ACL_SRC_1, matrix_b},
+                                        {TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get()}};
         CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false);
     }
-    if(_run_output_stage)
+    if (_run_output_stage)
     {
         // Run offset contribution/output stage kernel
-        ITensorPack output_stage_pack =
-        {
-            { TensorType::ACL_SRC, res32.get() },
-            { TensorType::ACL_BIAS, c },
-            { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
-            { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() },
-            { TensorType::ACL_SHIFTS, shifts.get() },
-            { TensorType::ACL_MULTIPLIERS, multipliers.get() },
-            { TensorType::ACL_DST, dst },
+        ITensorPack output_stage_pack = {
+            {TensorType::ACL_SRC, res32.get()},
+            {TensorType::ACL_BIAS, c},
+            {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()},
+            {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()},
+            {TensorType::ACL_SHIFTS, shifts.get()},
+            {TensorType::ACL_MULTIPLIERS, multipliers.get()},
+            {TensorType::ACL_DST, dst},
         };
         CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true);
     }
-    if(_run_offset_contribution)
+    if (_run_offset_contribution)
     {
         // Run offset contribution kernel
-        ITensorPack offset_contrib_pack =
-        {
-            { TensorType::ACL_SRC_DST, dst },
-            { TensorType::ACL_BIAS, c },
-            { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() },
-            { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }
-        };
+        ITensorPack offset_contrib_pack = {{TensorType::ACL_SRC_DST, dst},
+                                           {TensorType::ACL_BIAS, c},
+                                           {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()},
+                                           {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}};
         CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true);
     }
 }
 
 void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         auto               b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
         CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true);
@@ -800,56 +885,55 @@
 
         ARM_COMPUTE_ERROR_ON_NULLPTR(b);
 
-        if(_convert_to_qasymm8)
+        if (_convert_to_qasymm8)
         {
-            ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } };
+            ITensorPack convert_to_qs8_pack = {{ACL_SRC, b}, {ACL_DST, rhs_qasymm8.get()}};
             CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false);
             b->mark_as_unused();
         }
 
-        if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run)
+        if (is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run)
         {
             // Run reshape kernel and mark original weights tensor as unused
-            ITensorPack mtx_b_pack =
-            {
-                { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
-                { TensorType::ACL_DST, tmp_b.get() }
-            };
+            ITensorPack mtx_b_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+                                      {TensorType::ACL_DST, tmp_b.get()}};
             CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false);
             b->mark_as_unused();
         }
 
         // Run matrix B reduction kernel only if _a_offset is not equal to 0
-        if(_a_offset != 0 && _reshape_b_only_on_first_run)
+        if (_a_offset != 0 && _reshape_b_only_on_first_run)
         {
-            ITensorPack mtx_b_red_pack =
-            {
-                { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b },
-                { TensorType::ACL_DST, vec_sum_col.get() }
-            };
+            ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b},
+                                          {TensorType::ACL_DST, vec_sum_col.get()}};
             CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false);
         }
 
         // Compute GEMM output multipliers and shifts for output stage
         {
-            const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1;
+            const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel)
+                                           ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size()
+                                           : 1;
 
             CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false);
             CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false);
 
             ICLTensor *multiplier_tensor = multipliers.get();
-            if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0)
+            if (multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0)
             {
                 multiplier_tensor->map(CLScheduler::get().queue(), true);
-                std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t));
+                std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)),
+                            _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(),
+                            num_filters * sizeof(int32_t));
                 multiplier_tensor->unmap(CLScheduler::get().queue());
             }
 
             ICLTensor *shifts_tensor = shifts.get();
-            if(shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0)
+            if (shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0)
             {
                 shifts_tensor->map(CLScheduler::get().queue(), true);
-                std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
+                std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)),
+                            _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t));
                 shifts_tensor->unmap(CLScheduler::get().queue());
             }
         }
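
The hunks above are representative of the mechanical changes applied across the run(), prepare() and validate() paths: a space after control-flow keywords (`if (`, `switch (`), braced initializer lists without inner padding ({TensorType::ACL_SRC, b}), and long kernel validate() calls rewrapped against a wider column limit. The revised .clang-format file is not part of this delivery; the fragment below is a sketch of clang-format 14 options that would reproduce these hunks, inferred from the diff rather than taken from the shipped configuration:

    # Inferred sketch, not the delivered .clang-format
    ColumnLimit:          120                # long kernel::validate() calls now wrap near 120 columns
    BreakBeforeBraces:    Allman             # braces keep their own line, as before
    SpaceBeforeParens:    ControlStatements  # 'if (...)', 'switch (...)'; call sites stay unspaced
    Cpp11BracedListStyle: true               # '{AclCpu, AclGpuOcl}' rather than '{ AclCpu, AclGpuOcl }'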
diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
index 6e32a90..c80dc3a 100644
--- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
+++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h
@@ -93,18 +93,27 @@
      * @param[in]  gemm_info       (Optional) Specifies if the matrix A and/or matrix B have been reshaped and
      *                       if the reshape of matrix B should be executed only for the first run
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+    void configure(const CLCompileContext &compile_context,
+                   ITensorInfo            *a,
+                   ITensorInfo            *b,
+                   ITensorInfo            *c,
+                   ITensorInfo            *output,
+                   const GEMMInfo         &gemm_info = GEMMInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClGemmLowpMatrixMultiplyCore::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo());
+    static Status validate(const ITensorInfo *a,
+                           const ITensorInfo *b,
+                           const ITensorInfo *c,
+                           const ITensorInfo *output,
+                           const GEMMInfo    &gemm_info = GEMMInfo());
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &constants) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &constants) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
@@ -130,7 +139,7 @@
     std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel>                  _mtx_a_reduction_kernel;
     std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel>                  _mtx_b_reduction_kernel;
     std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel>                _offset_contribution_kernel;
-    std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel>     _offset_contribution_output_stage_kernel;
+    std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel;
 
     // Temporary tensors
     TensorInfo _qasymm8_weights{};
@@ -141,13 +150,13 @@
     TensorInfo _gemm_output_stage_multipliers{};
     TensorInfo _gemm_output_stage_shifts{};
 
-    int32_t          _a_offset{ 0 };
-    int32_t          _b_offset{ 0 };
-    bool             _reshape_b_only_on_first_run{ false };
-    bool             _run_output_stage{ false };
-    bool             _convert_to_qasymm8{ false };
-    bool             _run_offset_contribution{ false };
-    bool             _is_prepared{ false };
+    int32_t          _a_offset{0};
+    int32_t          _b_offset{0};
+    bool             _reshape_b_only_on_first_run{false};
+    bool             _run_output_stage{false};
+    bool             _convert_to_qasymm8{false};
+    bool             _run_offset_contribution{false};
+    bool             _is_prepared{false};
     GEMMInfo         _gemm_info{};
     CLGEMMKernelType _gemm_kernel_type{};
 
diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
index a61b11a..e3363e3 100644
--- a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
+++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp
@@ -27,22 +27,25 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h"
 #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h"
 #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
 {
-void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+void ClGemmLowpOutputStage::configure(const CLCompileContext        &compile_context,
+                                      const ITensorInfo             *src,
+                                      const ITensorInfo             *bias,
+                                      ITensorInfo                   *dst,
+                                      const GEMMLowpOutputStageInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info);
 
-    switch(info.type)
+    switch (info.type)
     {
         case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
         {
@@ -70,12 +73,16 @@
     }
 }
 
-Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info)
+Status ClGemmLowpOutputStage::validate(const ITensorInfo             *src,
+                                       const ITensorInfo             *bias,
+                                       const ITensorInfo             *dst,
+                                       const GEMMLowpOutputStageInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM16);
 
-    switch(info.type)
+    switch (info.type)
     {
         case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT:
             return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(src, bias, dst, &info);
@@ -94,7 +101,7 @@
     const ITensor *bias = tensors.get_const_tensor(ACL_BIAS);
     ITensor       *dst  = tensors.get_tensor(ACL_DST);
 
-    ITensorPack pack{ { ACL_SRC, src }, { ACL_BIAS, bias }, { ACL_DST, dst } };
+    ITensorPack pack{{ACL_SRC, src}, {ACL_BIAS, bias}, {ACL_DST, dst}};
     CLScheduler::get().enqueue_op(*_kernel, pack, true);
 }
 } // namespace opencl
diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.h b/src/gpu/cl/operators/ClGemmLowpOutputStage.h
index 3f1b04d..6357e02 100644
--- a/src/gpu/cl/operators/ClGemmLowpOutputStage.h
+++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.h
@@ -71,14 +71,21 @@
      * @param[out] dst             Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
      * @param[in]  info            GEMMLowp output stage metadata.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+    void configure(const CLCompileContext        &compile_context,
+                   const ITensorInfo             *src,
+                   const ITensorInfo             *bias,
+                   ITensorInfo                   *dst,
+                   const GEMMLowpOutputStageInfo &info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClGemmLowpOutputStage::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info);
+    static Status validate(const ITensorInfo             *src,
+                           const ITensorInfo             *bias,
+                           const ITensorInfo             *dst,
+                           const GEMMLowpOutputStageInfo &info);
 
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
diff --git a/src/gpu/cl/operators/ClIndirectConv2d.cpp b/src/gpu/cl/operators/ClIndirectConv2d.cpp
index b900974..777fc9e 100644
--- a/src/gpu/cl/operators/ClIndirectConv2d.cpp
+++ b/src/gpu/cl/operators/ClIndirectConv2d.cpp
@@ -27,15 +27,14 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h"
-#include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h"
-#include "src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h"
-#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h"
-
-#include "src/core/helpers/MemoryHelpers.h"
-#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h"
+#include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h"
+#include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h"
 
 using namespace arm_compute::cl_indirect_conv;
 
@@ -47,7 +46,8 @@
 
 namespace
 {
-DirectConvComputeKernelInfo config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo
+config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info)
 {
     // Get GPU target
     GPUTarget gpu_target = CLScheduler::get().target();
@@ -59,8 +59,13 @@
 
 } // namespace
 
-void ClIndirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
-                                 const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void ClIndirectConv2d::configure(const CLCompileContext    &compile_context,
+                                 ITensorInfo               *src,
+                                 ITensorInfo               *weights,
+                                 ITensorInfo               *biases,
+                                 ITensorInfo               *dst,
+                                 const PadStrideInfo       &conv_info,
+                                 const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info);
@@ -86,25 +91,29 @@
     CLScheduler::get().tune_kernel_static(*_indirect_conv_kernel);
 
     // Request memory for the indirect buffer
-    _aux_mem[IndirectBuffer] = MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, _indirect_buffer.total_size());
+    _aux_mem[IndirectBuffer] =
+        MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, _indirect_buffer.total_size());
 }
 
-Status ClIndirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                  const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+Status ClIndirectConv2d::validate(const ITensorInfo         *src,
+                                  const ITensorInfo         *weights,
+                                  const ITensorInfo         *biases,
+                                  const ITensorInfo         *dst,
+                                  const PadStrideInfo       &conv_info,
+                                  const ActivationLayerInfo &act_info)
 {
     // Initialize the direct convolution descriptor
     const DirectConvComputeKernelInfo desc = config_indirect_convolution_nhwc(src, weights, conv_info);
 
-    TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(),
-                                                                                         src->data_layout(),
-                                                                                         weights->tensor_shape(),
-                                                                                         conv_info,
-                                                                                         desc);
+    TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape(
+        src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc);
 
     TensorInfo indirect_buffer(ind_buffer_shape, 1, DataType::S32);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate(src, weights, &indirect_buffer, conv_info, desc));
-    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst, conv_info, act_info, desc));
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate(
+        src, weights, &indirect_buffer, conv_info, desc));
+    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst,
+                                                                          conv_info, act_info, desc));
 
     return Status{};
 }
@@ -124,9 +133,10 @@
 
 void ClIndirectConv2d::prepare(ITensorPack &constants)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
-        ICLTensor *indirect_buffer_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer)));
+        ICLTensor *indirect_buffer_aux =
+            utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer)));
         ARM_COMPUTE_ERROR_ON(indirect_buffer_aux == nullptr);
 
         ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Preparing indirect buffer");
@@ -134,7 +144,7 @@
         CLAuxTensorHandler indirect_buffer(_indirect_buffer, *indirect_buffer_aux);
         ARM_COMPUTE_ERROR_ON(indirect_buffer.get()->cl_buffer().get() == nullptr);
 
-        ITensorPack indirect_buffer_pack{ { ACL_DST, indirect_buffer.get() } };
+        ITensorPack indirect_buffer_pack{{ACL_DST, indirect_buffer.get()}};
         CLScheduler::get().enqueue_op(*_addr_precalculation_kernel, indirect_buffer_pack, true);
 
         _is_prepared = true;
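
One further wrapping rule is visible in config_indirect_convolution_nhwc above (and in ClPRelu::configure further down): when a declaration still overflows the column limit, the return type is allowed to drop onto its own line. In clang-format terms this is governed by a penalty; the value below is only a guess that illustrates the trade-off, not a known setting from the actual configuration:

    # Inferred sketch; the actual penalty value is unknown
    PenaltyReturnTypeOnItsOwnLine: 60   # cheap enough that breaking after the return type can win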
diff --git a/src/gpu/cl/operators/ClIndirectConv2d.h b/src/gpu/cl/operators/ClIndirectConv2d.h
index e50fa25..29e796e 100644
--- a/src/gpu/cl/operators/ClIndirectConv2d.h
+++ b/src/gpu/cl/operators/ClIndirectConv2d.h
@@ -77,7 +77,12 @@
      * @param[in]  act_info        (Optional) Activation layer information in case of a fused activation.
      *
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src,
+                   ITensorInfo               *weights,
+                   ITensorInfo               *biases,
+                   ITensorInfo               *dst,
+                   const PadStrideInfo       &conv_info,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
@@ -85,12 +90,16 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *dst,
+                           const PadStrideInfo       &conv_info,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
 
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &constants) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &constants) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
@@ -100,11 +109,11 @@
         Count
     };
 
-    std::unique_ptr<IClKernel>       _indirect_conv_kernel{ nullptr };
-    std::unique_ptr<IClKernel>       _addr_precalculation_kernel{ nullptr };
+    std::unique_ptr<IClKernel>       _indirect_conv_kernel{nullptr};
+    std::unique_ptr<IClKernel>       _addr_precalculation_kernel{nullptr};
     TensorInfo                       _indirect_buffer{};
-    bool                             _is_prepared{ false };
-    experimental::MemoryRequirements _aux_mem{ Count };
+    bool                             _is_prepared{false};
+    experimental::MemoryRequirements _aux_mem{Count};
 };
 } // namespace opencl
 } // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClLogicalNot.cpp b/src/gpu/cl/operators/ClLogicalNot.cpp
index b2eb89b..d8d4186 100644
--- a/src/gpu/cl/operators/ClLogicalNot.cpp
+++ b/src/gpu/cl/operators/ClLogicalNot.cpp
@@ -23,11 +23,10 @@
  */
 #include "src/gpu/cl/operators/ClLogicalNot.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
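
ClLogicalNot.cpp above is the minimal instance of a change repeated in most of these files: "src/common/utils/Log.h" no longer sits in its own trailing block but is merged into a single alphabetically sorted group, with "arm_compute/..." and "src/..." includes kept apart. A plausible configuration for that behaviour — again inferred, and the category regexes in particular are guesses that merely match the observed grouping:

    # Inferred sketch; regexes are guesses that match the observed grouping
    SortIncludes:  CaseSensitive
    IncludeBlocks: Regroup             # merge adjacent include blocks, then regroup by category
    IncludeCategories:
      - Regex:    '^"arm_compute/'
        Priority: 1
      - Regex:    '^"src/'
        Priority: 2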
diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp
index 49d1412..c14b1f2 100644
--- a/src/gpu/cl/operators/ClMatMul.cpp
+++ b/src/gpu/cl/operators/ClMatMul.cpp
@@ -47,11 +47,17 @@
 {
 }
 
-Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
+Status ClMatMul::validate(const ITensorInfo         *lhs,
+                          const ITensorInfo         *rhs,
+                          const ITensorInfo         *dst,
+                          const MatMulInfo          &matmul_info,
+                          const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
 
     const GPUTarget gpu_target = CLScheduler::get().target();
 
@@ -61,11 +67,16 @@
 
     const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type());
 
-    return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info) :
-           ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+    return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info)
+                        : ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
 }
 
-void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
+void ClMatMul::configure(const CLCompileContext    &compile_context,
+                         ITensorInfo               *lhs,
+                         ITensorInfo               *rhs,
+                         ITensorInfo               *dst,
+                         const MatMulInfo          &matmul_info,
+                         const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst);
     ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info);
@@ -81,12 +92,13 @@
 
     MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info);
 
-    if(_is_quantized)
+    if (_is_quantized)
     {
         _matmul_lowp_native_kernel->set_target(gpu_target);
 
         // Configure the low-precision native matrix multiply kernel
-        _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info);
+        _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info,
+                                              act_info);
     }
     else
     {
@@ -99,7 +111,7 @@
 
 void ClMatMul::run(ITensorPack &tensors)
 {
-    if(_is_quantized)
+    if (_is_quantized)
     {
         CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, tensors, true);
     }
diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h
index abbb752..64dcf21 100644
--- a/src/gpu/cl/operators/ClMatMul.h
+++ b/src/gpu/cl/operators/ClMatMul.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/function_info/MatMulInfo.h"
+
 #include "src/gpu/cl/IClOperator.h"
 #include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h"
 #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
@@ -73,7 +74,11 @@
      * @param[in]  matmul_info     Contains MatMul operation information described in @ref MatMulInfo.
      * @param[in]  act_info        Class containing information about fused activation function.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info,
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *lhs,
+                   ITensorInfo               *rhs,
+                   ITensorInfo               *dst,
+                   const MatMulInfo          &matmul_info,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
@@ -81,15 +86,19 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *lhs,
+                           const ITensorInfo         *rhs,
+                           const ITensorInfo         *dst,
+                           const MatMulInfo          &matmul_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
     // Inherited methods overridden:
     void run(ITensorPack &tensors) override;
 
 private:
-    std::unique_ptr<kernels::ClMatMulNativeKernel>     _matmul_native_kernel{ nullptr };
-    std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{ nullptr };
+    std::unique_ptr<kernels::ClMatMulNativeKernel>     _matmul_native_kernel{nullptr};
+    std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{nullptr};
 
-    bool _is_quantized{ false };
+    bool _is_quantized{false};
 };
 } // namespace opencl
 } // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClMul.cpp b/src/gpu/cl/operators/ClMul.cpp
index 2066f0c..10cf8a6 100644
--- a/src/gpu/cl/operators/ClMul.cpp
+++ b/src/gpu/cl/operators/ClMul.cpp
@@ -24,17 +24,23 @@
 #include "src/gpu/cl/operators/ClMul.h"
 
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/gpu/cl/ClCompileContext.h"
-#include "src/gpu/cl/kernels/ClMulKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClMulKernel.h"
 
 namespace arm_compute
 {
 namespace opencl
 {
-void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
-                      ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void ClMul::configure(const CLCompileContext    &compile_context,
+                      ITensorInfo               *src1,
+                      ITensorInfo               *src2,
+                      ITensorInfo               *dst,
+                      float                      scale,
+                      ConvertPolicy              overflow_policy,
+                      RoundingPolicy             rounding_policy,
+                      const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
     auto k = std::make_unique<kernels::ClMulKernel>();
@@ -42,22 +48,34 @@
     _kernel = std::move(k);
 }
 
-Status ClMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
-                       ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+Status ClMul::validate(const ITensorInfo         *src1,
+                       const ITensorInfo         *src2,
+                       const ITensorInfo         *dst,
+                       float                      scale,
+                       ConvertPolicy              overflow_policy,
+                       RoundingPolicy             rounding_policy,
+                       const ActivationLayerInfo &act_info)
 {
     return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info);
 }
 
-void ClComplexMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+void ClComplexMul::configure(const CLCompileContext    &compile_context,
+                             ITensorInfo               *src1,
+                             ITensorInfo               *src2,
+                             ITensorInfo               *dst,
+                             const ActivationLayerInfo &act_info)
 {
     auto k = std::make_unique<kernels::ClComplexMulKernel>();
     k->configure(compile_context, src1, src2, dst, act_info);
     _kernel = std::move(k);
 }
 
-Status ClComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+Status ClComplexMul::validate(const ITensorInfo         *src1,
+                              const ITensorInfo         *src2,
+                              const ITensorInfo         *dst,
+                              const ActivationLayerInfo &act_info)
 {
     return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info);
 }
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClMul.h b/src/gpu/cl/operators/ClMul.h
index 6086bc9..1cf4d68 100644
--- a/src/gpu/cl/operators/ClMul.h
+++ b/src/gpu/cl/operators/ClMul.h
@@ -66,16 +66,27 @@
      * @param[in]      rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
-                   ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   float                      scale,
+                   ConvertPolicy              overflow_policy,
+                   RoundingPolicy             rounding_policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClMul::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
-                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           float                      scale,
+                           ConvertPolicy              overflow_policy,
+                           RoundingPolicy             rounding_policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 };
 
 /** Basic function to run @ref opencl::kernels::ClComplexMulKernel */
@@ -92,14 +103,21 @@
      * @param[out]     dst             The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    void configure(const CLCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to @ref ClComplexMul::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
 };
 } // namespace opencl
 } // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPRelu.cpp b/src/gpu/cl/operators/ClPRelu.cpp
index cf4ebe6..f3efd00 100644
--- a/src/gpu/cl/operators/ClPRelu.cpp
+++ b/src/gpu/cl/operators/ClPRelu.cpp
@@ -23,16 +23,18 @@
  */
 #include "src/gpu/cl/operators/ClPRelu.h"
 
-#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
 namespace arm_compute
 {
 namespace opencl
 {
 using KernelType = kernels::ClArithmeticKernel;
-void ClPRelu::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output)
+void ClPRelu::configure(const CLCompileContext &compile_context,
+                        ITensorInfo            *input,
+                        ITensorInfo            *alpha,
+                        ITensorInfo            *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input, alpha, output);
     auto k = std::make_unique<KernelType>();
@@ -49,7 +51,7 @@
 {
     // Output tensor can be given as nullptr for in-place computation.
     // In this case, get the input tensor and use it as the output tensor.
-    if(tensors.get_tensor(TensorType::ACL_DST) == nullptr)
+    if (tensors.get_tensor(TensorType::ACL_DST) == nullptr)
     {
         auto src_tensor = const_cast<ITensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
         ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation");
@@ -58,4 +60,4 @@
     IClOperator::run(tensors);
 }
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPRelu.h b/src/gpu/cl/operators/ClPRelu.h
index 8084ab8..45ce858 100644
--- a/src/gpu/cl/operators/ClPRelu.h
+++ b/src/gpu/cl/operators/ClPRelu.h
@@ -47,7 +47,8 @@
      * @param[in]  alpha           PRelu layer parameters. Data types supported: same as @p input.
      * @param[out] output          Destination tensor. Data type supported: same as @p input
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output);
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClPRelu::configure()
diff --git a/src/gpu/cl/operators/ClPermute.cpp b/src/gpu/cl/operators/ClPermute.cpp
index ed56f97..3851e22 100644
--- a/src/gpu/cl/operators/ClPermute.cpp
+++ b/src/gpu/cl/operators/ClPermute.cpp
@@ -23,16 +23,18 @@
  */
 #include "src/gpu/cl/operators/ClPermute.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClPermuteKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
 {
-void ClPermute::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
+void ClPermute::configure(const ClCompileContext  &compile_context,
+                          const ITensorInfo       *src,
+                          ITensorInfo             *dst,
+                          const PermutationVector &perm)
 {
     ARM_COMPUTE_LOG_PARAMS(src, dst, perm);
     auto k = std::make_unique<kernels::ClPermuteKernel>();
@@ -45,4 +47,4 @@
     return kernels::ClPermuteKernel::validate(src, dst, perm);
 }
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPermute.h b/src/gpu/cl/operators/ClPermute.h
index 3e87329..6349358 100644
--- a/src/gpu/cl/operators/ClPermute.h
+++ b/src/gpu/cl/operators/ClPermute.h
@@ -44,7 +44,10 @@
      * @param[in] dst             The dst tensor info. Data types supported: Same as @p src
      * @param[in] perm            Permutation vector
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm);
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       *src,
+                   ITensorInfo             *dst,
+                   const PermutationVector &perm);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClPermute::configure()
@@ -55,4 +58,4 @@
 };
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_PERMUTE_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_PERMUTE_H */
diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp
index 3da90b8..e4507dc 100644
--- a/src/gpu/cl/operators/ClPool2d.cpp
+++ b/src/gpu/cl/operators/ClPool2d.cpp
@@ -25,16 +25,19 @@
 
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClPool2dKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
 {
-void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices)
+void ClPool2d::configure(const ClCompileContext &compile_context,
+                         ITensorInfo            *src,
+                         ITensorInfo            *dst,
+                         const PoolingLayerInfo &info,
+                         ITensorInfo            *indices)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_LOG_PARAMS(src, dst, info, indices);
@@ -49,7 +52,10 @@
     CLScheduler::get().tune_kernel_static(*_kernel);
 }
 
-Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
+Status ClPool2d::validate(const ITensorInfo      *src,
+                          const ITensorInfo      *dst,
+                          const PoolingLayerInfo &info,
+                          const ITensorInfo      *indices)
 {
     return kernels::ClPool2dKernel::validate(src, dst, info, indices);
 }
diff --git a/src/gpu/cl/operators/ClPool2d.h b/src/gpu/cl/operators/ClPool2d.h
index f353ba2..9c2fd1c 100644
--- a/src/gpu/cl/operators/ClPool2d.h
+++ b/src/gpu/cl/operators/ClPool2d.h
@@ -50,14 +50,21 @@
      * @param[in]  info            Pooling layer parameters.
      * @param[out] indices         (Optional) The indices info of the maximal values. Data type supported: U32.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr);
+    void configure(const ClCompileContext &compile_context,
+                   ITensorInfo            *src,
+                   ITensorInfo            *dst,
+                   const PoolingLayerInfo &info,
+                   ITensorInfo            *indices = nullptr);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClPool2d::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr);
+    static Status validate(const ITensorInfo      *src,
+                           const ITensorInfo      *dst,
+                           const PoolingLayerInfo &info,
+                           const ITensorInfo      *indices = nullptr);
 };
 } // namespace opencl
 } // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPool3d.cpp b/src/gpu/cl/operators/ClPool3d.cpp
index 7dec6c5..d230413 100644
--- a/src/gpu/cl/operators/ClPool3d.cpp
+++ b/src/gpu/cl/operators/ClPool3d.cpp
@@ -25,16 +25,18 @@
 
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClPool3dKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
 {
-void ClPool3d::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info)
+void ClPool3d::configure(const ClCompileContext   &compile_context,
+                         const ITensorInfo        *src,
+                         ITensorInfo              *dst,
+                         const Pooling3dLayerInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_LOG_PARAMS(src, dst, info);
diff --git a/src/gpu/cl/operators/ClPool3d.h b/src/gpu/cl/operators/ClPool3d.h
index 7d994fd..9fd78bf 100644
--- a/src/gpu/cl/operators/ClPool3d.h
+++ b/src/gpu/cl/operators/ClPool3d.h
@@ -51,7 +51,10 @@
      * @param[out] dst             Destination tensor info.
      * @param[in]  info            3D Pooling layer parameters.
      */
-    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info);
+    void configure(const ClCompileContext   &compile_context,
+                   const ITensorInfo        *src,
+                   ITensorInfo              *dst,
+                   const Pooling3dLayerInfo &info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClPool3d::configure()
diff --git a/src/gpu/cl/operators/ClQuantize.cpp b/src/gpu/cl/operators/ClQuantize.cpp
index 47ae5ce..8560b55 100644
--- a/src/gpu/cl/operators/ClQuantize.cpp
+++ b/src/gpu/cl/operators/ClQuantize.cpp
@@ -25,10 +25,10 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/gpu/cl/ClCompileContext.h"
-#include "src/gpu/cl/kernels/ClQuantizeKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClQuantizeKernel.h"
 
 namespace arm_compute
 {
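The ClQuantize.cpp change above is a pure include-reordering hunk and shows the second recurring rule: includes are sorted case-insensitively and regrouped so that public arm_compute/ headers come before project-internal src/ headers (the ClWinogradConv2d hunk further down, where experimental/Types.h sorts before Utils.h, confirms the case-insensitive ordering). The mechanism is an assumption, e.g. SortIncludes: CaseInsensitive together with IncludeBlocks: Regroup and suitable IncludeCategories; for this file the resulting order is:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"

    #include "src/common/utils/Log.h"
    #include "src/gpu/cl/ClCompileContext.h"
    #include "src/gpu/cl/kernels/ClQuantizeKernel.h"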
diff --git a/src/gpu/cl/operators/ClReshape.cpp b/src/gpu/cl/operators/ClReshape.cpp
index 560966f..1dd5b76 100644
--- a/src/gpu/cl/operators/ClReshape.cpp
+++ b/src/gpu/cl/operators/ClReshape.cpp
@@ -23,11 +23,10 @@
  */
 #include "src/gpu/cl/operators/ClReshape.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClReshapeKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
@@ -45,4 +44,4 @@
     return kernels::ClReshapeKernel::validate(src, dst);
 }
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClScale.cpp b/src/gpu/cl/operators/ClScale.cpp
index 0798b19..184e2aa 100644
--- a/src/gpu/cl/operators/ClScale.cpp
+++ b/src/gpu/cl/operators/ClScale.cpp
@@ -25,17 +25,20 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClScaleKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
 {
-void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info)
+void ClScale::configure(const CLCompileContext &compile_context,
+                        ITensorInfo            *src,
+                        ITensorInfo            *dst,
+                        const ScaleKernelInfo  &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src);
     ARM_COMPUTE_LOG_PARAMS(src, dst, info);
@@ -61,4 +64,4 @@
     CLScheduler::get().enqueue_op(*_kernel.get(), tensors);
 }
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClScale.h b/src/gpu/cl/operators/ClScale.h
index af97cf2..1427bb4 100644
--- a/src/gpu/cl/operators/ClScale.h
+++ b/src/gpu/cl/operators/ClScale.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_SCALE_H
 
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
 
@@ -49,7 +50,8 @@
      *                                All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
      * @param[in]     info            @ref ScaleKernelInfo descriptor to be used to configure this function
      */
-    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
+    void
+    configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClScale::configure()
diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp
index 0380955..2bec400 100644
--- a/src/gpu/cl/operators/ClSoftmax.cpp
+++ b/src/gpu/cl/operators/ClSoftmax.cpp
@@ -22,7 +22,10 @@
  * SOFTWARE.
  */
 #include "src/gpu/cl/operators/ClSoftmax.h"
+
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/core/helpers/SoftmaxHelpers.h"
 #include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
@@ -30,8 +33,6 @@
 #include "src/gpu/cl/utils/ClAuxTensorHandler.h"
 #include "support/Cast.h"
 
-#include "src/common/utils/Log.h"
-
 using namespace arm_compute::experimental;
 
 namespace arm_compute
@@ -52,7 +53,10 @@
 {
 }
 
-void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info)
+void ClSoftmax::configure(const CLCompileContext  &compile_context,
+                          const ITensorInfo       &src,
+                          ITensorInfo             &dst,
+                          const SoftmaxKernelInfo &info)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info));
     ARM_COMPUTE_LOG_PARAMS(src, dst, info);
@@ -64,14 +68,15 @@
     const ITensorInfo &tmp_input_info  = _needs_permute ? _permuted_src_info : src;
     ITensorInfo       &tmp_output_info = _needs_permute ? _permuted_dst_info : dst;
 
-    if(_needs_permute)
+    if (_needs_permute)
     {
         const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
         _permute_input->configure(compile_context, &src, &_permuted_src_info, perm_info);
     }
 
-    DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? DataType::S32 : tmp_input_info.data_type();
-    _tmp_info              = tmp_input_info.clone()->set_data_type(tmp_data_type);
+    DataType tmp_data_type =
+        is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? DataType::S32 : tmp_input_info.data_type();
+    _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type);
 
     TensorShape max_sum_shape = tmp_input_info.tensor_shape();
     _max_info                 = tmp_input_info.clone()->set_tensor_shape(max_sum_shape);
@@ -83,33 +88,41 @@
     _max_shift_exp_sum_kernel->configure(compile_context, tmp_input_info, _max_info, _tmp_info, _sum_info, info);
     _norm_kernel->configure(compile_context, _tmp_info, _sum_info, tmp_output_info, info);
 
-    if(_needs_permute)
+    if (_needs_permute)
     {
         const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
         _permute_output->configure(compile_context, &_permuted_dst_info, &dst, perm_info);
     }
 
-    _aux_mem[InternalTensorIdx::SUM] = MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size());
-    _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size());
-    _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size());
+    _aux_mem[InternalTensorIdx::SUM] =
+        MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size());
+    _aux_mem[InternalTensorIdx::TMP] =
+        MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size());
+    _aux_mem[InternalTensorIdx::MAX] =
+        MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size());
 
-    _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _permuted_src_info.total_size());
-    _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _permuted_dst_info.total_size());
+    _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC),
+                                                           MemoryLifetime::Temporary, _permuted_src_info.total_size());
+    _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST),
+                                                           MemoryLifetime::Temporary, _permuted_dst_info.total_size());
 }
 
 Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(src.num_dimensions() > 4, "Only up to 4 dimensions are supported");
     ARM_COMPUTE_UNUSED(info.beta);
-    ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) || static_cast<int32_t>(src.num_dimensions()) <= info.axis);
+    ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) ||
+                                static_cast<int32_t>(src.num_dimensions()) <= info.axis);
 
-    const size_t actual_axis   = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
+    const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
     const bool   needs_permute = actual_axis != 0;
-    if(needs_permute)
+    if (needs_permute)
     {
-        const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
-        const TensorShape       permuted_shape     = misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector);
-        TensorInfo              input_permuted(src.clone()->set_tensor_shape(permuted_shape));
+        const PermutationVector permutation_vector =
+            softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+        const TensorShape permuted_shape =
+            misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector);
+        TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape));
         ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&src, &input_permuted, permutation_vector));
         TensorInfo output_permuted(dst.clone()->set_tensor_shape(permuted_shape));
         ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&output_permuted, &dst, permutation_vector));
@@ -122,9 +135,14 @@
     TensorShape max_sum_shape = src.tensor_shape();
     max_sum_shape.set(0, 1);
     TensorInfo tensor_info_max(src.clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true));
-    TensorInfo tensor_info_sum(src.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true));
+    TensorInfo tensor_info_sum(src.clone()
+                                   ->set_tensor_shape(max_sum_shape)
+                                   .set_data_type(tmp_data_type)
+                                   .set_quantization_info(QuantizationInfo())
+                                   .set_is_resizable(true));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum));
     ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DNormKernel::validate(tensor_info_tmp, tensor_info_sum, dst, info));
 
     return Status{};
@@ -139,10 +157,12 @@
     CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false);
     CLAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max_info, tensors, false);
 
-    CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, false);
-    CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, false);
+    CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors,
+                                    false);
+    CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors,
+                                    false);
 
-    if(_needs_permute)
+    if (_needs_permute)
     {
         ITensorPack pack;
         pack.add_const_tensor(TensorType::ACL_SRC, src);
@@ -152,7 +172,7 @@
 
     ITensorPack sum_pack;
     ITensorPack norm_pack;
-    if(_needs_permute)
+    if (_needs_permute)
     {
         sum_pack.add_const_tensor(TensorType::ACL_SRC, permuted_src.get());
         norm_pack.add_tensor(TensorType::ACL_DST, permuted_dst.get());
@@ -172,7 +192,7 @@
     CLScheduler::get().enqueue_op(*_max_shift_exp_sum_kernel.get(), sum_pack, false);
     CLScheduler::get().enqueue_op(*_norm_kernel.get(), norm_pack, false);
 
-    if(_needs_permute)
+    if (_needs_permute)
     {
         ITensorPack pack;
         pack.add_const_tensor(TensorType::ACL_SRC, permuted_dst.get());
@@ -186,4 +206,4 @@
     return _aux_mem;
 }
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClSoftmax.h b/src/gpu/cl/operators/ClSoftmax.h
index 6c9af58..6c2aaae 100644
--- a/src/gpu/cl/operators/ClSoftmax.h
+++ b/src/gpu/cl/operators/ClSoftmax.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_SOFTMAX_H
 
 #include "arm_compute/runtime/CL/CLTensor.h"
+
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
 
@@ -52,7 +53,10 @@
      * @param[out] dst             Destination tensor info. Data types supported: same as @p src
      * @param[in]  info            Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info);
+    void configure(const CLCompileContext  &compile_context,
+                   const ITensorInfo       &src,
+                   ITensorInfo             &dst,
+                   const SoftmaxKernelInfo &info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClSoftmax::configure()
@@ -61,7 +65,7 @@
      */
     static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info);
     // Inherited methods overridden:
-    void run(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
@@ -79,7 +83,7 @@
     std::unique_ptr<ClPermute>                               _permute_output;
     std::unique_ptr<kernels::ClLogits1DMaxShiftExpSumKernel> _max_shift_exp_sum_kernel;
     std::unique_ptr<kernels::ClLogits1DNormKernel>           _norm_kernel;
-    bool                                                     _needs_permute{ false };
+    bool                                                     _needs_permute{false};
 
     TensorInfo _max_info;
     TensorInfo _sum_info;
@@ -90,6 +94,6 @@
     experimental::MemoryRequirements _aux_mem{};
 };
 
-} // opencl
-} // arm_compute
-#endif /* ARM_COMPUTE_CL_SOFTMAX_H */
\ No newline at end of file
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_SOFTMAX_H */
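ClSoftmax, like the other operators touched here, is configured once against tensor infos and then executed against an ITensorPack carrying the concrete tensors plus any auxiliary buffers described by workspace(). A minimal usage sketch; compile_context, src_info/dst_info and the src/dst tensor pointers are illustrative, and only the beta and axis fields of SoftmaxKernelInfo (the two consulted by the validation code above) are set:

    // Hypothetical caller; error handling trimmed. src and dst are
    // ICLTensor pointers owned by the caller.
    arm_compute::opencl::ClSoftmax softmax;
    arm_compute::SoftmaxKernelInfo info{};
    info.beta = 1.0f; // plain (non-scaled) softmax
    info.axis = 0;    // reduce along the innermost dimension

    ARM_COMPUTE_ERROR_THROW_ON(arm_compute::opencl::ClSoftmax::validate(src_info, dst_info, info));
    softmax.configure(compile_context, src_info, dst_info, info);

    arm_compute::ITensorPack pack;
    pack.add_const_tensor(arm_compute::TensorType::ACL_SRC, src);
    pack.add_tensor(arm_compute::TensorType::ACL_DST, dst);
    softmax.run(pack);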
diff --git a/src/gpu/cl/operators/ClSub.cpp b/src/gpu/cl/operators/ClSub.cpp
index 53be04a..5c6d0c3 100644
--- a/src/gpu/cl/operators/ClSub.cpp
+++ b/src/gpu/cl/operators/ClSub.cpp
@@ -23,17 +23,20 @@
  */
 #include "src/gpu/cl/operators/ClSub.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
 {
-void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
-                      ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void ClSub::configure(const ClCompileContext    &compile_context,
+                      ITensorInfo               *src1,
+                      ITensorInfo               *src2,
+                      ITensorInfo               *dst,
+                      ConvertPolicy              policy,
+                      const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info);
     auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>();
@@ -41,8 +44,11 @@
     _kernel = std::move(k);
 }
 
-Status ClSub::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst,
-                       ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status ClSub::validate(const ITensorInfo         *src1,
+                       const ITensorInfo         *src2,
+                       const ITensorInfo         *dst,
+                       ConvertPolicy              policy,
+                       const ActivationLayerInfo &act_info)
 {
     return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info);
 }
diff --git a/src/gpu/cl/operators/ClSub.h b/src/gpu/cl/operators/ClSub.h
index 7eac437..6a97275 100644
--- a/src/gpu/cl/operators/ClSub.h
+++ b/src/gpu/cl/operators/ClSub.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_SUB_H
 
 #include "arm_compute/function_info/ActivationLayerInfo.h"
+
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
 
@@ -65,7 +66,11 @@
      * @param[in]      policy          Policy to use to handle overflow.
      * @param[in]      act_info        (Optional) Activation layer information in case of a fused activation.
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy,
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src1,
+                   ITensorInfo               *src2,
+                   ITensorInfo               *dst,
+                   ConvertPolicy              policy,
                    const ActivationLayerInfo &act_info = ActivationLayerInfo());
     /** Static function to check if given info will lead to a valid configuration
      *
@@ -73,7 +78,10 @@
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy,
+    static Status validate(const ITensorInfo         *src1,
+                           const ITensorInfo         *src2,
+                           const ITensorInfo         *dst,
+                           ConvertPolicy              policy,
                            const ActivationLayerInfo &act_info = ActivationLayerInfo());
 };
 } // namespace opencl
diff --git a/src/gpu/cl/operators/ClTranspose.cpp b/src/gpu/cl/operators/ClTranspose.cpp
index 26feffe..28da0d6 100644
--- a/src/gpu/cl/operators/ClTranspose.cpp
+++ b/src/gpu/cl/operators/ClTranspose.cpp
@@ -23,11 +23,10 @@
  */
 #include "src/gpu/cl/operators/ClTranspose.h"
 
+#include "src/common/utils/Log.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/kernels/ClTransposeKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace opencl
@@ -45,4 +44,4 @@
     return kernels::ClTransposeKernel::validate(src, dst);
 }
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClTransposedConvolution.cpp b/src/gpu/cl/operators/ClTransposedConvolution.cpp
index 90dbe7f..cec438f 100644
--- a/src/gpu/cl/operators/ClTransposedConvolution.cpp
+++ b/src/gpu/cl/operators/ClTransposedConvolution.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/gpu/cl/kernels/ClTransposedConvolutionKernel.h"
 
@@ -32,8 +33,12 @@
 {
 namespace opencl
 {
-void ClTransposedConvolution::configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights,
-                                        const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info)
+void ClTransposedConvolution::configure(const CLCompileContext &compile_context,
+                                        const ITensorInfo      *input,
+                                        const ITensorInfo      *weights,
+                                        const ITensorInfo      *biases,
+                                        ITensorInfo            *output,
+                                        const PadStrideInfo    &deconv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, deconv_info);
@@ -43,10 +48,14 @@
     _transposed_conv_kernel = std::move(kernel_object);
 }
 
-Status ClTransposedConvolution::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases,
-                                         const ITensorInfo *output, const PadStrideInfo &deconv_info)
+Status ClTransposedConvolution::validate(const ITensorInfo   *input,
+                                         const ITensorInfo   *weights,
+                                         const ITensorInfo   *biases,
+                                         const ITensorInfo   *output,
+                                         const PadStrideInfo &deconv_info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info));
     return Status{};
 }
 
diff --git a/src/gpu/cl/operators/ClTransposedConvolution.h b/src/gpu/cl/operators/ClTransposedConvolution.h
index 58ebc68..660c4f8 100644
--- a/src/gpu/cl/operators/ClTransposedConvolution.h
+++ b/src/gpu/cl/operators/ClTransposedConvolution.h
@@ -68,23 +68,30 @@
      * @param[in]  deconv_info     Contains padding and stride information described in @ref PadStrideInfo.
      *
      */
-    void configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights,
-                   const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info);
+    void configure(const CLCompileContext &compile_context,
+                   const ITensorInfo      *input,
+                   const ITensorInfo      *weights,
+                   const ITensorInfo      *biases,
+                   ITensorInfo            *output,
+                   const PadStrideInfo    &deconv_info);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClTransposedConvolution::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases,
-                           const ITensorInfo *output, const PadStrideInfo &deconv_info);
+    static Status validate(const ITensorInfo   *input,
+                           const ITensorInfo   *weights,
+                           const ITensorInfo   *biases,
+                           const ITensorInfo   *output,
+                           const PadStrideInfo &deconv_info);
 
     // Inherited method overridden
     void run(ITensorPack &tensors) override;
 
 private:
-    std::unique_ptr<IClKernel> _transposed_conv_kernel{ nullptr };
+    std::unique_ptr<IClKernel> _transposed_conv_kernel{nullptr};
 };
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp
index b4163a5..8ec96b2 100644
--- a/src/gpu/cl/operators/ClWinogradConv2d.cpp
+++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp
@@ -24,20 +24,19 @@
 #include "src/gpu/cl/operators/ClWinogradConv2d.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h"
 #include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h"
 #include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
 #include "src/gpu/cl/utils/ClAuxTensorHandler.h"
-
-#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 
 using namespace arm_compute::experimental;
@@ -55,15 +54,16 @@
     const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height);
 
     // Check if the input spatial dimensions are 4 or smaller
-    const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
+    const bool is_input_lt4_nchw =
+        (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
 
-    if(kernel_max_dim == 3U)
+    if (kernel_max_dim == 3U)
     {
-        if(kernel_dims == Size2D(3U, 3U))
+        if (kernel_dims == Size2D(3U, 3U))
         {
             output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U);
         }
-        else if(kernel_dims == Size2D(3U, 1U))
+        else if (kernel_dims == Size2D(3U, 1U))
         {
             output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U);
         }
@@ -72,15 +72,13 @@
             output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U);
         }
     }
-    else if(kernel_max_dim == 5U)
+    else if (kernel_max_dim == 5U)
     {
-        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
-                             kernel_dims.height == 1 ? 1U : 4U);
+        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, kernel_dims.height == 1 ? 1U : 4U);
     }
-    else if(kernel_max_dim == 7U)
+    else if (kernel_max_dim == 7U)
     {
-        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U,
-                             kernel_dims.height == 1 ? 1U : 2U);
+        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, kernel_dims.height == 1 ? 1U : 2U);
     }
 
     return output_tile;
@@ -91,11 +89,9 @@
     // Check if we want to configure a Winograd configuration which requires fast math
     using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
 
-    std::vector<WinogradConfiguration> fast_math_winograd =
-    {
+    std::vector<WinogradConfiguration> fast_math_winograd = {
         WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
-        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))
-    };
+        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))};
 
     auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
                             std::pair<int, int>(kernel_size.width, kernel_size.height));
@@ -103,8 +99,13 @@
     return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
 }
 
-Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
-                          const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status validate_arguments(const ITensorInfo         *src,
+                          const ITensorInfo         *weights,
+                          const ITensorInfo         *biases,
+                          const ITensorInfo         *dst,
+                          const PadStrideInfo       &conv_info,
+                          const ActivationLayerInfo &act_info,
+                          bool                       enable_fast_math)
 {
     // Get indices for the width and height
     const size_t idx_width  = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
@@ -115,41 +116,49 @@
     const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
     const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        ((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))),
+        "Winograd only supports padding up to half kernel size");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        ((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))),
+        "Winograd only supports padding up to half kernel size");
 
     // Check if the Winograd configuration requires fast math
-    if(!enable_fast_math)
+    if (!enable_fast_math)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(
+            src, 1, DataType::F32); // Disable Winograd for fp16 if fast math is false.
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size),
+                                        "This Winograd configuration requires enable_fast_math=true");
     }
 
-    const WinogradInfo winograd_info = WinogradInfo(output_tile,
-                                                    kernel_size,
-                                                    input_dims,
-                                                    conv_info,
-                                                    src->data_layout());
+    const WinogradInfo winograd_info =
+        WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout());
 
     // Validate input transform
-    const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
-    const TensorInfo  input0       = src->clone()->set_tensor_shape(input0_shape);
+    const TensorShape input0_shape =
+        misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info);
+    const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape);
     ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info));
 
     // Validate filter transform
-    const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
-    const TensorInfo  input1       = weights->clone()->set_tensor_shape(input1_shape);
+    const TensorShape input1_shape =
+        misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
+    const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
     ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info));
 
     // Validate batched matrix multiply
     TensorShape batched_mm_output_shape = input0.tensor_shape();
     batched_mm_output_shape[0]          = input1.tensor_shape()[0];
     const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
-                                                                                                                     GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f,
+                         GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
+                                  GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))));
 
     // Configure output transform
-    ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info));
     return Status{};
 }
 
@@ -171,8 +180,14 @@
 
 ClWinogradConv2d::~ClWinogradConv2d() = default;
 
-void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
-                                 const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math)
+void ClWinogradConv2d::configure(const ClCompileContext    &compile_context,
+                                 ITensorInfo               *src,
+                                 ITensorInfo               *weights,
+                                 ITensorInfo               *biases,
+                                 ITensorInfo               *dst,
+                                 const PadStrideInfo       &conv_info,
+                                 const ActivationLayerInfo &act_info,
+                                 bool                       enable_fast_math)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
     ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math);
@@ -187,50 +202,53 @@
     const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
 
     // Check if the Winograd configuration requires fast math
-    if(!enable_fast_math)
+    if (!enable_fast_math)
     {
-        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
-        ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1,
+                                                      DataType::F32); // Disable Winograd for fp16 if fast math is false.
+        ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size),
+                                 "This Winograd configuration requires enable_fast_math=true");
     }
-    const WinogradInfo winograd_info = WinogradInfo(output_tile,
-                                                    kernel_size,
-                                                    input_dims,
-                                                    conv_info,
-                                                    src->data_layout());
+    const WinogradInfo winograd_info =
+        WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout());
 
     _is_prepared = false;
 
     // Configure input transform
     _input_transform->configure(compile_context, src, &_input0, winograd_info);
-    _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, PixelValue());
+    _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT,
+                              PixelValue());
 
     // Configure filter transform
     _filter_transform->configure(compile_context, weights, &_input1, winograd_info);
 
     // Configure batched matrix multiply
-    _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0,
-                                                                                                                  false, false,
-                                                                                                                  GEMMLowpOutputStageInfo(),
-                                                                                                                  (src->data_type() == DataType::F16)));
+    _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f,
+                          GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false,
+                                   GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)));
 
     // Configure output transform
     _output_transform->set_target(CLScheduler::get().target());
     _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info);
 
-    _aux_mem                             = _batched_mm.workspace();
-    const MemoryLifetime wino_wei_lifetm = std::any_of(std::begin(_aux_mem), std::end(_aux_mem), [](const auto & r)
-    {
-        return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0);
-    }) ?
-    MemoryLifetime::Prepare :
-    MemoryLifetime::Persistent;
+    _aux_mem = _batched_mm.workspace();
+    const MemoryLifetime wino_wei_lifetm =
+        std::any_of(std::begin(_aux_mem), std::end(_aux_mem),
+                    [](const auto &r) { return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); })
+            ? MemoryLifetime::Prepare
+            : MemoryLifetime::Persistent;
     _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size()));
     _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size()));
     _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size()));
 }
 
-Status ClWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
-                                  const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status ClWinogradConv2d::validate(const ITensorInfo         *src,
+                                  const ITensorInfo         *weights,
+                                  const ITensorInfo         *biases,
+                                  const ITensorInfo         *dst,
+                                  const PadStrideInfo       &conv_info,
+                                  const ActivationLayerInfo &act_info,
+                                  bool                       enable_fast_math)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math));
     return Status{};
@@ -251,10 +269,9 @@
     prepare(tensors);
 
     // Run input transform
-    ITensorPack pack_it
-    {
-        { TensorType::ACL_SRC, src },
-        { TensorType::ACL_DST, input0.get() },
+    ITensorPack pack_it{
+        {TensorType::ACL_SRC, src},
+        {TensorType::ACL_DST, input0.get()},
     };
     CLScheduler::get().enqueue_op(_border_handler, pack_it, false);
     CLScheduler::get().enqueue_op(*_input_transform, pack_it, false);
@@ -263,31 +280,31 @@
     ITensorPack pack_mm = tensors;
     pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get());
     pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get());
-    is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1) : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
+    is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1)
+                     : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get());
     _batched_mm.run(pack_mm);
 
     // Run output transform
-    ITensorPack pack_ot
-    {
-        { TensorType::ACL_SRC_0, batched_mm_output.get() },
-        { TensorType::ACL_SRC_1, biases },
-        { TensorType::ACL_DST, dst },
+    ITensorPack pack_ot{
+        {TensorType::ACL_SRC_0, batched_mm_output.get()},
+        {TensorType::ACL_SRC_1, biases},
+        {TensorType::ACL_DST, dst},
     };
     CLScheduler::get().enqueue_op(*_output_transform, pack_ot);
 }
 
 void ClWinogradConv2d::prepare(ITensorPack &tensors)
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
-        auto       weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+        auto weights =
+            utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
         ICLTensor *in1_aux = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(3)));
 
         CLAuxTensorHandler input1(_input1, *in1_aux);
-        ITensorPack        pack_ft
-        {
-            { TensorType::ACL_SRC, weights },
-            { TensorType::ACL_DST, input1.get() },
+        ITensorPack        pack_ft{
+            {TensorType::ACL_SRC, weights},
+            {TensorType::ACL_DST, input1.get()},
         };
         // Run filter transform and mark original weights as unused
         CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false);
@@ -308,4 +325,4 @@
     return _aux_mem;
 }
 } // namespace opencl
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
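winograd_output_tile() above fixes the supported Winograd configurations: 3x3 kernels map to a 4x4 output tile (2x2 when an NCHW input is 4x4 or smaller), while 5x5 and 7x7 kernels map to F(4x4, 5x5) and F(2x2, 7x7). The latter two are exactly the entries of fast_math_winograd, so validation rejects them unless enable_fast_math is set. A sketch of the caller-visible effect, with illustrative tensor infos:

    // Illustrative: a 7x7 kernel selects a 2x2 output tile, which is in the
    // fast-math list, so this call is expected to fail ...
    const arm_compute::Status status = arm_compute::opencl::ClWinogradConv2d::validate(
        &src_info, &weights_7x7_info, nullptr, &dst_info, conv_info,
        arm_compute::ActivationLayerInfo(), /* enable_fast_math */ false);
    // ... while the same call with enable_fast_math = true should pass,
    // assuming the infos are otherwise consistent.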
diff --git a/src/gpu/cl/operators/ClWinogradConv2d.h b/src/gpu/cl/operators/ClWinogradConv2d.h
index eb2f7a7..54ec1a1 100644
--- a/src/gpu/cl/operators/ClWinogradConv2d.h
+++ b/src/gpu/cl/operators/ClWinogradConv2d.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_CL_WINOGRADCONV2D_H
 
 #include "arm_compute/runtime/CL/CLTensor.h"
+
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/gpu/cl/ClCompileContext.h"
 #include "src/gpu/cl/IClOperator.h"
@@ -41,7 +42,7 @@
 class ClWinogradInputTransformKernel;
 class ClWinogradFilterTransformKernel;
 class ClWinogradOutputTransformKernel;
-} // kernels
+} // namespace kernels
 /** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels:
  *
  *  -# @ref kernels::ClWinogradInputTransformKernel
@@ -93,20 +94,31 @@
      * @param[in]  enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
      *                              available, which may also introduce a drop in accuracy. Default is false
      */
-    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
-                   const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+    void configure(const ClCompileContext    &compile_context,
+                   ITensorInfo               *src,
+                   ITensorInfo               *weights,
+                   ITensorInfo               *biases,
+                   ITensorInfo               *dst,
+                   const PadStrideInfo       &conv_info,
+                   const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                   bool                       enable_fast_math = false);
     /** Static function to check if given info will lead to a valid configuration
      *
      * Similar to ClWinogradConv2d::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
-                           const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+    static Status validate(const ITensorInfo         *src,
+                           const ITensorInfo         *weights,
+                           const ITensorInfo         *biases,
+                           const ITensorInfo         *dst,
+                           const PadStrideInfo       &conv_info,
+                           const ActivationLayerInfo &act_info         = ActivationLayerInfo(),
+                           bool                       enable_fast_math = false);
 
     // Inherited methods overridden
-    void run(ITensorPack &tensors) override;
-    void prepare(ITensorPack &tensors) override;
+    void                             run(ITensorPack &tensors) override;
+    void                             prepare(ITensorPack &tensors) override;
     experimental::MemoryRequirements workspace() const override;
 
 private:
diff --git a/src/gpu/cl/utils/ClAuxTensorHandler.h b/src/gpu/cl/utils/ClAuxTensorHandler.h
index af38348..81dc3ba 100644
--- a/src/gpu/cl/utils/ClAuxTensorHandler.h
+++ b/src/gpu/cl/utils/ClAuxTensorHandler.h
@@ -39,25 +39,26 @@
 class CLAuxTensorHandler
 {
 public:
-    CLAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
+    CLAuxTensorHandler(
+        int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
         : _tensor()
     {
-        if(info.total_size() == 0)
+        if (info.total_size() == 0)
         {
             return;
         }
         _tensor.allocator()->soft_init(info);
 
         ICLTensor *packed_tensor = utils::cast::polymorphic_downcast<ICLTensor *>(pack.get_tensor(slot_id));
-        if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
+        if ((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
         {
-            if(!bypass_alloc)
+            if (!bypass_alloc)
             {
                 _tensor.allocator()->allocate();
                 ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
             }
 
-            if(pack_inject)
+            if (pack_inject)
             {
                 pack.add_tensor(slot_id, &_tensor);
                 _injected_tensor_pack = &pack;
@@ -70,22 +71,21 @@
         }
     }
 
-    CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor)
-        : _tensor()
+    CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor) : _tensor()
     {
         _tensor.allocator()->soft_init(info);
-        if(info.total_size() <= tensor.info()->total_size())
+        if (info.total_size() <= tensor.info()->total_size())
         {
             _tensor.allocator()->import_memory(tensor.cl_buffer());
         }
     }
 
-    CLAuxTensorHandler(const CLAuxTensorHandler &) = delete;
+    CLAuxTensorHandler(const CLAuxTensorHandler &)          = delete;
     CLAuxTensorHandler &operator=(const CLAuxTensorHandler) = delete;
 
     ~CLAuxTensorHandler()
     {
-        if(_injected_tensor_pack)
+        if (_injected_tensor_pack)
         {
             _injected_tensor_pack->remove_tensor(_injected_slot_id);
         }
@@ -103,9 +103,9 @@
 
 private:
     CLTensor     _tensor{};
-    ITensorPack *_injected_tensor_pack{ nullptr };
-    int          _injected_slot_id{ TensorType::ACL_UNKNOWN };
+    ITensorPack *_injected_tensor_pack{nullptr};
+    int          _injected_slot_id{TensorType::ACL_UNKNOWN};
 };
 } // namespace opencl
 } // namespace arm_compute
-#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */
\ No newline at end of file
+#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */
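CLAuxTensorHandler binds an operator's auxiliary TensorInfo to real memory at run time: if the pack already carries a large-enough tensor in the given slot, its CL buffer is imported; otherwise the handler allocates one (unless bypass_alloc is set) and, when pack_inject is true, adds itself to the pack and removes itself again in its destructor. This is the pattern visible in ClSoftmax::run() earlier in this patch; a condensed sketch, with an illustrative slot id:

    // Inside an operator's run(ITensorPack &tensors), mirroring ClSoftmax::run.
    CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false);

    ITensorPack sum_pack;
    sum_pack.add_tensor(TensorType::ACL_INT_1, tmp.get()); // slot id is illustrative
    // tmp.get() stays valid for the handler's lifetime; an injected tensor is
    // removed from the pack when the handler is destroyed.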
diff --git a/src/graph/DataLayerVisitor.cpp b/src/graph/DataLayerVisitor.cpp
index 073ffd4..f0fac25 100644
--- a/src/graph/DataLayerVisitor.cpp
+++ b/src/graph/DataLayerVisitor.cpp
@@ -25,8 +25,8 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/TypePrinter.h"
 #include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/TypePrinter.h"
 
 namespace arm_compute
 {
@@ -43,17 +43,14 @@
     layer_data["data_layout"] = to_string(layout);
     // Add padding info
     std::ostringstream padding;
-    padding << "[" << to_string(ps_info.pad_left()) << ","
-            << to_string(ps_info.pad_top()) << ","
-            << to_string(ps_info.pad_bottom()) << ","
-            << to_string(ps_info.pad_right()) << "]";
+    padding << "[" << to_string(ps_info.pad_left()) << "," << to_string(ps_info.pad_top()) << ","
+            << to_string(ps_info.pad_bottom()) << "," << to_string(ps_info.pad_right()) << "]";
 
     layer_data["pad"] = padding.str();
 
     // Add stride info
     std::ostringstream stride;
-    stride << "[" << to_string(ps_info.stride().first) << ","
-           << to_string(ps_info.stride().second) << "]";
+    stride << "[" << to_string(ps_info.stride().first) << "," << to_string(ps_info.stride().second) << "]";
 
     layer_data["stride"] = stride.str();
 
@@ -68,12 +65,12 @@
 
     // Change input names for weights / bias (if applicable)
     // Assumes input(1) is weights and input(2) is bias
-    if(layer_data.count("input_shape1"))
+    if (layer_data.count("input_shape1"))
     {
         layer_data["weights_shape"] = layer_data["input_shape1"];
         layer_data.erase("input_shape1");
     }
-    if(layer_data.count("input_shape2"))
+    if (layer_data.count("input_shape2"))
     {
         layer_data["bias_shape"] = layer_data["input_shape2"];
         layer_data.erase("input_shape2");
@@ -92,16 +89,17 @@
 void add_generic_layer_data(DataLayerVisitor::LayerData &layer_data, T &node)
 {
     // Loop over each input tensor
-    for(size_t tensor_no = 0; tensor_no < node.num_inputs(); ++tensor_no)
+    for (size_t tensor_no = 0; tensor_no < node.num_inputs(); ++tensor_no)
     {
         // Add input tensor shapes
-        if(node.input(tensor_no) != nullptr)
+        if (node.input(tensor_no) != nullptr)
         {
-            layer_data["input_shape" + to_string(tensor_no)] = "[" + to_string(node.input(tensor_no)->desc().shape) + "]";
+            layer_data["input_shape" + to_string(tensor_no)] =
+                "[" + to_string(node.input(tensor_no)->desc().shape) + "]";
         }
     }
     // Add output tensor shape
-    if(node.output(0) != nullptr)
+    if (node.output(0) != nullptr)
     {
         layer_data["output_shape0"] = "[" + to_string(node.output(0)->desc().shape) + "]";
     }
diff --git a/src/graph/Graph.cpp b/src/graph/Graph.cpp
index 4ce5358..3ae83f2 100644
--- a/src/graph/Graph.cpp
+++ b/src/graph/Graph.cpp
@@ -34,24 +34,24 @@
 
 bool Graph::remove_node(NodeID nid)
 {
-    if(nid >= _nodes.size())
+    if (nid >= _nodes.size())
     {
         return false;
     }
 
     std::unique_ptr<INode> &node = _nodes[nid];
 
-    if(node)
+    if (node)
     {
         // Remove input connections
-        for(auto &input_eid : node->_input_edges)
+        for (auto &input_eid : node->_input_edges)
         {
             remove_connection(input_eid);
         }
 
         // Remove output connections
         std::set<EdgeID> output_edges_copy = node->output_edges();
-        for(auto &output_eid : output_edges_copy)
+        for (auto &output_eid : output_edges_copy)
         {
             remove_connection(output_eid);
         }
@@ -71,8 +71,10 @@
     arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
 
     // Check if node index is valid, if node exists and finally if the connection index is valid
-    ARM_COMPUTE_ERROR_ON((source >= _nodes.size()) || (_nodes[source] == nullptr) || (source_idx >= _nodes[source]->num_outputs()));
-    ARM_COMPUTE_ERROR_ON((sink >= _nodes.size()) || (_nodes[sink] == nullptr) || (sink_idx >= _nodes[sink]->num_inputs()));
+    ARM_COMPUTE_ERROR_ON((source >= _nodes.size()) || (_nodes[source] == nullptr) ||
+                         (source_idx >= _nodes[source]->num_outputs()));
+    ARM_COMPUTE_ERROR_ON((sink >= _nodes.size()) || (_nodes[sink] == nullptr) ||
+                         (sink_idx >= _nodes[sink]->num_inputs()));
 
     // Get nodes
     std::unique_ptr<INode> &source_node = _nodes[source];
@@ -80,23 +82,25 @@
 
     // Check for duplicate connections (Check only sink node)
     Edge *sink_node_edge = sink_node->input_edge(sink_idx);
-    if((sink_node_edge != nullptr) && (sink_node_edge->producer_id() == source) && (sink_node_edge->producer_idx() == source_idx)
-       && (sink_node_edge->consumer_id() == sink) && (sink_node_edge->consumer_idx() == sink_idx))
+    if ((sink_node_edge != nullptr) && (sink_node_edge->producer_id() == source) &&
+        (sink_node_edge->producer_idx() == source_idx) && (sink_node_edge->consumer_id() == sink) &&
+        (sink_node_edge->consumer_idx() == sink_idx))
     {
         return sink_node_edge->id();
     }
 
     // Check if there is already a tensor associated with output if not create one
     TensorID tid = source_node->output_id(source_idx);
-    if(tid == NullTensorID)
+    if (tid == NullTensorID)
     {
         tid = create_tensor();
     }
     std::unique_ptr<Tensor> &tensor = _tensors[tid];
 
     // Create connections
-    EdgeID eid        = _edges.size();
-    auto   connection = std::make_unique<Edge>(eid, source_node.get(), source_idx, sink_node.get(), sink_idx, tensor.get());
+    EdgeID eid = _edges.size();
+    auto   connection =
+        std::make_unique<Edge>(eid, source_node.get(), source_idx, sink_node.get(), sink_idx, tensor.get());
     _edges.push_back(std::move(connection));
 
     // Add connections to source and sink nodes
@@ -117,7 +121,7 @@
 
 bool Graph::remove_connection(EdgeID eid)
 {
-    if(eid >= _edges.size())
+    if (eid >= _edges.size())
     {
         return false;
     }
@@ -125,22 +129,22 @@
     std::unique_ptr<Edge> &edge = _edges[eid];
 
     // Remove node connections
-    if(edge != nullptr)
+    if (edge != nullptr)
     {
         // Get tensor bound to the edge
-        if(edge->tensor() != nullptr)
+        if (edge->tensor() != nullptr)
         {
             edge->tensor()->unbind_edge(eid);
         }
 
         // Remove edges from source node
-        if(edge->producer() != nullptr)
+        if (edge->producer() != nullptr)
         {
             edge->producer()->_output_edges.erase(eid);
         }
 
         // Remove edges from sink node
-        if((edge->consumer() != nullptr) && (edge->consumer_idx() < edge->consumer()->_input_edges.size()))
+        if ((edge->consumer() != nullptr) && (edge->consumer_idx() < edge->consumer()->_input_edges.size()))
         {
             edge->consumer()->_input_edges[edge->consumer_idx()] = EmptyEdgeID;
         }
@@ -231,4 +235,4 @@
     return (id >= _tensors.size()) ? nullptr : _tensors[id].get();
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/GraphBuilder.cpp b/src/graph/GraphBuilder.cpp
index 7e5d313..eab91b2 100644
--- a/src/graph/GraphBuilder.cpp
+++ b/src/graph/GraphBuilder.cpp
@@ -24,10 +24,10 @@
 #include "arm_compute/graph/GraphBuilder.h"
 
 #include "arm_compute/core/utils/DataTypeUtils.h"
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/Utils.h"
 #include "arm_compute/graph/algorithms/TopologicalSort.h"
+#include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/Utils.h"
 
 #include "support/ToolchainSupport.h"
 
@@ -41,7 +41,8 @@
 {
     ARM_COMPUTE_UNUSED(pair);
     ARM_COMPUTE_UNUSED(g);
-    ARM_COMPUTE_ERROR_ON((pair.node_id >= g.nodes().size()) || (g.node((pair).node_id) == nullptr) || (pair.index >= g.node(pair.node_id)->num_outputs()));
+    ARM_COMPUTE_ERROR_ON((pair.node_id >= g.nodes().size()) || (g.node((pair).node_id) == nullptr) ||
+                         (pair.index >= g.node(pair.node_id)->num_outputs()));
 }
 
 Status set_node_params(Graph &g, NodeID nid, NodeParams &params)
@@ -67,7 +68,8 @@
     return Status{};
 }
 
-NodeID add_const_node_with_name(Graph &g, NodeParams params, const std::string &name, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
+NodeID add_const_node_with_name(
+    Graph &g, NodeParams params, const std::string &name, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
 {
     params.name = params.name.empty() ? "" : params.name + name;
     auto nid    = GraphBuilder::add_const_node(g, params, desc, std::move(accessor));
@@ -76,7 +78,7 @@
 }
 
 template <typename NT, typename... Args>
-NodeID create_simple_single_input_output_node(Graph &g, NodeParams &params, NodeIdxPair input, Args &&... args)
+NodeID create_simple_single_input_output_node(Graph &g, NodeParams &params, NodeIdxPair input, Args &&...args)
 {
     check_nodeidx_pair(input, g);
 
@@ -88,14 +90,17 @@
 }
 
 template <typename NT, typename... Args>
-NodeID create_simple_multiple_input_single_output_node(Graph &g, NodeParams &params, const std::vector<NodeIdxPair> &inputs, Args &&... args)
+NodeID create_simple_multiple_input_single_output_node(Graph                          &g,
+                                                       NodeParams                     &params,
+                                                       const std::vector<NodeIdxPair> &inputs,
+                                                       Args &&...args)
 {
     ARM_COMPUTE_ERROR_ON(inputs.size() == 0);
 
     NodeID nid = g.add_node<NT>(std::forward<Args>(args)...);
 
     unsigned int i = 0;
-    for(const auto &input : inputs)
+    for (const auto &input : inputs)
     {
         check_nodeidx_pair(input, g);
         g.add_connection(input.node_id, input.index, nid, i++);
@@ -106,7 +111,8 @@
 }
 } // namespace
 
-NodeID GraphBuilder::add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
+NodeID
+GraphBuilder::add_const_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
 {
     auto nid = g.add_node<ConstNode>(desc);
     set_node_params(g, nid, params);
@@ -114,7 +120,8 @@
     return nid;
 }
 
-NodeID GraphBuilder::add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
+NodeID
+GraphBuilder::add_input_node(Graph &g, NodeParams params, const TensorDescriptor &desc, ITensorAccessorUPtr accessor)
 {
     auto nid = g.add_node<InputNode>(desc);
     set_node_params(g, nid, params);
@@ -134,21 +141,35 @@
     return nid;
 }
 
-NodeID GraphBuilder::add_activation_node(Graph &g, NodeParams params, NodeIdxPair input, ActivationLayerInfo act_info,
+NodeID GraphBuilder::add_activation_node(Graph                  &g,
+                                         NodeParams              params,
+                                         NodeIdxPair             input,
+                                         ActivationLayerInfo     act_info,
                                          const QuantizationInfo &out_quant_info)
 {
     return create_simple_single_input_output_node<ActivationLayerNode>(g, params, input, act_info, out_quant_info);
 }
 
-NodeID GraphBuilder::add_arg_min_max_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, unsigned int axis,
-                                          DataType out_data_type, const QuantizationInfo &out_quant_info)
+NodeID GraphBuilder::add_arg_min_max_node(Graph                  &g,
+                                          NodeParams              params,
+                                          NodeIdxPair             input,
+                                          ReductionOperation      op,
+                                          unsigned int            axis,
+                                          DataType                out_data_type,
+                                          const QuantizationInfo &out_quant_info)
 {
-    return create_simple_single_input_output_node<ArgMinMaxLayerNode>(g, params, input, op, axis, out_data_type, out_quant_info);
+    return create_simple_single_input_output_node<ArgMinMaxLayerNode>(g, params, input, op, axis, out_data_type,
+                                                                      out_quant_info);
 }
 
-NodeID GraphBuilder::add_batch_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, float epsilon,
-                                                  ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr var_accessor,
-                                                  ITensorAccessorUPtr beta_accessor, ITensorAccessorUPtr gamma_accessor)
+NodeID GraphBuilder::add_batch_normalization_node(Graph              &g,
+                                                  NodeParams          params,
+                                                  NodeIdxPair         input,
+                                                  float               epsilon,
+                                                  ITensorAccessorUPtr mean_accessor,
+                                                  ITensorAccessorUPtr var_accessor,
+                                                  ITensorAccessorUPtr beta_accessor,
+                                                  ITensorAccessorUPtr gamma_accessor)
 {
     check_nodeidx_pair(input, g);
 
@@ -168,14 +189,14 @@
 
     // Create beta node
     NodeID beta_nid = EmptyNodeID;
-    if(has_beta)
+    if (has_beta)
     {
         beta_nid = add_const_node_with_name(g, params, "Beta", common_desc, std::move(beta_accessor));
     }
 
     // Create gamma node
     NodeID gamma_nid = EmptyNodeID;
-    if(has_gamma)
+    if (has_gamma)
     {
         gamma_nid = add_const_node_with_name(g, params, "Gamma", common_desc, std::move(gamma_accessor));
     }
@@ -185,11 +206,11 @@
     g.add_connection(input.node_id, input.index, batch_norm_nid, 0);
     g.add_connection(mean_nid, 0, batch_norm_nid, 1);
     g.add_connection(var_nid, 0, batch_norm_nid, 2);
-    if(has_beta)
+    if (has_beta)
     {
         g.add_connection(beta_nid, 0, batch_norm_nid, 3);
     }
-    if(has_gamma)
+    if (has_gamma)
     {
         g.add_connection(gamma_nid, 0, batch_norm_nid, 4);
     }
@@ -198,7 +219,8 @@
     return batch_norm_nid;
 }
 
-NodeID GraphBuilder::add_bounding_box_transform_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info)
+NodeID GraphBuilder::add_bounding_box_transform_node(
+    Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair deltas, BoundingBoxTransformInfo info)
 {
     check_nodeidx_pair(input, g);
     check_nodeidx_pair(deltas, g);
@@ -217,10 +239,17 @@
     return create_simple_single_input_output_node<ChannelShuffleLayerNode>(g, params, input, num_groups);
 }
 
-NodeID GraphBuilder::add_convolution_node(Graph &g, NodeParams params, NodeIdxPair input,
-                                          Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo conv_info,
-                                          unsigned int num_groups, ConvolutionMethod method, FastMathHint fast_math_hint,
-                                          ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor,
+NodeID GraphBuilder::add_convolution_node(Graph                  &g,
+                                          NodeParams              params,
+                                          NodeIdxPair             input,
+                                          Size2D                  kernel_spatial_extend,
+                                          unsigned int            depth,
+                                          PadStrideInfo           conv_info,
+                                          unsigned int            num_groups,
+                                          ConvolutionMethod       method,
+                                          FastMathHint            fast_math_hint,
+                                          ITensorAccessorUPtr     weights_accessor,
+                                          ITensorAccessorUPtr     bias_accessor,
                                           const QuantizationInfo &weights_quant_info,
                                           const QuantizationInfo &out_quant_info)
 {
@@ -241,7 +270,7 @@
     w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL),
                      get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) / num_groups);
     w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::BATCHES), depth);
-    if(!weights_quant_info.empty())
+    if (!weights_quant_info.empty())
     {
         w_desc.quant_info = weights_quant_info;
     }
@@ -250,11 +279,11 @@
 
     // Create bias nodes
     NodeID b_nid = EmptyNodeID;
-    if(has_bias)
+    if (has_bias)
     {
         TensorDescriptor b_desc = input_tensor_desc;
         b_desc.shape            = TensorShape(depth);
-        if(is_data_type_quantized_asymmetric(input_tensor_desc.data_type))
+        if (is_data_type_quantized_asymmetric(input_tensor_desc.data_type))
         {
             b_desc.data_type = DataType::S32;
         }
@@ -265,7 +294,7 @@
     NodeID conv_nid = g.add_node<ConvolutionLayerNode>(conv_info, num_groups, method, fast_math_hint, out_quant_info);
     g.add_connection(input.node_id, input.index, conv_nid, 0);
     g.add_connection(w_nid, 0, conv_nid, 1);
-    if(has_bias)
+    if (has_bias)
     {
         g.add_connection(b_nid, 0, conv_nid, 2);
     }
@@ -274,8 +303,12 @@
     return conv_nid;
 }
 
-NodeID GraphBuilder::add_deconvolution_node(Graph &g, NodeParams params, NodeIdxPair input,
-                                            Size2D kernel_spatial_extend, unsigned int depth, PadStrideInfo deconv_info,
+NodeID GraphBuilder::add_deconvolution_node(Graph              &g,
+                                            NodeParams          params,
+                                            NodeIdxPair         input,
+                                            Size2D              kernel_spatial_extend,
+                                            unsigned int        depth,
+                                            PadStrideInfo       deconv_info,
                                             ITensorAccessorUPtr weights_accessor,
                                             ITensorAccessorUPtr bias_accessor)
 {
@@ -301,11 +334,11 @@
 
     // Create bias nodes
     NodeID b_nid = EmptyNodeID;
-    if(has_bias)
+    if (has_bias)
     {
         TensorDescriptor b_desc = input_tensor_desc;
         b_desc.shape            = TensorShape(depth);
-        if(is_data_type_quantized_asymmetric(input_tensor_desc.data_type))
+        if (is_data_type_quantized_asymmetric(input_tensor_desc.data_type))
         {
             b_desc.data_type = DataType::S32;
         }
@@ -313,10 +346,10 @@
     }
 
     // Create convolution node and connect
-    NodeID deconv_nid = g.add_node<DeconvolutionLayerNode>(descriptors::DeconvolutionLayerDescriptor{ deconv_info });
+    NodeID deconv_nid = g.add_node<DeconvolutionLayerNode>(descriptors::DeconvolutionLayerDescriptor{deconv_info});
     g.add_connection(input.node_id, input.index, deconv_nid, 0);
     g.add_connection(w_nid, 0, deconv_nid, 1);
-    if(has_bias)
+    if (has_bias)
     {
         g.add_connection(b_nid, 0, deconv_nid, 2);
     }
@@ -325,14 +358,26 @@
     return deconv_nid;
 }
 
-NodeID GraphBuilder::add_concatenate_node(Graph &g, NodeParams params, const std::vector<NodeIdxPair> &inputs, const descriptors::ConcatLayerDescriptor &concat_descriptor)
+NodeID GraphBuilder::add_concatenate_node(Graph                                    &g,
+                                          NodeParams                                params,
+                                          const std::vector<NodeIdxPair>           &inputs,
+                                          const descriptors::ConcatLayerDescriptor &concat_descriptor)
 {
-    return create_simple_multiple_input_single_output_node<ConcatenateLayerNode>(g, params, inputs, inputs.size(), concat_descriptor);
+    return create_simple_multiple_input_single_output_node<ConcatenateLayerNode>(g, params, inputs, inputs.size(),
+                                                                                 concat_descriptor);
 }
 
-NodeID GraphBuilder::add_depthwise_convolution_node(Graph &g, NodeParams params, NodeIdxPair input, Size2D kernel_spatial_extend,
-                                                    PadStrideInfo conv_info, int depth_multiplier, DepthwiseConvolutionMethod method,
-                                                    ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor, const QuantizationInfo &quant_info, const QuantizationInfo &out_quant_info)
+NodeID GraphBuilder::add_depthwise_convolution_node(Graph                     &g,
+                                                    NodeParams                 params,
+                                                    NodeIdxPair                input,
+                                                    Size2D                     kernel_spatial_extend,
+                                                    PadStrideInfo              conv_info,
+                                                    int                        depth_multiplier,
+                                                    DepthwiseConvolutionMethod method,
+                                                    ITensorAccessorUPtr        weights_accessor,
+                                                    ITensorAccessorUPtr        bias_accessor,
+                                                    const QuantizationInfo    &quant_info,
+                                                    const QuantizationInfo    &out_quant_info)
 {
     check_nodeidx_pair(input, g);
     ARM_COMPUTE_ERROR_ON((kernel_spatial_extend.width == 0) || (kernel_spatial_extend.height == 0));
@@ -349,7 +394,7 @@
     w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), kernel_spatial_extend.height);
     w_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL),
                      get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier);
-    if(!quant_info.empty())
+    if (!quant_info.empty())
     {
         w_desc.quant_info = quant_info;
     }
@@ -358,12 +403,13 @@
 
     // Create bias nodes
     NodeID b_nid = EmptyNodeID;
-    if(has_bias)
+    if (has_bias)
     {
         TensorDescriptor b_desc = input_tensor_desc;
-        b_desc.shape            = TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier);
+        b_desc.shape =
+            TensorShape(get_dimension_size(input_tensor_desc, DataLayoutDimension::CHANNEL) * depth_multiplier);
 
-        if(is_data_type_quantized_asymmetric(b_desc.data_type))
+        if (is_data_type_quantized_asymmetric(b_desc.data_type))
         {
             b_desc.data_type = DataType::S32;
         }
@@ -375,7 +421,7 @@
     NodeID conv_nid = g.add_node<DepthwiseConvolutionLayerNode>(conv_info, depth_multiplier, method, out_quant_info);
     g.add_connection(input.node_id, input.index, conv_nid, 0);
     g.add_connection(w_nid, 0, conv_nid, 1);
-    if(has_bias)
+    if (has_bias)
     {
         g.add_connection(b_nid, 0, conv_nid, 2);
     }
@@ -394,7 +440,12 @@
     return create_simple_single_input_output_node<DequantizationLayerNode>(g, params, input);
 }
 
-NodeID GraphBuilder::add_detection_output_node(Graph &g, NodeParams params, NodeIdxPair input_loc, NodeIdxPair input_conf, NodeIdxPair input_priorbox, const DetectionOutputLayerInfo &detect_info)
+NodeID GraphBuilder::add_detection_output_node(Graph                          &g,
+                                               NodeParams                      params,
+                                               NodeIdxPair                     input_loc,
+                                               NodeIdxPair                     input_conf,
+                                               NodeIdxPair                     input_priorbox,
+                                               const DetectionOutputLayerInfo &detect_info)
 {
     check_nodeidx_pair(input_loc, g);
     check_nodeidx_pair(input_conf, g);
@@ -411,18 +462,24 @@
     return detect_nid;
 }
 
-NodeID GraphBuilder::add_detection_post_process_node(Graph &g, NodeParams params, NodeIdxPair input_box_encoding, NodeIdxPair input_class_prediction, const DetectionPostProcessLayerInfo &detect_info,
-                                                     ITensorAccessorUPtr anchors_accessor, const QuantizationInfo &anchor_quant_info)
+NodeID GraphBuilder::add_detection_post_process_node(Graph                               &g,
+                                                     NodeParams                           params,
+                                                     NodeIdxPair                          input_box_encoding,
+                                                     NodeIdxPair                          input_class_prediction,
+                                                     const DetectionPostProcessLayerInfo &detect_info,
+                                                     ITensorAccessorUPtr                  anchors_accessor,
+                                                     const QuantizationInfo              &anchor_quant_info)
 {
     check_nodeidx_pair(input_box_encoding, g);
     check_nodeidx_pair(input_class_prediction, g);
 
     // Get input tensor descriptor
-    const TensorDescriptor input_box_encoding_tensor_desc = get_tensor_descriptor(g, g.node(input_box_encoding.node_id)->outputs()[0]);
+    const TensorDescriptor input_box_encoding_tensor_desc =
+        get_tensor_descriptor(g, g.node(input_box_encoding.node_id)->outputs()[0]);
 
     // Calculate anchor descriptor
     TensorDescriptor anchor_desc = input_box_encoding_tensor_desc;
-    if(!anchor_quant_info.empty())
+    if (!anchor_quant_info.empty())
     {
         anchor_desc.quant_info = anchor_quant_info;
     }
@@ -446,12 +503,13 @@
     return create_simple_single_input_output_node<DummyNode>(g, params, input, shape);
 }
 
-NodeID GraphBuilder::add_elementwise_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation)
+NodeID GraphBuilder::add_elementwise_node(
+    Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, EltwiseOperation operation)
 {
     check_nodeidx_pair(input0, g);
     check_nodeidx_pair(input1, g);
 
-    NodeID nid = g.add_node<EltwiseLayerNode>(descriptors::EltwiseLayerDescriptor{ operation });
+    NodeID nid = g.add_node<EltwiseLayerNode>(descriptors::EltwiseLayerDescriptor{operation});
 
     g.add_connection(input0.node_id, input0.index, nid, 0);
     g.add_connection(input1.node_id, input1.index, nid, 1);
@@ -466,9 +524,15 @@
     return create_simple_single_input_output_node<FlattenLayerNode>(g, params, input);
 }
 
-NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
-                                               NodeID weights_nid, NodeID bias_nid,
-                                               const FullyConnectedLayerInfo fc_info, const QuantizationInfo &out_quant_info, FastMathHint fast_math_hint)
+NodeID GraphBuilder::add_fully_connected_layer(Graph                        &g,
+                                               NodeParams                    params,
+                                               NodeIdxPair                   input,
+                                               unsigned int                  num_outputs,
+                                               NodeID                        weights_nid,
+                                               NodeID                        bias_nid,
+                                               const FullyConnectedLayerInfo fc_info,
+                                               const QuantizationInfo       &out_quant_info,
+                                               FastMathHint                  fast_math_hint)
 {
     check_nodeidx_pair(input, g);
     ARM_COMPUTE_ERROR_ON(num_outputs == 0);
@@ -483,7 +547,7 @@
     NodeID fc_nid = g.add_node<FullyConnectedLayerNode>(num_outputs, out_quant_info, fc_info, fast_math_hint);
     g.add_connection(input.node_id, input.index, fc_nid, 0);
     g.add_connection(weights_nid, 0, fc_nid, 1);
-    if(has_bias)
+    if (has_bias)
     {
         g.add_connection(bias_nid, 0, fc_nid, 2);
     }
@@ -493,10 +557,16 @@
     return fc_nid;
 }
 
-NodeID GraphBuilder::add_fully_connected_layer(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_outputs,
-                                               ITensorAccessorUPtr weights_accessor, ITensorAccessorUPtr bias_accessor,
+NodeID GraphBuilder::add_fully_connected_layer(Graph                        &g,
+                                               NodeParams                    params,
+                                               NodeIdxPair                   input,
+                                               unsigned int                  num_outputs,
+                                               ITensorAccessorUPtr           weights_accessor,
+                                               ITensorAccessorUPtr           bias_accessor,
                                                const FullyConnectedLayerInfo fc_info,
-                                               const QuantizationInfo &weights_quant_info, const QuantizationInfo &out_quant_info, FastMathHint fast_math_hint)
+                                               const QuantizationInfo       &weights_quant_info,
+                                               const QuantizationInfo       &out_quant_info,
+                                               FastMathHint                  fast_math_hint)
 {
     check_nodeidx_pair(input, g);
     ARM_COMPUTE_ERROR_ON(num_outputs == 0);
@@ -507,16 +577,17 @@
     const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
 
     // Create weights node
-    TensorDescriptor w_desc = FullyConnectedLayerNode::compute_weights_descriptor(input_tensor_desc, num_outputs, fc_info, weights_quant_info);
+    TensorDescriptor w_desc = FullyConnectedLayerNode::compute_weights_descriptor(input_tensor_desc, num_outputs,
+                                                                                  fc_info, weights_quant_info);
     NodeID           w_nid  = add_const_node_with_name(g, params, "Weights", w_desc, std::move(weights_accessor));
 
     // Create bias nodes
     NodeID b_nid = EmptyNodeID;
-    if(has_bias)
+    if (has_bias)
     {
         TensorDescriptor b_desc = input_tensor_desc;
         b_desc.shape            = TensorShape(num_outputs);
-        if(is_data_type_quantized_asymmetric(input_tensor_desc.data_type))
+        if (is_data_type_quantized_asymmetric(input_tensor_desc.data_type))
         {
             b_desc.data_type = DataType::S32;
         }
@@ -527,7 +598,7 @@
     NodeID fc_nid = g.add_node<FullyConnectedLayerNode>(num_outputs, out_quant_info, fc_info, fast_math_hint);
     g.add_connection(input.node_id, input.index, fc_nid, 0);
     g.add_connection(w_nid, 0, fc_nid, 1);
-    if(has_bias)
+    if (has_bias)
     {
         g.add_connection(b_nid, 0, fc_nid, 2);
     }
@@ -537,7 +608,12 @@
     return fc_nid;
 }
 
-NodeID GraphBuilder::add_generate_proposals_node(Graph &g, NodeParams params, NodeIdxPair scores, NodeIdxPair deltas, NodeIdxPair anchors, GenerateProposalsInfo info)
+NodeID GraphBuilder::add_generate_proposals_node(Graph                &g,
+                                                 NodeParams            params,
+                                                 NodeIdxPair           scores,
+                                                 NodeIdxPair           deltas,
+                                                 NodeIdxPair           anchors,
+                                                 GenerateProposalsInfo info)
 {
     check_nodeidx_pair(scores, g);
     check_nodeidx_pair(deltas, g);
@@ -558,13 +634,14 @@
     return create_simple_single_input_output_node<L2NormalizeLayerNode>(g, params, input, axis, epsilon);
 }
 
-NodeID GraphBuilder::add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info)
+NodeID
+GraphBuilder::add_normalization_node(Graph &g, NodeParams params, NodeIdxPair input, NormalizationLayerInfo norm_info)
 {
     return create_simple_single_input_output_node<NormalizationLayerNode>(g, params, input, norm_info);
 }
 
-NodeID GraphBuilder::add_normalize_planar_yuv_node(Graph &g, NodeParams params, NodeIdxPair input,
-                                                   ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr std_accessor)
+NodeID GraphBuilder::add_normalize_planar_yuv_node(
+    Graph &g, NodeParams params, NodeIdxPair input, ITensorAccessorUPtr mean_accessor, ITensorAccessorUPtr std_accessor)
 {
     check_nodeidx_pair(input, g);
 
@@ -589,12 +666,14 @@
     return norm_planar_yuv_nid;
 }
 
-NodeID GraphBuilder::add_pad_node(Graph &g, NodeParams params, NodeIdxPair input, const PaddingList &paddings, PixelValue pad_value)
+NodeID GraphBuilder::add_pad_node(
+    Graph &g, NodeParams params, NodeIdxPair input, const PaddingList &paddings, PixelValue pad_value)
 {
     return create_simple_single_input_output_node<PadLayerNode>(g, params, input, paddings, pad_value);
 }
 
-NodeID GraphBuilder::add_permute_node(Graph &g, NodeParams params, NodeIdxPair input, PermutationVector perm, DataLayout layout)
+NodeID GraphBuilder::add_permute_node(
+    Graph &g, NodeParams params, NodeIdxPair input, PermutationVector perm, DataLayout layout)
 {
     return create_simple_single_input_output_node<PermuteLayerNode>(g, params, input, perm, layout);
 }
@@ -618,12 +697,18 @@
     return create_simple_single_input_output_node<PoolingLayerNode>(g, params, input, pool_info);
 }
 
-NodeID GraphBuilder::add_print_node(Graph &g, NodeParams params, NodeIdxPair input, std::ostream &stream, const IOFormatInfo &format_info, const std::function<ITensor *(ITensor *)> transform)
+NodeID GraphBuilder::add_print_node(Graph                                    &g,
+                                    NodeParams                                params,
+                                    NodeIdxPair                               input,
+                                    std::ostream                             &stream,
+                                    const IOFormatInfo                       &format_info,
+                                    const std::function<ITensor *(ITensor *)> transform)
 {
     return create_simple_single_input_output_node<PrintLayerNode>(g, params, input, stream, format_info, transform);
 }
 
-NodeID GraphBuilder::add_priorbox_node(Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info)
+NodeID GraphBuilder::add_priorbox_node(
+    Graph &g, NodeParams params, NodeIdxPair input0, NodeIdxPair input1, const PriorBoxLayerInfo &prior_info)
 {
     check_nodeidx_pair(input0, g);
     check_nodeidx_pair(input1, g);
@@ -638,12 +723,16 @@
     return prior_nid;
 }
 
-NodeID GraphBuilder::add_quantization_node(Graph &g, NodeParams params, NodeIdxPair input, const QuantizationInfo &out_quant_info)
+NodeID GraphBuilder::add_quantization_node(Graph                  &g,
+                                           NodeParams              params,
+                                           NodeIdxPair             input,
+                                           const QuantizationInfo &out_quant_info)
 {
     return create_simple_single_input_output_node<QuantizationLayerNode>(g, params, input, out_quant_info);
 }
 
-NodeID GraphBuilder::add_reduction_operation_node(Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims)
+NodeID GraphBuilder::add_reduction_operation_node(
+    Graph &g, NodeParams params, NodeIdxPair input, ReductionOperation op, int axis, bool keep_dims)
 {
     return create_simple_single_input_output_node<ReductionLayerNode>(g, params, input, op, axis, keep_dims);
 }
@@ -658,13 +747,14 @@
     return create_simple_single_input_output_node<ReshapeLayerNode>(g, params, input, shape);
 }
 
-NodeID GraphBuilder::add_resize_node(Graph &g, NodeParams params, NodeIdxPair input, InterpolationPolicy policy,
-                                     float width_scale, float height_scale)
+NodeID GraphBuilder::add_resize_node(
+    Graph &g, NodeParams params, NodeIdxPair input, InterpolationPolicy policy, float width_scale, float height_scale)
 {
     return create_simple_single_input_output_node<ResizeLayerNode>(g, params, input, policy, width_scale, height_scale);
 }
 
-NodeID GraphBuilder::add_roi_align_node(Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info)
+NodeID GraphBuilder::add_roi_align_node(
+    Graph &g, NodeParams params, NodeIdxPair input, NodeIdxPair rois, ROIPoolingLayerInfo pool_info)
 {
     check_nodeidx_pair(input, g);
     check_nodeidx_pair(rois, g);
@@ -678,7 +768,11 @@
     return nid;
 }
 
-NodeID GraphBuilder::add_scale_layer(Graph &g, const NodeParams &params, NodeIdxPair input, ITensorAccessorUPtr mul_accessor, ITensorAccessorUPtr add_accessor)
+NodeID GraphBuilder::add_scale_layer(Graph              &g,
+                                     const NodeParams   &params,
+                                     NodeIdxPair         input,
+                                     ITensorAccessorUPtr mul_accessor,
+                                     ITensorAccessorUPtr add_accessor)
 {
     check_nodeidx_pair(input, g);
 
@@ -688,22 +782,23 @@
 
     // Create mul node
     TensorDescriptor mul_desc = input_tensor_desc;
-    const size_t     C        = input_tensor_desc.shape[get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL)];
+    const size_t     C = input_tensor_desc.shape[get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL)];
     mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::WIDTH), 1);
     mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::HEIGHT), 1);
     mul_desc.shape.set(get_dimension_idx(input_data_layout, DataLayoutDimension::CHANNEL), C);
     NodeID      mul_const_nid   = add_const_node_with_name(g, params, "Mul", mul_desc, std::move(mul_accessor));
-    NodeIdxPair mul_const_nidxp = { mul_const_nid, 0 };
+    NodeIdxPair mul_const_nidxp = {mul_const_nid, 0};
 
     // Create add node
     TensorDescriptor add_desc        = mul_desc;
     NodeID           add_const_nid   = add_const_node_with_name(g, params, "Add", add_desc, std::move(add_accessor));
-    NodeIdxPair      add_const_nidxp = { add_const_nid, 0 };
+    NodeIdxPair      add_const_nidxp = {add_const_nid, 0};
 
     // Create node and connect
-    NodeID      mul_node      = GraphBuilder::add_elementwise_node(g, params, input, mul_const_nidxp, EltwiseOperation::Mul);
-    NodeIdxPair mulnode_nidxp = { mul_node, 0 };
-    NodeID      add_node      = GraphBuilder::add_elementwise_node(g, params, mulnode_nidxp, add_const_nidxp, EltwiseOperation::Add);
+    NodeID      mul_node = GraphBuilder::add_elementwise_node(g, params, input, mul_const_nidxp, EltwiseOperation::Mul);
+    NodeIdxPair mulnode_nidxp = {mul_node, 0};
+    NodeID      add_node =
+        GraphBuilder::add_elementwise_node(g, params, mulnode_nidxp, add_const_nidxp, EltwiseOperation::Add);
 
     return add_node;
 }
@@ -713,17 +808,25 @@
     return create_simple_single_input_output_node<SoftmaxLayerNode>(g, params, input, beta);
 }
 
-NodeID GraphBuilder::add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends)
+NodeID
+GraphBuilder::add_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends)
 {
     return create_simple_single_input_output_node<SliceLayerNode>(g, params, input, starts, ends);
 }
 
-NodeID GraphBuilder::add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis)
+NodeID
+GraphBuilder::add_split_node(Graph &g, NodeParams params, NodeIdxPair input, unsigned int num_splits, unsigned int axis)
 {
     return create_simple_single_input_output_node<SplitLayerNode>(g, params, input, num_splits, axis);
 }
 
-NodeID GraphBuilder::add_strided_slice_node(Graph &g, NodeParams params, NodeIdxPair input, Coordinates &starts, Coordinates &ends, BiStrides &strides, StridedSliceLayerInfo info)
+NodeID GraphBuilder::add_strided_slice_node(Graph                &g,
+                                            NodeParams            params,
+                                            NodeIdxPair           input,
+                                            Coordinates          &starts,
+                                            Coordinates          &ends,
+                                            BiStrides            &strides,
+                                            StridedSliceLayerInfo info)
 {
     return create_simple_single_input_output_node<StridedSliceLayerNode>(g, params, input, starts, ends, strides, info);
 }
@@ -770,7 +873,8 @@
     g.add_connection(input.node_id, input.index, cls, 0);
     g.add_connection(cls, 0, cls_act, 0);
 
-    NodeID concat = g.add_node<ConcatenateLayerNode>(3, descriptors::ConcatLayerDescriptor(DataLayoutDimension::CHANNEL));
+    NodeID concat =
+        g.add_node<ConcatenateLayerNode>(3, descriptors::ConcatLayerDescriptor(DataLayoutDimension::CHANNEL));
     set_node_params(g, concat, params);
     g.add_connection(act_box, 0, concat, 0);
     g.add_connection(imm, 0, concat, 1);
diff --git a/src/graph/GraphContext.cpp b/src/graph/GraphContext.cpp
index 7b74c2f..10850aa 100644
--- a/src/graph/GraphContext.cpp
+++ b/src/graph/GraphContext.cpp
@@ -24,15 +24,14 @@
 #include "arm_compute/graph/GraphContext.h"
 
 #include "arm_compute/graph.h"
-#include "arm_compute/graph/Utils.h"
 #include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/Utils.h"
 
 namespace arm_compute
 {
 namespace graph
 {
-GraphContext::GraphContext()
-    : _config(), _memory_managers(), _weights_managers()
+GraphContext::GraphContext() : _config(), _memory_managers(), _weights_managers()
 {
 }
 
@@ -56,7 +55,7 @@
 bool GraphContext::insert_memory_management_ctx(MemoryManagerContext &&memory_ctx)
 {
     Target target = memory_ctx.target;
-    if(target == Target::UNSPECIFIED || _memory_managers.find(target) != std::end(_memory_managers))
+    if (target == Target::UNSPECIFIED || _memory_managers.find(target) != std::end(_memory_managers))
     {
         return false;
     }
@@ -79,7 +78,7 @@
 {
     Target target = weights_managers.target;
 
-    if(_weights_managers.find(target) != std::end(_weights_managers))
+    if (_weights_managers.find(target) != std::end(_weights_managers))
     {
         return false;
     }
@@ -102,17 +101,17 @@
 void GraphContext::finalize()
 {
     const size_t num_pools = 1;
-    for(auto &mm_obj : _memory_managers)
+    for (auto &mm_obj : _memory_managers)
     {
         ARM_COMPUTE_ERROR_ON(!mm_obj.second.allocator);
 
         // Finalize intra layer memory manager
-        if(mm_obj.second.intra_mm != nullptr)
+        if (mm_obj.second.intra_mm != nullptr)
         {
             mm_obj.second.intra_mm->populate(*mm_obj.second.allocator, num_pools);
         }
         // Finalize cross layer memory manager
-        if(mm_obj.second.cross_mm != nullptr)
+        if (mm_obj.second.cross_mm != nullptr)
         {
             mm_obj.second.cross_mm->populate(*mm_obj.second.allocator, num_pools);
         }
diff --git a/src/graph/GraphManager.cpp b/src/graph/GraphManager.cpp
index 45b608c..58ae60d 100644
--- a/src/graph/GraphManager.cpp
+++ b/src/graph/GraphManager.cpp
@@ -23,15 +23,15 @@
  */
 #include "arm_compute/graph/GraphManager.h"
 
+#include "arm_compute/graph/algorithms/TopologicalSort.h"
+#include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h"
+#include "arm_compute/graph/detail/ExecutionHelpers.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/GraphContext.h"
 #include "arm_compute/graph/Logger.h"
 #include "arm_compute/graph/PassManager.h"
 #include "arm_compute/graph/TypePrinter.h"
 #include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/algorithms/TopologicalSort.h"
-#include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h"
-#include "arm_compute/graph/detail/ExecutionHelpers.h"
 
 #include "src/common/utils/Log.h"
 
@@ -39,8 +39,7 @@
 {
 namespace graph
 {
-GraphManager::GraphManager()
-    : _workloads()
+GraphManager::GraphManager() : _workloads()
 {
 }
 
@@ -49,7 +48,7 @@
     ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Initiate graph configuration!");
 
     // Check if graph has been registered
-    if(_workloads.find(graph.id()) != std::end(_workloads))
+    if (_workloads.find(graph.id()) != std::end(_workloads))
     {
         ARM_COMPUTE_ERROR("Graph is already registered!");
     }
@@ -62,7 +61,7 @@
 
     // In case CLVK is selected, use the CL backend and
     // update config
-    if(target == Target::CLVK)
+    if (target == Target::CLVK)
     {
         forced_target       = Target::CL;
         GraphConfig config  = ctx.config();
@@ -71,7 +70,7 @@
         ctx.set_config(config);
     }
 
-    if(!is_target_supported(target))
+    if (!is_target_supported(target))
     {
         forced_target = get_default_target();
         ARM_COMPUTE_LOG_GRAPH_INFO("Switching target from " << target << " to " << forced_target << std::endl);
@@ -105,7 +104,7 @@
     detail::prepare_all_tasks(workload);
 
     // Setup tensor memory (Allocate all tensors or setup transition manager)
-    if(ctx.config().use_transition_memory_manager)
+    if (ctx.config().use_transition_memory_manager)
     {
         detail::configure_transition_manager(graph, ctx, workload);
     }
@@ -130,10 +129,10 @@
     auto it = _workloads.find(graph.id());
     ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_workloads), "Graph is not registered!");
 
-    while(true)
+    while (true)
     {
         // Call input accessors
-        if(!detail::call_all_input_node_accessors(it->second))
+        if (!detail::call_all_input_node_accessors(it->second))
         {
             return;
         }
@@ -142,7 +141,7 @@
         detail::call_all_tasks(it->second);
 
         // Call output accessors
-        if(!detail::call_all_output_node_accessors(it->second))
+        if (!detail::call_all_output_node_accessors(it->second))
         {
             return;
         }
@@ -157,4 +156,4 @@
     _workloads.erase(it);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/INode.cpp b/src/graph/INode.cpp
index 70fe44e..83c3ef7 100644
--- a/src/graph/INode.cpp
+++ b/src/graph/INode.cpp
@@ -75,17 +75,17 @@
 
 void INode::set_output_tensor(TensorID tid, size_t idx)
 {
-    if(tid != NullTensorID && (idx < _outputs.size()) && (_graph->tensor(tid) != nullptr))
+    if (tid != NullTensorID && (idx < _outputs.size()) && (_graph->tensor(tid) != nullptr))
     {
         ARM_COMPUTE_ERROR_ON(_graph == nullptr);
         Tensor *updated_tensor = _graph->tensor(tid);
         _outputs[idx]          = tid;
 
         // Set tensor to all output edges of the node
-        for(auto &output_edge_id : _output_edges)
+        for (auto &output_edge_id : _output_edges)
         {
             auto output_edge = _graph->edge(output_edge_id);
-            if(output_edge != nullptr)
+            if (output_edge != nullptr)
             {
                 // Unbind edge from current tensor
                 auto current_output_tensor = output_edge->tensor();
diff --git a/src/graph/INodeVisitor.cpp b/src/graph/INodeVisitor.cpp
index 5369f6f..90b2e33 100644
--- a/src/graph/INodeVisitor.cpp
+++ b/src/graph/INodeVisitor.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/graph/INodeVisitor.h"
+
 #include "arm_compute/graph/nodes/Nodes.h"
 
 namespace arm_compute
diff --git a/src/graph/PassManager.cpp b/src/graph/PassManager.cpp
index f7e214c..9a889e1 100644
--- a/src/graph/PassManager.cpp
+++ b/src/graph/PassManager.cpp
@@ -29,8 +29,7 @@
 {
 namespace graph
 {
-PassManager::PassManager()
-    : _passes()
+PassManager::PassManager() : _passes()
 {
 }
 
@@ -46,7 +45,7 @@
 
 void PassManager::append(std::unique_ptr<IGraphMutator> pass, bool conditional)
 {
-    if(pass && conditional)
+    if (pass && conditional)
     {
         ARM_COMPUTE_LOG_GRAPH_VERBOSE("Appending mutating pass : " << pass->name() << std::endl);
         _passes.push_back(std::move(pass));
@@ -60,9 +59,9 @@
 
 void PassManager::run_all(Graph &g)
 {
-    for(auto &pass : _passes)
+    for (auto &pass : _passes)
     {
-        if(pass)
+        if (pass)
         {
             ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl);
             pass->mutate(g);
@@ -72,9 +71,9 @@
 
 void PassManager::run_type(Graph &g, IGraphMutator::MutationType type)
 {
-    for(auto &pass : _passes)
+    for (auto &pass : _passes)
     {
-        if(pass && (pass->type() == type))
+        if (pass && (pass->type() == type))
         {
             ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl);
             pass->mutate(g);
@@ -84,17 +83,17 @@
 
 void PassManager::run_index(Graph &g, size_t index)
 {
-    if(index >= _passes.size())
+    if (index >= _passes.size())
     {
         return;
     }
 
     auto &pass = _passes.at(index);
-    if(pass != nullptr)
+    if (pass != nullptr)
     {
         ARM_COMPUTE_LOG_GRAPH_INFO("Running mutating pass : " << pass->name() << std::endl);
         pass->mutate(g);
     }
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/Tensor.cpp b/src/graph/Tensor.cpp
index 3d47234..72679c4 100644
--- a/src/graph/Tensor.cpp
+++ b/src/graph/Tensor.cpp
@@ -75,20 +75,20 @@
 bool Tensor::call_accessor()
 {
     // Early exit guard
-    if(!_accessor || !_handle)
+    if (!_accessor || !_handle)
     {
         return false;
     }
 
     const bool access_data = _accessor->access_tensor_data();
 
-    if(access_data)
+    if (access_data)
     {
         // Map tensor
         _handle->map(true);
 
         // Return in case of null backend buffer
-        if(_handle->tensor().buffer() == nullptr)
+        if (_handle->tensor().buffer() == nullptr)
         {
             return false;
         }
@@ -97,7 +97,7 @@
     // Call accessor
     bool retval = _accessor->access_tensor(_handle->tensor());
 
-    if(access_data)
+    if (access_data)
     {
         // Unmap tensor
         _handle->unmap();
diff --git a/src/graph/TypeLoader.cpp b/src/graph/TypeLoader.cpp
index 3c51289..e1248fb 100644
--- a/src/graph/TypeLoader.cpp
+++ b/src/graph/TypeLoader.cpp
@@ -31,10 +31,9 @@
 {
 arm_compute::DataLayout data_layout_from_name(const std::string &name)
 {
-    static const std::map<std::string, arm_compute::DataLayout> data_layouts =
-    {
-        { "nhwc", DataLayout::NHWC },
-        { "nchw", DataLayout::NCHW },
+    static const std::map<std::string, arm_compute::DataLayout> data_layouts = {
+        {"nhwc", DataLayout::NHWC},
+        {"nchw", DataLayout::NCHW},
     };
 
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -45,7 +44,7 @@
 
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
-    catch(const std::out_of_range &)
+    catch (const std::out_of_range &)
     {
         throw std::invalid_argument(name);
     }
@@ -55,11 +54,10 @@
 {
 Target target_from_name(const std::string &name)
 {
-    static const std::map<std::string, Target> targets =
-    {
-        { "neon", Target::NEON },
-        { "cl", Target::CL },
-        { "clvk", Target::CLVK },
+    static const std::map<std::string, Target> targets = {
+        {"neon", Target::NEON},
+        {"cl", Target::CL},
+        {"clvk", Target::CLVK},
     };
 
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -70,7 +68,7 @@
 
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
-    catch(const std::out_of_range &)
+    catch (const std::out_of_range &)
     {
         throw std::invalid_argument(name);
     }
@@ -79,12 +77,11 @@
 
 ConvolutionMethod Convolution_method_from_name(const std::string &name)
 {
-    static const std::map<std::string, ConvolutionMethod> methods =
-    {
-        { "default", ConvolutionMethod::Default },
-        { "direct", ConvolutionMethod::Direct },
-        { "gemm", ConvolutionMethod::GEMM },
-        { "winograd", ConvolutionMethod::Winograd },
+    static const std::map<std::string, ConvolutionMethod> methods = {
+        {"default", ConvolutionMethod::Default},
+        {"direct", ConvolutionMethod::Direct},
+        {"gemm", ConvolutionMethod::GEMM},
+        {"winograd", ConvolutionMethod::Winograd},
     };
 
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -95,7 +92,7 @@
 
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
-    catch(const std::out_of_range &)
+    catch (const std::out_of_range &)
     {
         throw std::invalid_argument(name);
     }
@@ -104,10 +101,9 @@
 
 DepthwiseConvolutionMethod depthwise_convolution_method_from_name(const std::string &name)
 {
-    static const std::map<std::string, DepthwiseConvolutionMethod> methods =
-    {
-        { "default", DepthwiseConvolutionMethod::Default },
-        { "optimized3x3", DepthwiseConvolutionMethod::Optimized3x3 },
+    static const std::map<std::string, DepthwiseConvolutionMethod> methods = {
+        {"default", DepthwiseConvolutionMethod::Default},
+        {"optimized3x3", DepthwiseConvolutionMethod::Optimized3x3},
     };
 
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
@@ -118,7 +114,7 @@
 
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
-    catch(const std::out_of_range &)
+    catch (const std::out_of_range &)
     {
         throw std::invalid_argument(name);
     }
diff --git a/src/graph/Utils.cpp b/src/graph/Utils.cpp
index dcab177..452d8ec 100644
--- a/src/graph/Utils.cpp
+++ b/src/graph/Utils.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/graph/Utils.h"
 
-#include "arm_compute/graph/GraphContext.h"
 #include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/GraphContext.h"
 #include "arm_compute/graph/mutators/GraphMutators.h"
 
 namespace arm_compute
@@ -33,16 +33,17 @@
 {
 bool is_target_supported(Target target)
 {
-    return backends::BackendRegistry::get().contains(target) && backends::BackendRegistry::get().find_backend(target)->is_backend_supported();
+    return backends::BackendRegistry::get().contains(target) &&
+           backends::BackendRegistry::get().find_backend(target)->is_backend_supported();
 }
 
 Target get_default_target()
 {
-    if(is_target_supported(Target::NEON))
+    if (is_target_supported(Target::NEON))
     {
         return Target::NEON;
     }
-    if(is_target_supported(Target::CL))
+    if (is_target_supported(Target::CL))
     {
         return Target::CL;
     }
@@ -52,18 +53,18 @@
 void force_target_to_graph(Graph &g, Target target)
 {
     auto &nodes = g.nodes();
-    for(auto &node : nodes)
+    for (auto &node : nodes)
     {
-        if(node)
+        if (node)
         {
             node->set_assigned_target(target);
         }
     }
 
     auto &tensors = g.tensors();
-    for(auto &tensor : tensors)
+    for (auto &tensor : tensors)
     {
-        if(tensor)
+        if (tensor)
         {
             tensor->desc().target = target;
         }
@@ -76,9 +77,9 @@
     PassManager pm;
 
     // Passes that mutate graph IR
-    if(cfg.use_synthetic_type)
+    if (cfg.use_synthetic_type)
     {
-        switch(cfg.synthetic_type)
+        switch (cfg.synthetic_type)
         {
             case DataType::QASYMM8:
             case DataType::QASYMM8_SIGNED:
@@ -107,9 +108,9 @@
 
 void release_default_graph_context(GraphContext &ctx)
 {
-    for(const auto &backend : backends::BackendRegistry::get().backends())
+    for (const auto &backend : backends::BackendRegistry::get().backends())
     {
-        if(backend.second->is_backend_supported())
+        if (backend.second->is_backend_supported())
         {
             backend.second->release_backend_context(ctx);
         }
@@ -118,9 +119,9 @@
 
 void sync_backends()
 {
-    for(const auto &backend : backends::BackendRegistry::get().backends())
+    for (const auto &backend : backends::BackendRegistry::get().backends())
     {
-        if(backend.second->backend_allocator())
+        if (backend.second->backend_allocator())
         {
             backend.second->sync();
         }
@@ -129,10 +130,10 @@
 
 void setup_requested_backend_context(GraphContext &ctx, Target target)
 {
-    if(backends::BackendRegistry::get().contains(target))
+    if (backends::BackendRegistry::get().contains(target))
     {
         const auto &backend = backends::BackendRegistry::get().find_backend(target);
-        if(backend->is_backend_supported())
+        if (backend->is_backend_supported())
         {
             backend->setup_backend_context(ctx);
         }
@@ -141,20 +142,22 @@
 
 size_t get_dimension_size(const TensorDescriptor &descriptor, const DataLayoutDimension data_layout_dimension)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
+    ARM_COMPUTE_ERROR_ON_MSG(descriptor.layout == DataLayout::UNKNOWN,
+                             "Cannot retrieve the dimension index for an unknown layout!");
     return descriptor.shape[get_dimension_idx(descriptor.layout, data_layout_dimension)];
 }
 
 size_t get_dimension_idx(DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN, "Cannot retrieve the dimension index for an unknown layout!");
+    ARM_COMPUTE_ERROR_ON_MSG(data_layout == DataLayout::UNKNOWN,
+                             "Cannot retrieve the dimension index for an unknown layout!");
 
     /* Return the index based on the data layout
      * [N C H W]
      * [3 2 1 0]
      * [N H W C]
      */
-    switch(data_layout_dimension)
+    switch (data_layout_dimension)
     {
         case DataLayoutDimension::CHANNEL:
             return (data_layout == DataLayout::NCHW) ? 2 : 0;
@@ -181,13 +184,13 @@
     const Graph *g = node.graph();
     ARM_COMPUTE_ERROR_ON(g == nullptr);
 
-    for(auto &output_edge_id : node.output_edges())
+    for (auto &output_edge_id : node.output_edges())
     {
         auto output_edge = g->edge(output_edge_id);
-        if(output_edge != nullptr)
+        if (output_edge != nullptr)
         {
             ARM_COMPUTE_ERROR_ON(output_edge->consumer() == nullptr);
-            driving_nodes.push_back({ output_edge->consumer_id(), output_edge->consumer_idx() });
+            driving_nodes.push_back({output_edge->consumer_id(), output_edge->consumer_idx()});
         }
     }
 
@@ -201,13 +204,13 @@
     const Graph *g = node.graph();
     ARM_COMPUTE_ERROR_ON(g == nullptr);
 
-    for(auto &input_edge_id : node.input_edges())
+    for (auto &input_edge_id : node.input_edges())
     {
         auto input_edge = g->edge(input_edge_id);
-        if(input_edge != nullptr)
+        if (input_edge != nullptr)
         {
             ARM_COMPUTE_ERROR_ON(input_edge->producer() == nullptr);
-            driver_nodes.push_back({ input_edge->producer_id(), input_edge->producer_idx() });
+            driver_nodes.push_back({input_edge->producer_id(), input_edge->producer_idx()});
         }
     }
 
@@ -216,7 +219,7 @@
 
 void configure_tensor(Tensor *tensor)
 {
-    if(tensor != nullptr && tensor->handle() == nullptr)
+    if (tensor != nullptr && tensor->handle() == nullptr)
     {
         Target                         target  = tensor->desc().target;
         backends::IDeviceBackend      &backend = backends::BackendRegistry::get().get_backend(target);
diff --git a/src/graph/Workload.cpp b/src/graph/Workload.cpp
index b9d5729..9dddad7 100644
--- a/src/graph/Workload.cpp
+++ b/src/graph/Workload.cpp
@@ -40,12 +40,12 @@
 
 void execute_task(ExecutionTask &task)
 {
-    if(task.task)
+    if (task.task)
     {
         task.task->run();
     }
 #ifdef ARM_COMPUTE_ASSERTS_ENABLED
-    else if(task.node->type() == NodeType::PrintLayer)
+    else if (task.node->type() == NodeType::PrintLayer)
     {
         auto print_node   = utils::cast::polymorphic_downcast<PrintLayerNode *>(task.node);
         auto input_handle = print_node->input(0)->handle();
@@ -61,14 +61,13 @@
 
 void ExecutionTask::prepare()
 {
-    if(task)
+    if (task)
     {
         task->prepare();
     }
 }
 
-TaskExecutor::TaskExecutor()
-    : execute_function(execute_task)
+TaskExecutor::TaskExecutor() : execute_function(execute_task)
 {
 }
 
@@ -78,4 +77,4 @@
     return executor;
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/algorithms/TopologicalSort.cpp b/src/graph/algorithms/TopologicalSort.cpp
index 3a69352..08e14e1 100644
--- a/src/graph/algorithms/TopologicalSort.cpp
+++ b/src/graph/algorithms/TopologicalSort.cpp
@@ -50,14 +50,14 @@
     ARM_COMPUTE_ERROR_ON(graph == nullptr);
 
     bool are_all_visited = true;
-    for(const auto &input_edge_id : node->input_edges())
+    for (const auto &input_edge_id : node->input_edges())
     {
-        if(input_edge_id != EmptyNodeID)
+        if (input_edge_id != EmptyNodeID)
         {
             const Edge *input_edge = graph->edge(input_edge_id);
             ARM_COMPUTE_ERROR_ON(input_edge == nullptr);
             ARM_COMPUTE_ERROR_ON(input_edge->producer() == nullptr);
-            if(!visited[input_edge->producer_id()])
+            if (!visited[input_edge->producer_id()])
             {
                 are_all_visited = false;
                 break;
@@ -80,9 +80,9 @@
     std::list<NodeID> queue;
 
     // Push inputs and mark as visited
-    for(auto &input : g.nodes(NodeType::Input))
+    for (auto &input : g.nodes(NodeType::Input))
     {
-        if(input != EmptyNodeID)
+        if (input != EmptyNodeID)
         {
             visited[input] = true;
             queue.push_back(input);
@@ -90,9 +90,9 @@
     }
 
     // Push const nodes and mark as visited
-    for(auto &const_node : g.nodes(NodeType::Const))
+    for (auto &const_node : g.nodes(NodeType::Const))
     {
-        if(const_node != EmptyNodeID)
+        if (const_node != EmptyNodeID)
         {
             visited[const_node] = true;
             queue.push_back(const_node);
@@ -100,7 +100,7 @@
     }
 
     // Iterate over vector and edges
-    while(!queue.empty())
+    while (!queue.empty())
     {
         // Dequeue a node from queue and process
         NodeID n = queue.front();
@@ -109,11 +109,11 @@
 
         const INode *node = g.node(n);
         ARM_COMPUTE_ERROR_ON(node == nullptr);
-        for(const auto &eid : node->output_edges())
+        for (const auto &eid : node->output_edges())
         {
             const Edge *e = g.edge(eid);
             ARM_COMPUTE_ERROR_ON(e == nullptr);
-            if(!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited))
+            if (!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited))
             {
                 visited[e->consumer_id()] = true;
                 queue.push_back(e->consumer_id());
@@ -135,9 +135,9 @@
     std::stack<NodeID> stack;
 
     // Push inputs and mark as visited
-    for(auto &input : g.nodes(NodeType::Input))
+    for (auto &input : g.nodes(NodeType::Input))
     {
-        if(input != EmptyNodeID)
+        if (input != EmptyNodeID)
         {
             visited[input] = true;
             stack.push(input);
@@ -145,9 +145,9 @@
     }
 
     // Push const nodes and mark as visited
-    for(auto &const_node : g.nodes(NodeType::Const))
+    for (auto &const_node : g.nodes(NodeType::Const))
     {
-        if(const_node != EmptyNodeID)
+        if (const_node != EmptyNodeID)
         {
             visited[const_node] = true;
             stack.push(const_node);
@@ -155,7 +155,7 @@
     }
 
     // Iterate over stack and edges
-    while(!stack.empty())
+    while (!stack.empty())
     {
         // Pop a node from stack and process
         NodeID n = stack.top();
@@ -163,7 +163,7 @@
         stack.pop();
 
         // Mark node as visited
-        if(!visited[n])
+        if (!visited[n])
         {
             visited[n] = true;
         }
@@ -171,11 +171,11 @@
         const INode *node = g.node(n);
         ARM_COMPUTE_ERROR_ON(node == nullptr);
         // Reverse iterate to push branches from right to left and pop in the opposite order
-        for(const auto &eid : arm_compute::utils::iterable::reverse_iterate(node->output_edges()))
+        for (const auto &eid : arm_compute::utils::iterable::reverse_iterate(node->output_edges()))
         {
             const Edge *e = g.edge(eid);
             ARM_COMPUTE_ERROR_ON(e == nullptr);
-            if(!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited))
+            if (!visited[e->consumer_id()] && detail::all_inputs_are_visited(e->consumer(), visited))
             {
                 stack.push(e->consumer_id());
             }
diff --git a/src/graph/backends/BackendRegistry.cpp b/src/graph/backends/BackendRegistry.cpp
index 46b4f99..bb6af79 100644
--- a/src/graph/backends/BackendRegistry.cpp
+++ b/src/graph/backends/BackendRegistry.cpp
@@ -31,8 +31,7 @@
 {
 namespace backends
 {
-BackendRegistry::BackendRegistry()
-    : _registered_backends()
+BackendRegistry::BackendRegistry() : _registered_backends()
 {
 }
 
diff --git a/src/graph/backends/CL/CLDeviceBackend.cpp b/src/graph/backends/CL/CLDeviceBackend.cpp
index 01e5ab1..e27a410 100644
--- a/src/graph/backends/CL/CLDeviceBackend.cpp
+++ b/src/graph/backends/CL/CLDeviceBackend.cpp
@@ -23,18 +23,17 @@
  */
 #include "arm_compute/graph/backends/CL/CLDeviceBackend.h"
 
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/GraphContext.h"
-#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/graph/backends/BackendRegistrar.h"
 #include "arm_compute/graph/backends/CL/CLFunctionFactory.h"
 #include "arm_compute/graph/backends/CL/CLNodeValidator.h"
 #include "arm_compute/graph/backends/CL/CLSubTensorHandle.h"
 #include "arm_compute/graph/backends/CL/CLTensorHandle.h"
-
-#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/Tensor.h"
 #include "arm_compute/runtime/BlobLifetimeManager.h"
 #include "arm_compute/runtime/CL/CLBufferAllocator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
@@ -64,7 +63,12 @@
 static detail::BackendRegistrar<CLDeviceBackend> CLDeviceBackend_registrar(Target::CL);
 
 CLDeviceBackend::CLDeviceBackend()
-    : _context_count(0), _tuner(), _gemm_heuristics(), _allocator(nullptr), _tuner_file(), _backend_type(CLBackendType::Native)
+    : _context_count(0),
+      _tuner(),
+      _gemm_heuristics(),
+      _allocator(nullptr),
+      _tuner_file(),
+      _backend_type(CLBackendType::Native)
 {
 }
 
@@ -95,7 +99,7 @@
 {
     ARM_COMPUTE_UNUSED(ctx);
     _context_count--;
-    if(_context_count == 0) // No more contexts using the backend: free resources
+    if (_context_count == 0) // No more contexts using the backend: free resources
     {
         _allocator = nullptr;
     }
@@ -105,7 +109,7 @@
 {
     // Force backend initialization
     _context_count++;
-    if(_context_count == 1)
+    if (_context_count == 1)
     {
         _backend_type = ctx.config().backend_type;
         initialize_backend();
@@ -115,7 +119,7 @@
     _tuner_file = ctx.config().tuner_file;
 
     // Load tuner data if available
-    if(file_exists(_tuner_file))
+    if (file_exists(_tuner_file))
     {
         _tuner.load_from_file(_tuner_file);
     }
@@ -128,7 +132,7 @@
     CLScheduler::get().gemm_heuristics()->reload_from_file(ctx.config().mlgo_file);
 
     // Setup a management backend
-    if(ctx.memory_management_ctx(Target::CL) == nullptr)
+    if (ctx.memory_management_ctx(Target::CL) == nullptr)
     {
         MemoryManagerContext mm_ctx;
         mm_ctx.target      = Target::CL;
@@ -141,7 +145,7 @@
     }
 
     // Create function level weights manager
-    if(ctx.weights_management_ctx(Target::CL) == nullptr)
+    if (ctx.weights_management_ctx(Target::CL) == nullptr)
     {
         WeightsManagerContext wm_ctx;
         wm_ctx.target = Target::CL;
@@ -174,9 +178,10 @@
     return std::make_unique<CLTensorHandle>(info);
 }
 
-std::unique_ptr<ITensorHandle> CLDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
+std::unique_ptr<ITensorHandle>
+CLDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
 {
-    if(parent == nullptr)
+    if (parent == nullptr)
     {
         return nullptr;
     }
@@ -203,7 +208,7 @@
 
 std::shared_ptr<arm_compute::IMemoryManager> CLDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity)
 {
-    if(affinity == MemoryManagerAffinity::Offset)
+    if (affinity == MemoryManagerAffinity::Offset)
     {
         ARM_COMPUTE_LOG_GRAPH_WARNING("CL Backend does not support offset affinity memory management!");
         return nullptr;
diff --git a/src/graph/backends/CL/CLFunctionsFactory.cpp b/src/graph/backends/CL/CLFunctionsFactory.cpp
index 8828104..d4e1aa8 100644
--- a/src/graph/backends/CL/CLFunctionsFactory.cpp
+++ b/src/graph/backends/CL/CLFunctionsFactory.cpp
@@ -22,12 +22,12 @@
  * SOFTWARE.
  */
 #include "arm_compute/graph/backends/CL/CLFunctionFactory.h"
-
+#include "arm_compute/graph/backends/FunctionHelpers.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/GraphContext.h"
-#include "arm_compute/graph/backends/FunctionHelpers.h"
 #include "arm_compute/runtime/CL/CLFunctions.h"
 #include "arm_compute/runtime/CPP/CPPFunctions.h"
+
 #include "src/core/CL/CLKernels.h"
 #include "support/Cast.h"
 
@@ -89,20 +89,19 @@
 {
 public:
     /* Default constructor */
-    CPPWrapperFunction()
-        : _tensors(), _func(nullptr)
+    CPPWrapperFunction() : _tensors(), _func(nullptr)
     {
     }
 
     void run() override
     {
-        for(auto &tensor : _tensors)
+        for (auto &tensor : _tensors)
         {
             tensor->map(CLScheduler::get().queue());
         }
         _func->run();
 
-        for(auto &tensor : _tensors)
+        for (auto &tensor : _tensors)
         {
             tensor->unmap(CLScheduler::get().queue());
         }
@@ -127,7 +126,8 @@
 {
 // Specialized functions
 template <>
-std::unique_ptr<IFunction> create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(DetectionOutputLayerNode &node)
+std::unique_ptr<IFunction>
+create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(DetectionOutputLayerNode &node)
 {
     validate_node<CLTargetInfo>(node, 3 /* expected inputs */, 1 /* expected outputs */);
 
@@ -149,16 +149,12 @@
 
     // Log info
     ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
-                               << node.name()
-                               << " Type: " << node.type()
-                               << " Target: " << CLTargetInfo::TargetType
-                               << " Data Type: " << input0->info()->data_type()
-                               << " Input0 shape: " << input0->info()->tensor_shape()
-                               << " Input1 shape: " << input1->info()->tensor_shape()
+                               << node.name() << " Type: " << node.type() << " Target: " << CLTargetInfo::TargetType
+                               << " Data Type: " << input0->info()->data_type() << " Input0 shape: "
+                               << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape()
                                << " Input2 shape: " << input2->info()->tensor_shape()
                                << " Output shape: " << output->info()->tensor_shape()
-                               << " DetectionOutputLayer info: " << detect_info
-                               << std::endl);
+                               << " DetectionOutputLayer info: " << detect_info << std::endl);
 
     auto wrap_function = std::make_unique<CPPWrapperFunction>();
 
@@ -171,7 +167,8 @@
     return std::move(wrap_function);
 }
 template <>
-std::unique_ptr<IFunction> create_detection_post_process_layer<CPPDetectionPostProcessLayer, CLTargetInfo>(DetectionPostProcessLayerNode &node)
+std::unique_ptr<IFunction>
+create_detection_post_process_layer<CPPDetectionPostProcessLayer, CLTargetInfo>(DetectionPostProcessLayerNode &node)
 {
     validate_node<CLTargetInfo>(node, 3 /* expected inputs */, 4 /* expected outputs */);
 
@@ -199,19 +196,15 @@
 
     // Log info
     ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
-                               << node.name()
-                               << " Type: " << node.type()
-                               << " Target: " << CLTargetInfo::TargetType
-                               << " Data Type: " << input0->info()->data_type()
-                               << " Input0 shape: " << input0->info()->tensor_shape()
-                               << " Input1 shape: " << input1->info()->tensor_shape()
+                               << node.name() << " Type: " << node.type() << " Target: " << CLTargetInfo::TargetType
+                               << " Data Type: " << input0->info()->data_type() << " Input0 shape: "
+                               << input0->info()->tensor_shape() << " Input1 shape: " << input1->info()->tensor_shape()
                                << " Input2 shape: " << input2->info()->tensor_shape()
                                << " Output0 shape: " << output0->info()->tensor_shape()
                                << " Output1 shape: " << output1->info()->tensor_shape()
                                << " Output2 shape: " << output2->info()->tensor_shape()
                                << " Output3 shape: " << output3->info()->tensor_shape()
-                               << " DetectionPostProcessLayer info: " << detect_info
-                               << std::endl);
+                               << " DetectionPostProcessLayer info: " << detect_info << std::endl);
 
     auto wrap_function = std::make_unique<CPPWrapperFunction>();
 
@@ -230,92 +223,128 @@
 
 std::unique_ptr<IFunction> CLFunctionFactory::create(INode *node, GraphContext &ctx)
 {
-    if(node == nullptr)
+    if (node == nullptr)
     {
         return nullptr;
     }
 
     NodeType type = node->type();
-    switch(type)
+    switch (type)
     {
         case NodeType::ActivationLayer:
-            return detail::create_activation_layer<CLActivationLayer, CLTargetInfo>(*polymorphic_downcast<ActivationLayerNode *>(node));
+            return detail::create_activation_layer<CLActivationLayer, CLTargetInfo>(
+                *polymorphic_downcast<ActivationLayerNode *>(node));
         case NodeType::ArgMinMaxLayer:
-            return detail::create_arg_min_max_layer<CLArgMinMaxLayer, CLTargetInfo>(*polymorphic_downcast<ArgMinMaxLayerNode *>(node));
+            return detail::create_arg_min_max_layer<CLArgMinMaxLayer, CLTargetInfo>(
+                *polymorphic_downcast<ArgMinMaxLayerNode *>(node));
         case NodeType::BatchNormalizationLayer:
-            return detail::create_batch_normalization_layer<CLBatchNormalizationLayer, CLTargetInfo>(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+            return detail::create_batch_normalization_layer<CLBatchNormalizationLayer, CLTargetInfo>(
+                *polymorphic_downcast<BatchNormalizationLayerNode *>(node));
         case NodeType::BoundingBoxTransformLayer:
-            return detail::create_bounding_box_transform_layer<CLBoundingBoxTransform, CLTargetInfo>(*polymorphic_downcast<BoundingBoxTransformLayerNode *>(node));
+            return detail::create_bounding_box_transform_layer<CLBoundingBoxTransform, CLTargetInfo>(
+                *polymorphic_downcast<BoundingBoxTransformLayerNode *>(node));
         case NodeType::ChannelShuffleLayer:
-            return detail::create_channel_shuffle_layer<CLChannelShuffleLayer, CLTargetInfo>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
+            return detail::create_channel_shuffle_layer<CLChannelShuffleLayer, CLTargetInfo>(
+                *polymorphic_downcast<ChannelShuffleLayerNode *>(node));
         case NodeType::ConvolutionLayer:
-            return detail::create_convolution_layer<CLConvolutionLayerFunctions, CLTargetInfo>(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
+            return detail::create_convolution_layer<CLConvolutionLayerFunctions, CLTargetInfo>(
+                *polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
         case NodeType::DeconvolutionLayer:
-            return detail::create_deconvolution_layer<CLDeconvolutionLayer, CLTargetInfo>(*polymorphic_downcast<DeconvolutionLayerNode *>(node), ctx);
+            return detail::create_deconvolution_layer<CLDeconvolutionLayer, CLTargetInfo>(
+                *polymorphic_downcast<DeconvolutionLayerNode *>(node), ctx);
         case NodeType::ConcatenateLayer:
-            return detail::create_concatenate_layer<CLConcatenateLayer, CLTargetInfo>(*polymorphic_downcast<ConcatenateLayerNode *>(node));
+            return detail::create_concatenate_layer<CLConcatenateLayer, CLTargetInfo>(
+                *polymorphic_downcast<ConcatenateLayerNode *>(node));
         case NodeType::DepthToSpaceLayer:
-            return detail::create_depth_to_space_layer<CLDepthToSpaceLayer, CLTargetInfo>(*polymorphic_downcast<DepthToSpaceLayerNode *>(node));
+            return detail::create_depth_to_space_layer<CLDepthToSpaceLayer, CLTargetInfo>(
+                *polymorphic_downcast<DepthToSpaceLayerNode *>(node));
         case NodeType::DepthwiseConvolutionLayer:
-            return detail::create_depthwise_convolution_layer<CLDepthwiseConvolutionLayer, CLTargetInfo>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+            return detail::create_depthwise_convolution_layer<CLDepthwiseConvolutionLayer, CLTargetInfo>(
+                *polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
         case NodeType::DequantizationLayer:
-            return detail::create_dequantization_layer<CLDequantizationLayer, CLTargetInfo>(*polymorphic_downcast<DequantizationLayerNode *>(node));
+            return detail::create_dequantization_layer<CLDequantizationLayer, CLTargetInfo>(
+                *polymorphic_downcast<DequantizationLayerNode *>(node));
         case NodeType::DetectionOutputLayer:
-            return detail::create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
+            return detail::create_detection_output_layer<CPPDetectionOutputLayer, CLTargetInfo>(
+                *polymorphic_downcast<DetectionOutputLayerNode *>(node));
         case NodeType::DetectionPostProcessLayer:
-            return detail::create_detection_post_process_layer<CPPDetectionPostProcessLayer, CLTargetInfo>(*polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
+            return detail::create_detection_post_process_layer<CPPDetectionPostProcessLayer, CLTargetInfo>(
+                *polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
         case NodeType::EltwiseLayer:
-            return detail::create_eltwise_layer<CLEltwiseFunctions, CLTargetInfo>(*polymorphic_downcast<EltwiseLayerNode *>(node));
+            return detail::create_eltwise_layer<CLEltwiseFunctions, CLTargetInfo>(
+                *polymorphic_downcast<EltwiseLayerNode *>(node));
         case NodeType::UnaryEltwiseLayer:
-            return detail::create_unary_eltwise_layer<CLUnaryEltwiseFunctions, CLTargetInfo>(*polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
+            return detail::create_unary_eltwise_layer<CLUnaryEltwiseFunctions, CLTargetInfo>(
+                *polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
         case NodeType::FlattenLayer:
-            return detail::create_flatten_layer<CLFlattenLayer, CLTargetInfo>(*polymorphic_downcast<FlattenLayerNode *>(node));
+            return detail::create_flatten_layer<CLFlattenLayer, CLTargetInfo>(
+                *polymorphic_downcast<FlattenLayerNode *>(node));
         case NodeType::FullyConnectedLayer:
-            return detail::create_fully_connected_layer<CLFullyConnectedLayer, CLTargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+            return detail::create_fully_connected_layer<CLFullyConnectedLayer, CLTargetInfo>(
+                *polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
         case NodeType::FusedConvolutionBatchNormalizationLayer:
-            return detail::create_fused_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx);
+            return detail::create_fused_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(
+                *polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx);
         case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer:
-            return detail::create_fused_depthwise_convolution_batch_normalization_layer<CLFusedLayerTypes, CLTargetInfo>(*polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx);
+            return detail::create_fused_depthwise_convolution_batch_normalization_layer<CLFusedLayerTypes,
+                                                                                        CLTargetInfo>(
+                *polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx);
         case NodeType::GenerateProposalsLayer:
-            return detail::create_generate_proposals_layer<CLGenerateProposalsLayer, CLTargetInfo>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node), ctx);
+            return detail::create_generate_proposals_layer<CLGenerateProposalsLayer, CLTargetInfo>(
+                *polymorphic_downcast<GenerateProposalsLayerNode *>(node), ctx);
         case NodeType::L2NormalizeLayer:
-            return detail::create_l2_normalize_layer<CLL2NormalizeLayer, CLTargetInfo>(*polymorphic_downcast<L2NormalizeLayerNode *>(node), ctx);
+            return detail::create_l2_normalize_layer<CLL2NormalizeLayer, CLTargetInfo>(
+                *polymorphic_downcast<L2NormalizeLayerNode *>(node), ctx);
         case NodeType::NormalizationLayer:
-            return detail::create_normalization_layer<CLNormalizationLayer, CLTargetInfo>(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
+            return detail::create_normalization_layer<CLNormalizationLayer, CLTargetInfo>(
+                *polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
         case NodeType::NormalizePlanarYUVLayer:
-            return detail::create_normalize_planar_yuv_layer<CLNormalizePlanarYUVLayer, CLTargetInfo>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
+            return detail::create_normalize_planar_yuv_layer<CLNormalizePlanarYUVLayer, CLTargetInfo>(
+                *polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
         case NodeType::PadLayer:
             return detail::create_pad_layer<CLPadLayer, CLTargetInfo>(*polymorphic_downcast<PadLayerNode *>(node));
         case NodeType::PermuteLayer:
-            return detail::create_permute_layer<CLPermute, CLTargetInfo>(*polymorphic_downcast<PermuteLayerNode *>(node));
+            return detail::create_permute_layer<CLPermute, CLTargetInfo>(
+                *polymorphic_downcast<PermuteLayerNode *>(node));
         case NodeType::PoolingLayer:
-            return detail::create_pooling_layer<CLPoolingLayer, CLTargetInfo>(*polymorphic_downcast<PoolingLayerNode *>(node));
+            return detail::create_pooling_layer<CLPoolingLayer, CLTargetInfo>(
+                *polymorphic_downcast<PoolingLayerNode *>(node));
         case NodeType::PReluLayer:
-            return detail::create_prelu_layer<CLPReluLayer, CLTargetInfo>(*polymorphic_downcast<PReluLayerNode *>(node));
+            return detail::create_prelu_layer<CLPReluLayer, CLTargetInfo>(
+                *polymorphic_downcast<PReluLayerNode *>(node));
         case NodeType::PrintLayer:
             return detail::create_print_layer<CLTargetInfo>(*polymorphic_downcast<PrintLayerNode *>(node));
         case NodeType::PriorBoxLayer:
-            return detail::create_priorbox_layer<CLPriorBoxLayer, CLTargetInfo>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
+            return detail::create_priorbox_layer<CLPriorBoxLayer, CLTargetInfo>(
+                *polymorphic_downcast<PriorBoxLayerNode *>(node));
         case NodeType::QuantizationLayer:
-            return detail::create_quantization_layer<CLQuantizationLayer, CLTargetInfo>(*polymorphic_downcast<QuantizationLayerNode *>(node));
+            return detail::create_quantization_layer<CLQuantizationLayer, CLTargetInfo>(
+                *polymorphic_downcast<QuantizationLayerNode *>(node));
         case NodeType::ReductionOperationLayer:
-            return detail::create_reduction_operation_layer<CLReductionOperation, CLTargetInfo>(*polymorphic_downcast<ReductionLayerNode *>(node), ctx);
+            return detail::create_reduction_operation_layer<CLReductionOperation, CLTargetInfo>(
+                *polymorphic_downcast<ReductionLayerNode *>(node), ctx);
         case NodeType::ReorgLayer:
-            return detail::create_reorg_layer<CLReorgLayer, CLTargetInfo>(*polymorphic_downcast<ReorgLayerNode *>(node));
+            return detail::create_reorg_layer<CLReorgLayer, CLTargetInfo>(
+                *polymorphic_downcast<ReorgLayerNode *>(node));
         case NodeType::ReshapeLayer:
-            return detail::create_reshape_layer<CLReshapeLayer, CLTargetInfo>(*polymorphic_downcast<ReshapeLayerNode *>(node));
+            return detail::create_reshape_layer<CLReshapeLayer, CLTargetInfo>(
+                *polymorphic_downcast<ReshapeLayerNode *>(node));
         case NodeType::ResizeLayer:
             return detail::create_resize_layer<CLScale, CLTargetInfo>(*polymorphic_downcast<ResizeLayerNode *>(node));
         case NodeType::ROIAlignLayer:
-            return detail::create_roi_align_layer<CLROIAlignLayer, CLTargetInfo>(*polymorphic_downcast<ROIAlignLayerNode *>(node));
+            return detail::create_roi_align_layer<CLROIAlignLayer, CLTargetInfo>(
+                *polymorphic_downcast<ROIAlignLayerNode *>(node));
         case NodeType::SliceLayer:
             return detail::create_slice_layer<CLSlice, CLTargetInfo>(*polymorphic_downcast<SliceLayerNode *>(node));
         case NodeType::SoftmaxLayer:
-            return detail::create_softmax_layer<CLSoftmaxLayer, CLTargetInfo>(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+            return detail::create_softmax_layer<CLSoftmaxLayer, CLTargetInfo>(
+                *polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
         case NodeType::StackLayer:
-            return detail::create_stack_layer<CLStackLayer, CLTargetInfo>(*polymorphic_downcast<StackLayerNode *>(node));
+            return detail::create_stack_layer<CLStackLayer, CLTargetInfo>(
+                *polymorphic_downcast<StackLayerNode *>(node));
         case NodeType::StridedSliceLayer:
-            return detail::create_strided_slice_layer<CLStridedSlice, CLTargetInfo>(*polymorphic_downcast<StridedSliceLayerNode *>(node));
+            return detail::create_strided_slice_layer<CLStridedSlice, CLTargetInfo>(
+                *polymorphic_downcast<StridedSliceLayerNode *>(node));
         default:
             return nullptr;
     }
diff --git a/src/graph/backends/CL/CLNodeValidator.cpp b/src/graph/backends/CL/CLNodeValidator.cpp
index 8fd8c14..510eda7 100644
--- a/src/graph/backends/CL/CLNodeValidator.cpp
+++ b/src/graph/backends/CL/CLNodeValidator.cpp
@@ -25,7 +25,6 @@
 
 #include "arm_compute/graph/backends/ValidateHelpers.h"
 #include "arm_compute/graph/nodes/Nodes.h"
-
 #include "arm_compute/runtime/CL/CLFunctions.h"
 #include "arm_compute/runtime/CPP/CPPFunctions.h"
 
@@ -57,41 +56,51 @@
 
 Status CLNodeValidator::validate(INode *node)
 {
-    if(node == nullptr)
+    if (node == nullptr)
     {
         return Status{};
     }
 
     NodeType type = node->type();
-    switch(type)
+    switch (type)
     {
         case NodeType::ArgMinMaxLayer:
-            return detail::validate_arg_min_max_layer<CLArgMinMaxLayer>(*polymorphic_downcast<ArgMinMaxLayerNode *>(node));
+            return detail::validate_arg_min_max_layer<CLArgMinMaxLayer>(
+                *polymorphic_downcast<ArgMinMaxLayerNode *>(node));
         case NodeType::BoundingBoxTransformLayer:
-            return detail::validate_bounding_box_transform_layer<CLBoundingBoxTransform>(*polymorphic_downcast<BoundingBoxTransformLayerNode *>(node));
+            return detail::validate_bounding_box_transform_layer<CLBoundingBoxTransform>(
+                *polymorphic_downcast<BoundingBoxTransformLayerNode *>(node));
         case NodeType::ChannelShuffleLayer:
-            return detail::validate_channel_shuffle_layer<CLChannelShuffleLayer>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
+            return detail::validate_channel_shuffle_layer<CLChannelShuffleLayer>(
+                *polymorphic_downcast<ChannelShuffleLayerNode *>(node));
         case NodeType::ConvolutionLayer:
-            return detail::validate_convolution_layer<CLConvolutionLayer,
-                   CLDirectConvolutionLayer,
-                   CLGEMMConvolutionLayer,
-                   CLWinogradConvolutionLayer>(*polymorphic_downcast<ConvolutionLayerNode *>(node));
+            return detail::validate_convolution_layer<CLConvolutionLayer, CLDirectConvolutionLayer,
+                                                      CLGEMMConvolutionLayer, CLWinogradConvolutionLayer>(
+                *polymorphic_downcast<ConvolutionLayerNode *>(node));
         case NodeType::DepthToSpaceLayer:
-            return detail::validate_depth_to_space_layer<CLDepthToSpaceLayer>(*polymorphic_downcast<DepthToSpaceLayerNode *>(node));
+            return detail::validate_depth_to_space_layer<CLDepthToSpaceLayer>(
+                *polymorphic_downcast<DepthToSpaceLayerNode *>(node));
         case NodeType::DepthwiseConvolutionLayer:
-            return detail::validate_depthwise_convolution_layer<CLDepthwiseConvolutionLayer>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+            return detail::validate_depthwise_convolution_layer<CLDepthwiseConvolutionLayer>(
+                *polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
         case NodeType::DequantizationLayer:
-            return detail::validate_dequantization_layer<CLDequantizationLayer>(*polymorphic_downcast<DequantizationLayerNode *>(node));
+            return detail::validate_dequantization_layer<CLDequantizationLayer>(
+                *polymorphic_downcast<DequantizationLayerNode *>(node));
         case NodeType::DetectionOutputLayer:
-            return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
+            return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(
+                *polymorphic_downcast<DetectionOutputLayerNode *>(node));
         case NodeType::DetectionPostProcessLayer:
-            return detail::validate_detection_post_process_layer<CPPDetectionPostProcessLayer>(*polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
+            return detail::validate_detection_post_process_layer<CPPDetectionPostProcessLayer>(
+                *polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
         case NodeType::GenerateProposalsLayer:
-            return detail::validate_generate_proposals_layer<CLGenerateProposalsLayer>(*polymorphic_downcast<GenerateProposalsLayerNode *>(node));
+            return detail::validate_generate_proposals_layer<CLGenerateProposalsLayer>(
+                *polymorphic_downcast<GenerateProposalsLayerNode *>(node));
         case NodeType::L2NormalizeLayer:
-            return detail::validate_l2_normalize_layer<CLL2NormalizeLayer>(*polymorphic_downcast<L2NormalizeLayerNode *>(node));
+            return detail::validate_l2_normalize_layer<CLL2NormalizeLayer>(
+                *polymorphic_downcast<L2NormalizeLayerNode *>(node));
         case NodeType::NormalizePlanarYUVLayer:
-            return detail::validate_normalize_planar_yuv_layer<CLNormalizePlanarYUVLayer>(*polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
+            return detail::validate_normalize_planar_yuv_layer<CLNormalizePlanarYUVLayer>(
+                *polymorphic_downcast<NormalizePlanarYUVLayerNode *>(node));
         case NodeType::PadLayer:
             return detail::validate_pad_layer<CLPadLayer>(*polymorphic_downcast<PadLayerNode *>(node));
         case NodeType::PermuteLayer:
@@ -101,9 +110,11 @@
         case NodeType::PriorBoxLayer:
             return detail::validate_priorbox_layer<CLPriorBoxLayer>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
         case NodeType::QuantizationLayer:
-            return detail::validate_quantization_layer<CLQuantizationLayer>(*polymorphic_downcast<QuantizationLayerNode *>(node));
+            return detail::validate_quantization_layer<CLQuantizationLayer>(
+                *polymorphic_downcast<QuantizationLayerNode *>(node));
         case NodeType::ReductionOperationLayer:
-            return detail::validate_reduction_operation_layer<CLReductionOperation>(*polymorphic_downcast<ReductionLayerNode *>(node));
+            return detail::validate_reduction_operation_layer<CLReductionOperation>(
+                *polymorphic_downcast<ReductionLayerNode *>(node));
         case NodeType::ReorgLayer:
             return detail::validate_reorg_layer<CLReorgLayer>(*polymorphic_downcast<ReorgLayerNode *>(node));
         case NodeType::ReshapeLayer:
@@ -113,11 +124,14 @@
         case NodeType::SliceLayer:
             return detail::validate_slice_layer<CLSlice>(*polymorphic_downcast<SliceLayerNode *>(node));
         case NodeType::StridedSliceLayer:
-            return detail::validate_strided_slice_layer<CLStridedSlice>(*polymorphic_downcast<StridedSliceLayerNode *>(node));
+            return detail::validate_strided_slice_layer<CLStridedSlice>(
+                *polymorphic_downcast<StridedSliceLayerNode *>(node));
         case NodeType::EltwiseLayer:
-            return detail::validate_eltwise_Layer<CLEltwiseLayerFunctions>(*polymorphic_downcast<EltwiseLayerNode *>(node));
+            return detail::validate_eltwise_Layer<CLEltwiseLayerFunctions>(
+                *polymorphic_downcast<EltwiseLayerNode *>(node));
         case NodeType::UnaryEltwiseLayer:
-            return detail::validate_unary_eltwise_layer<CLUnaryEltwiseLayerFunctions>(*polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
+            return detail::validate_unary_eltwise_layer<CLUnaryEltwiseLayerFunctions>(
+                *polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
         default:
             return Status{};
     }
diff --git a/src/graph/backends/CL/CLSubTensorHandle.cpp b/src/graph/backends/CL/CLSubTensorHandle.cpp
index b97d258..ccdc877 100644
--- a/src/graph/backends/CL/CLSubTensorHandle.cpp
+++ b/src/graph/backends/CL/CLSubTensorHandle.cpp
@@ -31,7 +31,10 @@
 {
 namespace backends
 {
-CLSubTensorHandle::CLSubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent)
+CLSubTensorHandle::CLSubTensorHandle(ITensorHandle     *parent_handle,
+                                     const TensorShape &shape,
+                                     const Coordinates &coords,
+                                     bool               extend_parent)
     : _sub_tensor(), _parent_handle(nullptr)
 {
     ARM_COMPUTE_ERROR_ON(!parent_handle);
@@ -98,4 +101,4 @@
 }
 } // namespace backends
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/CL/CLTensorHandle.cpp b/src/graph/backends/CL/CLTensorHandle.cpp
index a496c2c..1b69f9d 100644
--- a/src/graph/backends/CL/CLTensorHandle.cpp
+++ b/src/graph/backends/CL/CLTensorHandle.cpp
@@ -31,8 +31,7 @@
 {
 namespace backends
 {
-CLTensorHandle::CLTensorHandle(const ITensorInfo &info)
-    : _tensor()
+CLTensorHandle::CLTensorHandle(const ITensorInfo &info) : _tensor()
 {
     _tensor.allocator()->init(info);
 }
@@ -49,7 +48,7 @@
 
 void CLTensorHandle::manage(IMemoryGroup *mg)
 {
-    if(mg != nullptr)
+    if (mg != nullptr)
     {
         mg->manage(&_tensor);
     }
@@ -68,7 +67,7 @@
 void CLTensorHandle::release_if_unused()
 {
     // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
-    if(!_tensor.is_used())
+    if (!_tensor.is_used())
     {
         _tensor.allocator()->free();
     }
@@ -100,4 +99,4 @@
 }
 } // namespace backends
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/NEON/NEDeviceBackend.cpp b/src/graph/backends/NEON/NEDeviceBackend.cpp
index 1845653..fc7b309 100644
--- a/src/graph/backends/NEON/NEDeviceBackend.cpp
+++ b/src/graph/backends/NEON/NEDeviceBackend.cpp
@@ -23,18 +23,17 @@
  */
 #include "arm_compute/graph/backends/NEON/NEDeviceBackend.h"
 
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/GraphContext.h"
-#include "arm_compute/graph/INode.h"
-#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Tensor.h"
+#include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/graph/backends/BackendRegistrar.h"
 #include "arm_compute/graph/backends/NEON/NEFunctionFactory.h"
 #include "arm_compute/graph/backends/NEON/NENodeValidator.h"
 #include "arm_compute/graph/backends/NEON/NESubTensorHandle.h"
 #include "arm_compute/graph/backends/NEON/NETensorHandle.h"
-
-#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/GraphContext.h"
+#include "arm_compute/graph/INode.h"
+#include "arm_compute/graph/Logger.h"
+#include "arm_compute/graph/Tensor.h"
 #include "arm_compute/runtime/Allocator.h"
 #include "arm_compute/runtime/BlobLifetimeManager.h"
 #include "arm_compute/runtime/IWeightsManager.h"
@@ -53,8 +52,7 @@
 /** Register CPU backend */
 static detail::BackendRegistrar<NEDeviceBackend> NEDeviceBackend_registrar(Target::NEON);
 
-NEDeviceBackend::NEDeviceBackend()
-    : _allocator()
+NEDeviceBackend::NEDeviceBackend() : _allocator()
 {
 }
 
@@ -72,13 +70,13 @@
 void NEDeviceBackend::setup_backend_context(GraphContext &ctx)
 {
     // Set number of threads
-    if(ctx.config().num_threads >= 0)
+    if (ctx.config().num_threads >= 0)
     {
         Scheduler::get().set_num_threads(ctx.config().num_threads);
     }
 
     // Create function level memory manager
-    if(ctx.memory_management_ctx(Target::NEON) == nullptr)
+    if (ctx.memory_management_ctx(Target::NEON) == nullptr)
     {
         MemoryManagerContext mm_ctx;
         mm_ctx.target      = Target::NEON;
@@ -91,7 +89,7 @@
     }
 
     // Create function level weights manager
-    if(ctx.weights_management_ctx(Target::NEON) == nullptr)
+    if (ctx.weights_management_ctx(Target::NEON) == nullptr)
     {
         WeightsManagerContext wm_ctx;
         wm_ctx.target = Target::NEON;
@@ -124,9 +122,10 @@
     return std::make_unique<NETensorHandle>(info);
 }
 
-std::unique_ptr<ITensorHandle> NEDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
+std::unique_ptr<ITensorHandle>
+NEDeviceBackend::create_subtensor(ITensorHandle *parent, TensorShape shape, Coordinates coords, bool extend_parent)
 {
-    if(parent == nullptr)
+    if (parent == nullptr)
     {
         return nullptr;
     }
@@ -154,7 +153,7 @@
 std::shared_ptr<arm_compute::IMemoryManager> NEDeviceBackend::create_memory_manager(MemoryManagerAffinity affinity)
 {
     std::shared_ptr<ILifetimeManager> lifetime_mgr = nullptr;
-    if(affinity == MemoryManagerAffinity::Buffer)
+    if (affinity == MemoryManagerAffinity::Buffer)
     {
         lifetime_mgr = std::make_shared<BlobLifetimeManager>();
     }
diff --git a/src/graph/backends/NEON/NEFunctionFactory.cpp b/src/graph/backends/NEON/NEFunctionFactory.cpp
index d7ed5f9..fe15d4c 100644
--- a/src/graph/backends/NEON/NEFunctionFactory.cpp
+++ b/src/graph/backends/NEON/NEFunctionFactory.cpp
@@ -23,13 +23,13 @@
  */
 #include "arm_compute/graph/backends/NEON/NEFunctionFactory.h"
 
+#include "arm_compute/graph/backends/FunctionHelpers.h"
+#include "arm_compute/graph/backends/Utils.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/GraphContext.h"
 #include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/TypePrinter.h"
-#include "arm_compute/graph/backends/FunctionHelpers.h"
-#include "arm_compute/graph/backends/Utils.h"
 #include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/TypePrinter.h"
 #include "arm_compute/runtime/CPP/CPPFunctions.h"
 #include "arm_compute/runtime/NEON/NEFunctions.h"
 
@@ -88,7 +88,8 @@
 namespace detail
 {
 template <>
-std::unique_ptr<IFunction> create_normalization_layer<NENormalizationLayer, NETargetInfo>(NormalizationLayerNode &node, GraphContext &ctx)
+std::unique_ptr<IFunction> create_normalization_layer<NENormalizationLayer, NETargetInfo>(NormalizationLayerNode &node,
+                                                                                          GraphContext           &ctx)
 {
     validate_node<NETargetInfo>(node, 1 /* expected inputs */, 1 /* expected outputs */);
 
@@ -105,14 +106,10 @@
 
     // Log info
     ARM_COMPUTE_LOG_GRAPH_INFO("Instantiated "
-                               << node.name()
-                               << " Type: " << node.type()
-                               << " Target: " << NETargetInfo::TargetType
-                               << " Data Type: " << input->info()->data_type()
-                               << " Input shape: " << input->info()->tensor_shape()
-                               << " Output shape: " << output->info()->tensor_shape()
-                               << " Normalization info: " << norm_info.type()
-                               << std::endl);
+                               << node.name() << " Type: " << node.type() << " Target: " << NETargetInfo::TargetType
+                               << " Data Type: " << input->info()->data_type() << " Input shape: "
+                               << input->info()->tensor_shape() << " Output shape: " << output->info()->tensor_shape()
+                               << " Normalization info: " << norm_info.type() << std::endl);
 
     return func;
 }
@@ -120,84 +117,116 @@
 
 std::unique_ptr<IFunction> NEFunctionFactory::create(INode *node, GraphContext &ctx)
 {
-    if(node == nullptr)
+    if (node == nullptr)
     {
         return nullptr;
     }
 
     NodeType type = node->type();
-    switch(type)
+    switch (type)
     {
         case NodeType::ActivationLayer:
-            return detail::create_activation_layer<NEActivationLayer, NETargetInfo>(*polymorphic_downcast<ActivationLayerNode *>(node));
+            return detail::create_activation_layer<NEActivationLayer, NETargetInfo>(
+                *polymorphic_downcast<ActivationLayerNode *>(node));
         case NodeType::ArgMinMaxLayer:
-            return detail::create_arg_min_max_layer<NEArgMinMaxLayer, NETargetInfo>(*polymorphic_downcast<ArgMinMaxLayerNode *>(node));
+            return detail::create_arg_min_max_layer<NEArgMinMaxLayer, NETargetInfo>(
+                *polymorphic_downcast<ArgMinMaxLayerNode *>(node));
         case NodeType::BatchNormalizationLayer:
-            return detail::create_batch_normalization_layer<NEBatchNormalizationLayer, NETargetInfo>(*polymorphic_downcast<BatchNormalizationLayerNode *>(node));
+            return detail::create_batch_normalization_layer<NEBatchNormalizationLayer, NETargetInfo>(
+                *polymorphic_downcast<BatchNormalizationLayerNode *>(node));
         case NodeType::ChannelShuffleLayer:
-            return detail::create_channel_shuffle_layer<NEChannelShuffleLayer, NETargetInfo>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
+            return detail::create_channel_shuffle_layer<NEChannelShuffleLayer, NETargetInfo>(
+                *polymorphic_downcast<ChannelShuffleLayerNode *>(node));
         case NodeType::ConvolutionLayer:
-            return detail::create_convolution_layer<NEConvolutionLayerFunctions, NETargetInfo>(*polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
+            return detail::create_convolution_layer<NEConvolutionLayerFunctions, NETargetInfo>(
+                *polymorphic_downcast<ConvolutionLayerNode *>(node), ctx);
         case NodeType::DepthToSpaceLayer:
-            return detail::create_depth_to_space_layer<NEDepthToSpaceLayer, NETargetInfo>(*polymorphic_downcast<DepthToSpaceLayerNode *>(node));
+            return detail::create_depth_to_space_layer<NEDepthToSpaceLayer, NETargetInfo>(
+                *polymorphic_downcast<DepthToSpaceLayerNode *>(node));
         case NodeType::DeconvolutionLayer:
-            return detail::create_deconvolution_layer<NEDeconvolutionLayer, NETargetInfo>(*polymorphic_downcast<DeconvolutionLayerNode *>(node), ctx);
+            return detail::create_deconvolution_layer<NEDeconvolutionLayer, NETargetInfo>(
+                *polymorphic_downcast<DeconvolutionLayerNode *>(node), ctx);
         case NodeType::ConcatenateLayer:
-            return detail::create_concatenate_layer<NEConcatenateLayer, NETargetInfo>(*polymorphic_downcast<ConcatenateLayerNode *>(node));
+            return detail::create_concatenate_layer<NEConcatenateLayer, NETargetInfo>(
+                *polymorphic_downcast<ConcatenateLayerNode *>(node));
         case NodeType::DepthwiseConvolutionLayer:
-            return detail::create_depthwise_convolution_layer<NEDepthwiseConvolutionLayer, NETargetInfo>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+            return detail::create_depthwise_convolution_layer<NEDepthwiseConvolutionLayer, NETargetInfo>(
+                *polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
         case NodeType::DequantizationLayer:
-            return detail::create_dequantization_layer<NEDequantizationLayer, NETargetInfo>(*polymorphic_downcast<DequantizationLayerNode *>(node));
+            return detail::create_dequantization_layer<NEDequantizationLayer, NETargetInfo>(
+                *polymorphic_downcast<DequantizationLayerNode *>(node));
         case NodeType::DetectionOutputLayer:
-            return detail::create_detection_output_layer<CPPDetectionOutputLayer, NETargetInfo>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
+            return detail::create_detection_output_layer<CPPDetectionOutputLayer, NETargetInfo>(
+                *polymorphic_downcast<DetectionOutputLayerNode *>(node));
         case NodeType::DetectionPostProcessLayer:
-            return detail::create_detection_post_process_layer<NEDetectionPostProcessLayer, NETargetInfo>(*polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
+            return detail::create_detection_post_process_layer<NEDetectionPostProcessLayer, NETargetInfo>(
+                *polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
         case NodeType::EltwiseLayer:
-            return detail::create_eltwise_layer<NEEltwiseFunctions, NETargetInfo>(*polymorphic_downcast<EltwiseLayerNode *>(node));
+            return detail::create_eltwise_layer<NEEltwiseFunctions, NETargetInfo>(
+                *polymorphic_downcast<EltwiseLayerNode *>(node));
         case NodeType::UnaryEltwiseLayer:
-            return detail::create_unary_eltwise_layer<NEUnaryEltwiseFunctions, NETargetInfo>(*polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
+            return detail::create_unary_eltwise_layer<NEUnaryEltwiseFunctions, NETargetInfo>(
+                *polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
         case NodeType::FlattenLayer:
-            return detail::create_flatten_layer<NEFlattenLayer, NETargetInfo>(*polymorphic_downcast<FlattenLayerNode *>(node));
+            return detail::create_flatten_layer<NEFlattenLayer, NETargetInfo>(
+                *polymorphic_downcast<FlattenLayerNode *>(node));
         case NodeType::FullyConnectedLayer:
-            return detail::create_fully_connected_layer<NEFullyConnectedLayer, NETargetInfo>(*polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
+            return detail::create_fully_connected_layer<NEFullyConnectedLayer, NETargetInfo>(
+                *polymorphic_downcast<FullyConnectedLayerNode *>(node), ctx);
         case NodeType::FusedConvolutionBatchNormalizationLayer:
-            return detail::create_fused_convolution_batch_normalization_layer<NEFusedLayerTypes, NETargetInfo>(*polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx);
+            return detail::create_fused_convolution_batch_normalization_layer<NEFusedLayerTypes, NETargetInfo>(
+                *polymorphic_downcast<FusedConvolutionBatchNormalizationNode *>(node), ctx);
         case NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer:
-            return detail::create_fused_depthwise_convolution_batch_normalization_layer<NEFusedLayerTypes, NETargetInfo>(*polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx);
+            return detail::create_fused_depthwise_convolution_batch_normalization_layer<NEFusedLayerTypes,
+                                                                                        NETargetInfo>(
+                *polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node), ctx);
         case NodeType::L2NormalizeLayer:
-            return detail::create_l2_normalize_layer<NEL2NormalizeLayer, NETargetInfo>(*polymorphic_downcast<L2NormalizeLayerNode *>(node), ctx);
+            return detail::create_l2_normalize_layer<NEL2NormalizeLayer, NETargetInfo>(
+                *polymorphic_downcast<L2NormalizeLayerNode *>(node), ctx);
         case NodeType::NormalizationLayer:
-            return detail::create_normalization_layer<NENormalizationLayer, NETargetInfo>(*polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
+            return detail::create_normalization_layer<NENormalizationLayer, NETargetInfo>(
+                *polymorphic_downcast<NormalizationLayerNode *>(node), ctx);
         case NodeType::PadLayer:
             return detail::create_pad_layer<NEPadLayer, NETargetInfo>(*polymorphic_downcast<PadLayerNode *>(node));
         case NodeType::PermuteLayer:
-            return detail::create_permute_layer<NEPermute, NETargetInfo>(*polymorphic_downcast<PermuteLayerNode *>(node));
+            return detail::create_permute_layer<NEPermute, NETargetInfo>(
+                *polymorphic_downcast<PermuteLayerNode *>(node));
         case NodeType::PoolingLayer:
-            return detail::create_pooling_layer<NEPoolingLayer, NETargetInfo>(*polymorphic_downcast<PoolingLayerNode *>(node));
+            return detail::create_pooling_layer<NEPoolingLayer, NETargetInfo>(
+                *polymorphic_downcast<PoolingLayerNode *>(node));
         case NodeType::PReluLayer:
-            return detail::create_prelu_layer<NEPReluLayer, NETargetInfo>(*polymorphic_downcast<PReluLayerNode *>(node));
+            return detail::create_prelu_layer<NEPReluLayer, NETargetInfo>(
+                *polymorphic_downcast<PReluLayerNode *>(node));
         case NodeType::PrintLayer:
             return detail::create_print_layer<NETargetInfo>(*polymorphic_downcast<PrintLayerNode *>(node));
         case NodeType::PriorBoxLayer:
-            return detail::create_priorbox_layer<NEPriorBoxLayer, NETargetInfo>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
+            return detail::create_priorbox_layer<NEPriorBoxLayer, NETargetInfo>(
+                *polymorphic_downcast<PriorBoxLayerNode *>(node));
         case NodeType::QuantizationLayer:
-            return detail::create_quantization_layer<NEQuantizationLayer, NETargetInfo>(*polymorphic_downcast<QuantizationLayerNode *>(node));
+            return detail::create_quantization_layer<NEQuantizationLayer, NETargetInfo>(
+                *polymorphic_downcast<QuantizationLayerNode *>(node));
         case NodeType::ReductionOperationLayer:
-            return detail::create_reduction_operation_layer<NEReductionOperation, NETargetInfo>(*polymorphic_downcast<ReductionLayerNode *>(node), ctx);
+            return detail::create_reduction_operation_layer<NEReductionOperation, NETargetInfo>(
+                *polymorphic_downcast<ReductionLayerNode *>(node), ctx);
         case NodeType::ReorgLayer:
-            return detail::create_reorg_layer<NEReorgLayer, NETargetInfo>(*polymorphic_downcast<ReorgLayerNode *>(node));
+            return detail::create_reorg_layer<NEReorgLayer, NETargetInfo>(
+                *polymorphic_downcast<ReorgLayerNode *>(node));
         case NodeType::ReshapeLayer:
-            return detail::create_reshape_layer<NEReshapeLayer, NETargetInfo>(*polymorphic_downcast<ReshapeLayerNode *>(node));
+            return detail::create_reshape_layer<NEReshapeLayer, NETargetInfo>(
+                *polymorphic_downcast<ReshapeLayerNode *>(node));
         case NodeType::ResizeLayer:
             return detail::create_resize_layer<NEScale, NETargetInfo>(*polymorphic_downcast<ResizeLayerNode *>(node));
         case NodeType::SliceLayer:
             return detail::create_slice_layer<NESlice, NETargetInfo>(*polymorphic_downcast<SliceLayerNode *>(node));
         case NodeType::SoftmaxLayer:
-            return detail::create_softmax_layer<NESoftmaxLayer, NETargetInfo>(*polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
+            return detail::create_softmax_layer<NESoftmaxLayer, NETargetInfo>(
+                *polymorphic_downcast<SoftmaxLayerNode *>(node), ctx);
         case NodeType::StackLayer:
-            return detail::create_stack_layer<NEStackLayer, NETargetInfo>(*polymorphic_downcast<StackLayerNode *>(node));
+            return detail::create_stack_layer<NEStackLayer, NETargetInfo>(
+                *polymorphic_downcast<StackLayerNode *>(node));
         case NodeType::StridedSliceLayer:
-            return detail::create_strided_slice_layer<NEStridedSlice, NETargetInfo>(*polymorphic_downcast<StridedSliceLayerNode *>(node));
+            return detail::create_strided_slice_layer<NEStridedSlice, NETargetInfo>(
+                *polymorphic_downcast<StridedSliceLayerNode *>(node));
         default:
             return nullptr;
     }
diff --git a/src/graph/backends/NEON/NENodeValidator.cpp b/src/graph/backends/NEON/NENodeValidator.cpp
index a485e5d..a97806f 100644
--- a/src/graph/backends/NEON/NENodeValidator.cpp
+++ b/src/graph/backends/NEON/NENodeValidator.cpp
@@ -25,9 +25,9 @@
 
 #include "arm_compute/graph/backends/ValidateHelpers.h"
 #include "arm_compute/graph/nodes/Nodes.h"
-
 #include "arm_compute/runtime/CPP/CPPFunctions.h"
 #include "arm_compute/runtime/NEON/NEFunctions.h"
+
 #include "support/Cast.h"
 
 using namespace arm_compute::utils::cast;
@@ -56,41 +56,51 @@
 
 Status NENodeValidator::validate(INode *node)
 {
-    if(node == nullptr)
+    if (node == nullptr)
     {
         return Status{};
     }
 
     NodeType type = node->type();
-    switch(type)
+    switch (type)
     {
         case NodeType::ArgMinMaxLayer:
-            return detail::validate_arg_min_max_layer<NEArgMinMaxLayer>(*polymorphic_downcast<ArgMinMaxLayerNode *>(node));
+            return detail::validate_arg_min_max_layer<NEArgMinMaxLayer>(
+                *polymorphic_downcast<ArgMinMaxLayerNode *>(node));
         case NodeType::BoundingBoxTransformLayer:
-            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : BoundingBoxTransformLayer");
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR,
+                                            "Unsupported operation : BoundingBoxTransformLayer");
         case NodeType::ChannelShuffleLayer:
-            return detail::validate_channel_shuffle_layer<NEChannelShuffleLayer>(*polymorphic_downcast<ChannelShuffleLayerNode *>(node));
+            return detail::validate_channel_shuffle_layer<NEChannelShuffleLayer>(
+                *polymorphic_downcast<ChannelShuffleLayerNode *>(node));
         case NodeType::ConvolutionLayer:
-            return detail::validate_convolution_layer<NEConvolutionLayer,
-                   NEDirectConvolutionLayer,
-                   NEGEMMConvolutionLayer,
-                   NEWinogradConvolutionLayer>(*polymorphic_downcast<ConvolutionLayerNode *>(node));
+            return detail::validate_convolution_layer<NEConvolutionLayer, NEDirectConvolutionLayer,
+                                                      NEGEMMConvolutionLayer, NEWinogradConvolutionLayer>(
+                *polymorphic_downcast<ConvolutionLayerNode *>(node));
         case NodeType::DepthToSpaceLayer:
-            return detail::validate_depth_to_space_layer<NEDepthToSpaceLayer>(*polymorphic_downcast<DepthToSpaceLayerNode *>(node));
+            return detail::validate_depth_to_space_layer<NEDepthToSpaceLayer>(
+                *polymorphic_downcast<DepthToSpaceLayerNode *>(node));
         case NodeType::DepthwiseConvolutionLayer:
-            return detail::validate_depthwise_convolution_layer<NEDepthwiseConvolutionLayer>(*polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
+            return detail::validate_depthwise_convolution_layer<NEDepthwiseConvolutionLayer>(
+                *polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node));
         case NodeType::DequantizationLayer:
-            return detail::validate_dequantization_layer<NEDequantizationLayer>(*polymorphic_downcast<DequantizationLayerNode *>(node));
+            return detail::validate_dequantization_layer<NEDequantizationLayer>(
+                *polymorphic_downcast<DequantizationLayerNode *>(node));
         case NodeType::DetectionOutputLayer:
-            return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(*polymorphic_downcast<DetectionOutputLayerNode *>(node));
+            return detail::validate_detection_output_layer<CPPDetectionOutputLayer>(
+                *polymorphic_downcast<DetectionOutputLayerNode *>(node));
         case NodeType::DetectionPostProcessLayer:
-            return detail::validate_detection_post_process_layer<NEDetectionPostProcessLayer>(*polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
+            return detail::validate_detection_post_process_layer<NEDetectionPostProcessLayer>(
+                *polymorphic_downcast<DetectionPostProcessLayerNode *>(node));
         case NodeType::GenerateProposalsLayer:
-            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : GenerateProposalsLayer");
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR,
+                                            "Unsupported operation : GenerateProposalsLayer");
         case NodeType::L2NormalizeLayer:
-            return detail::validate_l2_normalize_layer<NEL2NormalizeLayer>(*polymorphic_downcast<L2NormalizeLayerNode *>(node));
+            return detail::validate_l2_normalize_layer<NEL2NormalizeLayer>(
+                *polymorphic_downcast<L2NormalizeLayerNode *>(node));
         case NodeType::NormalizePlanarYUVLayer:
-            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : NormalizePlanarYUVLayer");
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR,
+                                            "Unsupported operation : NormalizePlanarYUVLayer");
         case NodeType::PadLayer:
             return detail::validate_pad_layer<NEPadLayer>(*polymorphic_downcast<PadLayerNode *>(node));
         case NodeType::PermuteLayer:
@@ -100,23 +110,29 @@
         case NodeType::PriorBoxLayer:
             return detail::validate_priorbox_layer<NEPriorBoxLayer>(*polymorphic_downcast<PriorBoxLayerNode *>(node));
         case NodeType::QuantizationLayer:
-            return detail::validate_quantization_layer<NEQuantizationLayer>(*polymorphic_downcast<QuantizationLayerNode *>(node));
+            return detail::validate_quantization_layer<NEQuantizationLayer>(
+                *polymorphic_downcast<QuantizationLayerNode *>(node));
         case NodeType::ReductionOperationLayer:
-            return detail::validate_reduction_operation_layer<NEReductionOperation>(*polymorphic_downcast<ReductionLayerNode *>(node));
+            return detail::validate_reduction_operation_layer<NEReductionOperation>(
+                *polymorphic_downcast<ReductionLayerNode *>(node));
         case NodeType::ReorgLayer:
             return detail::validate_reorg_layer<NEReorgLayer>(*polymorphic_downcast<ReorgLayerNode *>(node));
         case NodeType::ReshapeLayer:
             return detail::validate_reshape_layer<NEReshapeLayer>(*polymorphic_downcast<ReshapeLayerNode *>(node));
         case NodeType::ROIAlignLayer:
-            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR, "Unsupported operation : ROIAlignLayer");
+            return ARM_COMPUTE_CREATE_ERROR(arm_compute::ErrorCode::RUNTIME_ERROR,
+                                            "Unsupported operation : ROIAlignLayer");
         case NodeType::SliceLayer:
             return detail::validate_slice_layer<NESlice>(*polymorphic_downcast<SliceLayerNode *>(node));
         case NodeType::StridedSliceLayer:
-            return detail::validate_strided_slice_layer<NEStridedSlice>(*polymorphic_downcast<StridedSliceLayerNode *>(node));
+            return detail::validate_strided_slice_layer<NEStridedSlice>(
+                *polymorphic_downcast<StridedSliceLayerNode *>(node));
         case NodeType::EltwiseLayer:
-            return detail::validate_eltwise_Layer<NEEltwiseLayerFunctions>(*polymorphic_downcast<EltwiseLayerNode *>(node));
+            return detail::validate_eltwise_Layer<NEEltwiseLayerFunctions>(
+                *polymorphic_downcast<EltwiseLayerNode *>(node));
         case NodeType::UnaryEltwiseLayer:
-            return detail::validate_unary_eltwise_layer<NEUnaryEltwiseLayerFunctions>(*polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
+            return detail::validate_unary_eltwise_layer<NEUnaryEltwiseLayerFunctions>(
+                *polymorphic_downcast<UnaryEltwiseLayerNode *>(node));
         default:
             return Status{};
     }
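
The rewrites above show how the new format handles a call that overflows the column limit: the line breaks immediately after the opening parenthesis and the whole argument drops to a continuation line, instead of splitting the expression elsewhere. A minimal compilable sketch of that wrapping rule; the helper name and argument are invented for illustration, assuming the roughly 120-column limit the diff suggests:

    #include <cstdio>
    #include <string>

    // Hypothetical validator; only the wrapping style is taken from the diff.
    static int validate_with_a_deliberately_long_descriptive_name(const std::string &description)
    {
        return description.empty() ? 1 : 0;
    }

    int main()
    {
        const std::string node_description = "ArgMinMaxLayer example";

        // The full call would overflow the limit, so it breaks right after '('
        // and the argument is indented on its own continuation line.
        const int status = validate_with_a_deliberately_long_descriptive_name(
            node_description);

        std::printf("status = %d\n", status);
        return status;
    }
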
diff --git a/src/graph/backends/NEON/NESubTensorHandle.cpp b/src/graph/backends/NEON/NESubTensorHandle.cpp
index 36f29d0..8964a00 100644
--- a/src/graph/backends/NEON/NESubTensorHandle.cpp
+++ b/src/graph/backends/NEON/NESubTensorHandle.cpp
@@ -29,7 +29,10 @@
 {
 namespace backends
 {
-NESubTensorHandle::NESubTensorHandle(ITensorHandle *parent_handle, const TensorShape &shape, const Coordinates &coords, bool extend_parent)
+NESubTensorHandle::NESubTensorHandle(ITensorHandle     *parent_handle,
+                                     const TensorShape &shape,
+                                     const Coordinates &coords,
+                                     bool               extend_parent)
     : _sub_tensor(), _parent_handle(nullptr)
 {
     ARM_COMPUTE_ERROR_ON(!parent_handle);
@@ -95,4 +98,4 @@
 }
 } // namespace backends
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/backends/NEON/NETensorHandle.cpp b/src/graph/backends/NEON/NETensorHandle.cpp
index 4393156..dabf670 100644
--- a/src/graph/backends/NEON/NETensorHandle.cpp
+++ b/src/graph/backends/NEON/NETensorHandle.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/graph/backends/NEON/NETensorHandle.h"
 
 #include "arm_compute/runtime/MemoryGroup.h"
+
 #include "support/Cast.h"
 
 namespace arm_compute
@@ -32,8 +33,7 @@
 {
 namespace backends
 {
-NETensorHandle::NETensorHandle(const ITensorInfo &info)
-    : _tensor()
+NETensorHandle::NETensorHandle(const ITensorInfo &info) : _tensor()
 {
     _tensor.allocator()->init(info);
 }
@@ -50,7 +50,7 @@
 
 void NETensorHandle::manage(IMemoryGroup *mg)
 {
-    if(mg != nullptr)
+    if (mg != nullptr)
     {
         mg->manage(&_tensor);
     }
@@ -68,7 +68,7 @@
 void NETensorHandle::release_if_unused()
 {
     // TODO (geopin01): Release tensor only if all sub-tensors are marked as not used
-    if(!_tensor.is_used())
+    if (!_tensor.is_used())
     {
         _tensor.allocator()->free();
     }
@@ -100,4 +100,4 @@
 }
 } // namespace backends
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
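
Two smaller rewrites recur throughout this file and are sketched below with hypothetical names (Counter, tick): control-flow keywords are now separated from their parenthesis by a space, and a constructor initializer list that fits within the column limit is joined onto the signature line:

    #include <cstdio>

    class Counter
    {
    public:
        // Short initializer list, so ": _count(start)" stays on the signature line.
        explicit Counter(int start) : _count(start)
        {
        }

        void tick(int step)
        {
            if (step != 0) // space between 'if' and '('
            {
                _count += step;
            }
        }

        int value() const
        {
            return _count;
        }

    private:
        int _count;
    };

    int main()
    {
        Counter c(0);
        for (int i = 0; i < 3; ++i) // space between 'for' and '('
        {
            c.tick(1);
        }
        std::printf("%d\n", c.value());
        return 0;
    }
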
diff --git a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
index b45f453..1e813dc 100644
--- a/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
+++ b/src/graph/detail/CrossLayerMemoryManagerHelpers.cpp
@@ -23,6 +23,8 @@
  */
 #include "arm_compute/graph/detail/CrossLayerMemoryManagerHelpers.h"
 
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/GraphContext.h"
 #include "arm_compute/graph/GraphManager.h"
@@ -30,9 +32,7 @@
 #include "arm_compute/graph/Tensor.h"
 #include "arm_compute/graph/Types.h"
 #include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/backends/BackendRegistry.h"
 
-#include "arm_compute/core/ITensor.h"
 #include "support/Cast.h"
 
 #include <algorithm>
@@ -78,28 +78,28 @@
  */
 std::set<ITensorHandle *> get_const_handles(const Graph &g)
 {
-    std::set<NodeType> const_node_types = { NodeType::Input, NodeType::Output, NodeType::Const };
+    std::set<NodeType> const_node_types = {NodeType::Input, NodeType::Output, NodeType::Const};
 
     std::set<ITensorHandle *> const_tensors;
 
     auto &nodes = g.nodes();
-    for(auto &node : nodes)
+    for (auto &node : nodes)
     {
         // If it's a const node:
-        if(node != nullptr && const_node_types.find(node->type()) != std::end(const_node_types))
+        if (node != nullptr && const_node_types.find(node->type()) != std::end(const_node_types))
         {
             // TODO (geopin01) : Create IO iterator wrappers
             // Add all its inputs / outputs to the list of constant handles
-            for(unsigned int i = 0; i < node->num_inputs(); ++i)
+            for (unsigned int i = 0; i < node->num_inputs(); ++i)
             {
-                if(node->input(i) != nullptr)
+                if (node->input(i) != nullptr)
                 {
                     const_tensors.insert(node->input(i)->handle()->parent_handle());
                 }
             }
-            for(unsigned int i = 0; i < node->num_outputs(); ++i)
+            for (unsigned int i = 0; i < node->num_outputs(); ++i)
             {
-                if(node->output(i) != nullptr)
+                if (node->output(i) != nullptr)
                 {
                     const_tensors.insert(node->output(i)->handle()->parent_handle());
                 }
@@ -118,9 +118,8 @@
  *
  * @return List of transition handles
  */
-TaskHandles get_transition_handles(GraphContext                    &ctx,
-                                   ExecutionTask                   &task,
-                                   const std::set<ITensorHandle *> &const_tensors)
+TaskHandles
+get_transition_handles(GraphContext &ctx, ExecutionTask &task, const std::set<ITensorHandle *> &const_tensors)
 {
     ARM_COMPUTE_ERROR_ON(task.node == nullptr || (task.task == nullptr && !is_utility_node(task.node)));
     INode &node = *task.node;
@@ -128,28 +127,30 @@
     TaskHandles transition_handles;
 
     // Add input handles
-    for(unsigned int i = 0; i < node.input_edges().size(); ++i)
+    for (unsigned int i = 0; i < node.input_edges().size(); ++i)
     {
         Edge *input_edge = node.input_edge(i);
         // If this input is the output of another node
-        if(input_edge != nullptr && input_edge->tensor() != nullptr && const_tensors.find(input_edge->tensor()->handle()->parent_handle()) == std::end(const_tensors))
+        if (input_edge != nullptr && input_edge->tensor() != nullptr &&
+            const_tensors.find(input_edge->tensor()->handle()->parent_handle()) == std::end(const_tensors))
         {
             // Then add it to the list of transition buffers
             ITensorHandle *tensor_handle = input_edge->tensor()->handle()->parent_handle();
-            IMemoryGroup *mm_group      = get_memory_group_from_handle(ctx, tensor_handle);
+            IMemoryGroup  *mm_group      = get_memory_group_from_handle(ctx, tensor_handle);
             transition_handles.input_handles.emplace_back(std::make_pair(tensor_handle, mm_group));
         }
     }
 
     // Add output handles
-    for(unsigned int i = 0; i < node.num_outputs(); ++i)
+    for (unsigned int i = 0; i < node.num_outputs(); ++i)
     {
         Tensor *output_tensor = node.output(i);
         // If this output is used as an input for another node
-        if(output_tensor != nullptr && const_tensors.find(output_tensor->handle()->parent_handle()) == std::end(const_tensors))
+        if (output_tensor != nullptr &&
+            const_tensors.find(output_tensor->handle()->parent_handle()) == std::end(const_tensors))
         {
             ITensorHandle *tensor_handle = output_tensor->handle()->parent_handle();
-            IMemoryGroup *mm_group      = get_memory_group_from_handle(ctx, tensor_handle);
+            IMemoryGroup  *mm_group      = get_memory_group_from_handle(ctx, tensor_handle);
             transition_handles.output_handles.emplace_back(std::make_pair(tensor_handle, mm_group));
         }
     }
@@ -164,11 +165,11 @@
  */
 void count_input_handles_per_target(const TaskHandles &task_handles, TargetHandleCounter &handle_counter)
 {
-    for(const auto &handle : task_handles.input_handles)
+    for (const auto &handle : task_handles.input_handles)
     {
         ITensorHandle *key            = handle.first;
         HandleCounter &target_counter = handle_counter[key->target()];
-        if(target_counter.find(key) == std::end(target_counter))
+        if (target_counter.find(key) == std::end(target_counter))
         {
             target_counter.emplace(std::make_pair(key, 1));
         }
@@ -192,12 +193,12 @@
     // Acquires the given handles and sets them as in flight if they aren't already
     auto acquire = [&](std::vector<std::pair<ITensorHandle *, IMemoryGroup *>> &handles)
     {
-        for(auto &handle : handles)
+        for (auto &handle : handles)
         {
             ITensorHandle *parent_handle = handle.first;
             ARM_COMPUTE_ERROR_ON(parent_handle == nullptr);
             // If the tensor is not already in flight:
-            if(tensors_in_flight.find(parent_handle) == std::end(tensors_in_flight))
+            if (tensors_in_flight.find(parent_handle) == std::end(tensors_in_flight))
             {
                 ARM_COMPUTE_ERROR_ON(hc.find(parent_handle) == std::end(hc));
                 // Then add it to the list of in flight tensors
@@ -208,20 +209,20 @@
         }
     };
 
-    for(auto &task_handle : tasks_handles)
+    for (auto &task_handle : tasks_handles)
     {
         // Marking all the input and output tensors of the task as in flight
         acquire(task_handle.input_handles);
         acquire(task_handle.output_handles);
 
         // Releasing the input tensors
-        for(auto &input_handle : task_handle.input_handles)
+        for (auto &input_handle : task_handle.input_handles)
         {
             ITensorHandle *ihandle = input_handle.first;
             ARM_COMPUTE_ERROR_ON(ihandle == nullptr);
             ARM_COMPUTE_ERROR_ON(tensors_in_flight.find(ihandle) == std::end(tensors_in_flight));
             --tensors_in_flight[ihandle];
-            if(tensors_in_flight[ihandle] <= 0)
+            if (tensors_in_flight[ihandle] <= 0)
             {
                 // Remove tensor for tensors in flight
                 tensors_in_flight.erase(ihandle);
@@ -242,7 +243,7 @@
     TargetHandleCounter      target_handle_count;
 
     // Count handles
-    for(auto &task : workload.tasks)
+    for (auto &task : workload.tasks)
     {
         // Populates IO handles
         tasks_handles.push_back(get_transition_handles(ctx, task, const_tensors));
@@ -252,12 +253,12 @@
     }
 
     // Setup memory managers
-    for(auto &hc : target_handle_count)
+    for (auto &hc : target_handle_count)
     {
         MemoryManagerContext *mm_ctx = ctx.memory_management_ctx(hc.first);
-        if(mm_ctx != nullptr)
+        if (mm_ctx != nullptr)
         {
-            if(mm_ctx->cross_mm != nullptr && mm_ctx->cross_group != nullptr)
+            if (mm_ctx->cross_mm != nullptr && mm_ctx->cross_group != nullptr)
             {
                 // Manage and allocate tensors
                 configure_handle_lifetime(tasks_handles, hc.second);
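
Three more conventions are visible in this file: braced initializer lists lose their inner padding ({NodeType::Input, ...} rather than { NodeType::Input, ... }), a wrapped condition keeps && at the end of the broken line, and the names of consecutive declarations are aligned into a column. A self-contained sketch under those assumptions; Handle, is_tracked and the literal values are invented:

    #include <set>
    #include <string>

    struct Handle
    {
        std::string name;
        int         id; // consecutive declarations: names aligned into a column
    };

    static bool is_tracked(const Handle *h, const std::set<int> &tracked)
    {
        // No padding inside the braced initializer list.
        const std::set<int> reserved = {0, 1};

        // A wrapped condition keeps '&&' at the end of the broken line.
        if (h != nullptr && tracked.find(h->id) != tracked.end() &&
            reserved.find(h->id) == reserved.end())
        {
            return true;
        }
        return false;
    }

    int main()
    {
        Handle        h       = {"transition", 7};
        std::set<int> tracked = {7, 9};
        return is_tracked(&h, tracked) ? 0 : 1;
    }
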
diff --git a/src/graph/detail/ExecutionHelpers.cpp b/src/graph/detail/ExecutionHelpers.cpp
index ac800df..870d24a 100644
--- a/src/graph/detail/ExecutionHelpers.cpp
+++ b/src/graph/detail/ExecutionHelpers.cpp
@@ -23,12 +23,12 @@
  */
 #include "arm_compute/graph/detail/ExecutionHelpers.h"
 
+#include "arm_compute/graph/backends/BackendRegistry.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/GraphContext.h"
 #include "arm_compute/graph/GraphManager.h"
 #include "arm_compute/graph/Tensor.h"
 #include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/backends/BackendRegistry.h"
 
 namespace arm_compute
 {
@@ -41,9 +41,9 @@
     auto &nodes = g.nodes();
 
     // Create tasks
-    for(auto &node : nodes)
+    for (auto &node : nodes)
     {
-        if(node != nullptr)
+        if (node != nullptr)
         {
             Target                    assigned_target = node->assigned_target();
             backends::IDeviceBackend &backend         = backends::BackendRegistry::get().get_backend(assigned_target);
@@ -57,9 +57,9 @@
 {
     auto &tensors = g.tensors();
 
-    for(auto &tensor : tensors)
+    for (auto &tensor : tensors)
     {
-        if(tensor && tensor->handle() == nullptr)
+        if (tensor && tensor->handle() == nullptr)
         {
             Target                         target  = tensor->desc().target;
             backends::IDeviceBackend      &backend = backends::BackendRegistry::get().get_backend(target);
@@ -72,10 +72,10 @@
 
 void allocate_all_input_tensors(INode &node)
 {
-    for(unsigned int i = 0; i < node.num_inputs(); ++i)
+    for (unsigned int i = 0; i < node.num_inputs(); ++i)
     {
         Tensor *tensor = node.input(i);
-        if(tensor != nullptr && !tensor->bound_edges().empty())
+        if (tensor != nullptr && !tensor->bound_edges().empty())
         {
             ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
             tensor->handle()->allocate();
@@ -85,10 +85,10 @@
 
 void allocate_all_output_tensors(INode &node)
 {
-    for(unsigned int i = 0; i < node.num_outputs(); ++i)
+    for (unsigned int i = 0; i < node.num_outputs(); ++i)
     {
         Tensor *tensor = node.output(i);
-        if(tensor != nullptr && !tensor->bound_edges().empty())
+        if (tensor != nullptr && !tensor->bound_edges().empty())
         {
             ARM_COMPUTE_ERROR_ON_MSG(!tensor->handle(), "Tensor handle is not configured!");
             tensor->handle()->allocate();
@@ -98,11 +98,11 @@
 
 void allocate_const_tensors(Graph &g)
 {
-    for(auto &node : g.nodes())
+    for (auto &node : g.nodes())
     {
-        if(node != nullptr)
+        if (node != nullptr)
         {
-            switch(node->type())
+            switch (node->type())
             {
                 case NodeType::Const:
                 case NodeType::Input:
@@ -121,9 +121,10 @@
 {
     auto &tensors = g.tensors();
 
-    for(auto &tensor : tensors)
+    for (auto &tensor : tensors)
     {
-        if(tensor && !tensor->bound_edges().empty() && tensor->handle() != nullptr && tensor->handle()->tensor().info()->is_resizable() && tensor->handle()->tensor().is_used())
+        if (tensor && !tensor->bound_edges().empty() && tensor->handle() != nullptr &&
+            tensor->handle()->tensor().info()->is_resizable() && tensor->handle()->tensor().is_used())
         {
             tensor->handle()->allocate();
         }
@@ -140,15 +141,15 @@
     workload.tasks.reserve(node_order.size());
 
     // Create tasks
-    for(auto &node_id : node_order)
+    for (auto &node_id : node_order)
     {
         auto node = g.node(node_id);
-        if(node != nullptr)
+        if (node != nullptr)
         {
             Target                     assigned_target = node->assigned_target();
-            backends::IDeviceBackend &backend         = backends::BackendRegistry::get().get_backend(assigned_target);
+            backends::IDeviceBackend  &backend         = backends::BackendRegistry::get().get_backend(assigned_target);
             std::unique_ptr<IFunction> func            = backend.configure_node(*node, ctx);
-            if(func != nullptr || is_utility_node(node))
+            if (func != nullptr || is_utility_node(node))
             {
                 workload.tasks.emplace_back(ExecutionTask(std::move(func), node));
             }
@@ -156,14 +157,14 @@
     }
 
     // Add inputs and outputs
-    for(auto &node : g.nodes())
+    for (auto &node : g.nodes())
     {
-        if(node != nullptr && node->type() == NodeType::Input)
+        if (node != nullptr && node->type() == NodeType::Input)
         {
             workload.inputs.push_back(node->output(0));
         }
 
-        if(node != nullptr && node->type() == NodeType::Output)
+        if (node != nullptr && node->type() == NodeType::Output)
         {
             workload.outputs.push_back(node->input(0));
             continue;
@@ -175,9 +176,9 @@
 
 void release_unused_tensors(Graph &g)
 {
-    for(auto &tensor : g.tensors())
+    for (auto &tensor : g.tensors())
     {
-        if(tensor != nullptr && tensor->handle() != nullptr)
+        if (tensor != nullptr && tensor->handle() != nullptr)
         {
             tensor->handle()->release_if_unused();
         }
@@ -194,11 +195,11 @@
 {
     auto &nodes = g.nodes();
 
-    for(auto &node : nodes)
+    for (auto &node : nodes)
     {
-        if(node != nullptr && node->type() == NodeType::Const && node->num_outputs())
+        if (node != nullptr && node->type() == NodeType::Const && node->num_outputs())
         {
-            if(!node->output(0)->bound_edges().empty())
+            if (!node->output(0)->bound_edges().empty())
             {
                 call_tensor_accessor(node->output(0));
             }
@@ -209,18 +210,19 @@
 bool call_all_input_node_accessors(ExecutionWorkload &workload)
 {
     bool is_valid = true;
-    std::for_each(std::begin(workload.inputs), std::end(workload.inputs), [&](Tensor * input_tensor)
-    {
-        bool valid_input = (input_tensor != nullptr) && input_tensor->call_accessor();
-        is_valid         = is_valid && valid_input;
-    });
+    std::for_each(std::begin(workload.inputs), std::end(workload.inputs),
+                  [&](Tensor *input_tensor)
+                  {
+                      bool valid_input = (input_tensor != nullptr) && input_tensor->call_accessor();
+                      is_valid         = is_valid && valid_input;
+                  });
     return is_valid;
 }
 
 void prepare_all_tasks(ExecutionWorkload &workload)
 {
     ARM_COMPUTE_ERROR_ON(workload.graph == nullptr);
-    for(auto &task : workload.tasks)
+    for (auto &task : workload.tasks)
     {
         task.prepare();
         release_unused_tensors(*workload.graph);
@@ -232,24 +234,24 @@
     ARM_COMPUTE_ERROR_ON(workload.ctx == nullptr);
 
     // Acquire memory for the transition buffers
-    for(auto &mm_ctx : workload.ctx->memory_managers())
+    for (auto &mm_ctx : workload.ctx->memory_managers())
     {
-        if(mm_ctx.second.cross_group != nullptr)
+        if (mm_ctx.second.cross_group != nullptr)
         {
             mm_ctx.second.cross_group->acquire();
         }
     }
 
     // Execute tasks
-    for(auto &task : workload.tasks)
+    for (auto &task : workload.tasks)
     {
         task();
     }
 
     // Release memory for the transition buffers
-    for(auto &mm_ctx : workload.ctx->memory_managers())
+    for (auto &mm_ctx : workload.ctx->memory_managers())
     {
-        if(mm_ctx.second.cross_group != nullptr)
+        if (mm_ctx.second.cross_group != nullptr)
         {
             mm_ctx.second.cross_group->release();
         }
@@ -259,11 +261,12 @@
 bool call_all_output_node_accessors(ExecutionWorkload &workload)
 {
     bool is_valid = true;
-    std::for_each(std::begin(workload.outputs), std::end(workload.outputs), [&](Tensor * output_tensor)
-    {
-        bool valid_output = (output_tensor != nullptr) && output_tensor->call_accessor();
-        is_valid          = is_valid && valid_output;
-    });
+    std::for_each(std::begin(workload.outputs), std::end(workload.outputs),
+                  [&](Tensor *output_tensor)
+                  {
+                      bool valid_output = (output_tensor != nullptr) && output_tensor->call_accessor();
+                      is_valid          = is_valid && valid_output;
+                  });
 
     sync_backends();
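
The std::for_each rewrites in this file show the new lambda placement: once the call no longer fits on one line, the lambda starts on its own line and its braces open on fresh lines, so the body reads like a regular compound statement. A compilable sketch with invented names (values, all_ok):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int> values = {1, 2, 3};
        bool             all_ok = true;

        // The lambda argument moves to its own line, with its braces on fresh lines.
        std::for_each(values.begin(), values.end(),
                      [&](int v)
                      {
                          bool valid = (v > 0);
                          all_ok     = all_ok && valid;
                      });

        std::printf("%s\n", all_ok ? "ok" : "not ok");
        return 0;
    }
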
 
diff --git a/src/graph/frontend/Stream.cpp b/src/graph/frontend/Stream.cpp
index 44c8400..383a6dc 100644
--- a/src/graph/frontend/Stream.cpp
+++ b/src/graph/frontend/Stream.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/graph/frontend/Stream.h"
 
-#include "arm_compute/graph/Utils.h"
 #include "arm_compute/graph/frontend/ILayer.h"
+#include "arm_compute/graph/Utils.h"
 
 namespace arm_compute
 {
@@ -32,8 +32,7 @@
 {
 namespace frontend
 {
-Stream::Stream(size_t id, std::string name)
-    : _ctx(), _manager(), _g(id, std::move(name))
+Stream::Stream(size_t id, std::string name) : _ctx(), _manager(), _g(id, std::move(name))
 {
 }
 
diff --git a/src/graph/frontend/SubStream.cpp b/src/graph/frontend/SubStream.cpp
index 4b42207..8596aaa 100644
--- a/src/graph/frontend/SubStream.cpp
+++ b/src/graph/frontend/SubStream.cpp
@@ -23,8 +23,8 @@
  */
 #include "arm_compute/graph/frontend/SubStream.h"
 
-#include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/frontend/ILayer.h"
+#include "arm_compute/graph/Graph.h"
 
 namespace arm_compute
 {
@@ -32,8 +32,7 @@
 {
 namespace frontend
 {
-SubStream::SubStream(IStream &s)
-    : _s(s)
+SubStream::SubStream(IStream &s) : _s(s)
 {
     _hints     = s.hints();
     _tail_node = s.tail_node();
diff --git a/src/graph/mutators/DepthConcatSubTensorMutator.cpp b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
index 963b948..1b7ee3c 100644
--- a/src/graph/mutators/DepthConcatSubTensorMutator.cpp
+++ b/src/graph/mutators/DepthConcatSubTensorMutator.cpp
@@ -23,12 +23,12 @@
  */
 #include "arm_compute/graph/mutators/DepthConcatSubTensorMutator.h"
 
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Utils.h"
 #include "arm_compute/graph/algorithms/TopologicalSort.h"
 #include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Logger.h"
 #include "arm_compute/graph/nodes/ConcatenateLayerNode.h"
+#include "arm_compute/graph/Utils.h"
 
 #include "support/Cast.h"
 #include "support/Iterable.h"
@@ -50,7 +50,7 @@
 void DepthConcatSubTensorMutator::mutate(Graph &g)
 {
     // Early exit if no Concatenation layers exist in graph
-    if(g.nodes(NodeType::ConcatenateLayer).empty())
+    if (g.nodes(NodeType::ConcatenateLayer).empty())
     {
         return;
     }
@@ -59,43 +59,48 @@
     std::vector<NodeID> topological_sorted_node_ids = dfs(g);
 
     // Should be in reverse order of execution
-    for(auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids))
+    for (auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids))
     {
         INode *node = g.node(node_id);
-        if(node != nullptr && node->type() == NodeType::ConcatenateLayer && node->output(0) != nullptr)
+        if (node != nullptr && node->type() == NodeType::ConcatenateLayer && node->output(0) != nullptr)
         {
             // Get output tensor
             auto output_tensor = node->output(0);
 
             // Check concatenation axis (Sub-tensor optimization is supported for concatenation axis >=2)
             auto *concat_node = arm_compute::utils::cast::polymorphic_downcast<ConcatenateLayerNode *>(node);
-            if(output_tensor == nullptr || get_dimension_idx(output_tensor->desc().layout, concat_node->concatenation_axis()) < 2)
+            if (output_tensor == nullptr ||
+                get_dimension_idx(output_tensor->desc().layout, concat_node->concatenation_axis()) < 2)
             {
                 continue;
             }
 
             // Check that all tensors have the same target, valid inputs and the same quantization info
-            bool is_valid = std::all_of(node->input_edges().cbegin(), node->input_edges().cend(),
-                                        [&](const EdgeID & eid)
-            {
-                return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) && (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target)
-                       && (g.edge(eid)->tensor()->desc().quant_info == output_tensor->desc().quant_info);
-            });
+            bool is_valid =
+                std::all_of(node->input_edges().cbegin(), node->input_edges().cend(),
+                            [&](const EdgeID &eid)
+                            {
+                                return (g.edge(eid) != nullptr) && (g.edge(eid)->tensor() != nullptr) &&
+                                       (g.edge(eid)->tensor()->desc().target == output_tensor->desc().target) &&
+                                       (g.edge(eid)->tensor()->desc().quant_info == output_tensor->desc().quant_info);
+                            });
 
             // Create subtensors
-            if(is_valid && is_target_supported(output_tensor->desc().target))
+            if (is_valid && is_target_supported(output_tensor->desc().target))
             {
                 ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : "
                                               << node->id() << " and name : " << node->name() << std::endl);
                 // Create sub-tensor handles
                 unsigned depth = 0;
-                for(unsigned int i = 0; i < node->input_edges().size(); ++i)
+                for (unsigned int i = 0; i < node->input_edges().size(); ++i)
                 {
                     auto       input_tensor = node->input(i);
                     const auto input_shape  = input_tensor->desc().shape;
 
-                    backends::IDeviceBackend      &backend = backends::BackendRegistry::get().get_backend(input_tensor->desc().target);
-                    std::unique_ptr<ITensorHandle> handle  = backend.create_subtensor(output_tensor->handle(), input_shape, Coordinates(0, 0, depth), false);
+                    backends::IDeviceBackend &backend =
+                        backends::BackendRegistry::get().get_backend(input_tensor->desc().target);
+                    std::unique_ptr<ITensorHandle> handle =
+                        backend.create_subtensor(output_tensor->handle(), input_shape, Coordinates(0, 0, depth), false);
                     input_tensor->set_handle(std::move(handle));
 
                     depth += input_shape.z();
diff --git a/src/graph/mutators/GroupedConvolutionMutator.cpp b/src/graph/mutators/GroupedConvolutionMutator.cpp
index b7c551c..31efba6 100644
--- a/src/graph/mutators/GroupedConvolutionMutator.cpp
+++ b/src/graph/mutators/GroupedConvolutionMutator.cpp
@@ -23,15 +23,14 @@
  */
 #include "arm_compute/graph/mutators/GroupedConvolutionMutator.h"
 
+#include "arm_compute/graph/backends/BackendRegistry.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/GraphBuilder.h"
 #include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/backends/BackendRegistry.h"
 #include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/Utils.h"
 
 #include "support/Cast.h"
-
 #include "support/StringSupport.h"
 
 #include <set>
@@ -42,43 +41,51 @@
 {
 namespace
 {
-NodeID create_grouped_convolution(Graph &g, const NodeParams &params, NodeIdxPair input, NodeID weights, NodeID bias,
-                                  PadStrideInfo conv_info, ConvolutionMethod method, ActivationLayerInfo fused_act, FastMathHint fast_math_hint, unsigned int num_groups)
+NodeID create_grouped_convolution(Graph              &g,
+                                  const NodeParams   &params,
+                                  NodeIdxPair         input,
+                                  NodeID              weights,
+                                  NodeID              bias,
+                                  PadStrideInfo       conv_info,
+                                  ConvolutionMethod   method,
+                                  ActivationLayerInfo fused_act,
+                                  FastMathHint        fast_math_hint,
+                                  unsigned int        num_groups)
 {
     bool has_bias = (bias != EmptyNodeID);
 
     // Split input
     const TensorDescriptor input_tensor_desc = get_tensor_descriptor(g, g.node(input.node_id)->outputs()[0]);
-    const unsigned int     input_idx         = get_dimension_idx(input_tensor_desc.layout, DataLayoutDimension::CHANNEL);
-    NodeID                 input_split       = GraphBuilder::add_split_node(g, params, input, num_groups, input_idx);
+    const unsigned int     input_idx   = get_dimension_idx(input_tensor_desc.layout, DataLayoutDimension::CHANNEL);
+    NodeID                 input_split = GraphBuilder::add_split_node(g, params, input, num_groups, input_idx);
 
     // Split weights
     const TensorDescriptor weights_tensor_desc = get_tensor_descriptor(g, g.node(weights)->outputs()[0]);
-    const unsigned int     batch_idx           = get_dimension_idx(weights_tensor_desc.layout, DataLayoutDimension::BATCHES);
-    NodeID                 weights_split       = GraphBuilder::add_split_node(g, params, { weights, 0 }, num_groups, batch_idx);
+    const unsigned int     batch_idx     = get_dimension_idx(weights_tensor_desc.layout, DataLayoutDimension::BATCHES);
+    NodeID                 weights_split = GraphBuilder::add_split_node(g, params, {weights, 0}, num_groups, batch_idx);
 
     // Split bias
     NodeID bias_split = EmptyNodeID;
-    if(has_bias)
+    if (has_bias)
     {
         // Split bias
-        bias_split = GraphBuilder::add_split_node(g, params, { bias, 0 }, num_groups, 0);
+        bias_split = GraphBuilder::add_split_node(g, params, {bias, 0}, num_groups, 0);
     }
 
     std::vector<NodeIdxPair> convolution_outputs;
-    for(unsigned int i = 0; i < num_groups; ++i)
+    for (unsigned int i = 0; i < num_groups; ++i)
     {
         NodeParams group_params = params;
         NodeID     conv_nid     = g.add_node<ConvolutionLayerNode>(conv_info, 1, method, fast_math_hint);
         g.add_connection(input_split, i, conv_nid, 0);
         g.add_connection(weights_split, i, conv_nid, 1);
-        if(has_bias)
+        if (has_bias)
         {
             g.add_connection(bias_split, i, conv_nid, 2);
         }
 
         // Add group name
-        if(!group_params.name.empty())
+        if (!group_params.name.empty())
         {
             group_params.name.append("_g" + arm_compute::support::cpp11::to_string(i));
         }
@@ -92,7 +99,7 @@
         auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node);
         conv_node->set_fused_activation(fused_act);
 
-        convolution_outputs.push_back({ conv_nid, 0 });
+        convolution_outputs.push_back({conv_nid, 0});
     }
 
     // Depth concatenate output
@@ -113,7 +120,7 @@
 void GroupedConvolutionMutator::mutate(Graph &g)
 {
     // Early exit if no Convolution layers exist in graph
-    if(g.nodes(NodeType::ConvolutionLayer).empty())
+    if (g.nodes(NodeType::ConvolutionLayer).empty())
     {
         return;
     }
@@ -122,17 +129,18 @@
     size_t total_nodes = g.nodes().size();
 
     // Iterate over convolution nodes
-    for(unsigned int i = 0; i < total_nodes; ++i)
+    for (unsigned int i = 0; i < total_nodes; ++i)
     {
         INode *node = g.node(i);
-        if(node != nullptr && node->type() == NodeType::ConvolutionLayer && arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node)->num_groups() != 1)
+        if (node != nullptr && node->type() == NodeType::ConvolutionLayer &&
+            arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node)->num_groups() != 1)
         {
             // Validate node
             backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(node->assigned_target());
             Status                    status  = backend.validate_node(*node);
 
             // If grouped convolution is not supported
-            if(!bool(status))
+            if (!bool(status))
             {
                 // Down-cast node
                 auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(node);
@@ -151,7 +159,8 @@
                 ARM_COMPUTE_ERROR_ON(conv_node->input_edge(0) == nullptr || conv_node->input_edge(1) == nullptr);
                 const NodeID input_id   = conv_node->input_edge(0)->producer()->id();
                 const NodeID weights_id = conv_node->input_edge(1)->producer()->id();
-                const NodeID bias_id    = (conv_node->input_edge(2) != nullptr) ? conv_node->input_edge(2)->producer()->id() : EmptyNodeID;
+                const NodeID bias_id =
+                    (conv_node->input_edge(2) != nullptr) ? conv_node->input_edge(2)->producer()->id() : EmptyNodeID;
 
                 // Get driving nodes
                 std::vector<NodeIdxPair> driving_nodes = get_driving_nodes(*node);
@@ -164,14 +173,15 @@
                 NodeID   latest_nid = g.nodes().size();
 
                 // Create grouped convolution node
-                NodeID grouped_conv_id = create_grouped_convolution(g, params, { input_id, 0 }, weights_id, bias_id,
-                                                                    conv_info, conv_method, fused_act_info, fast_math_hint, num_groups);
+                NodeID grouped_conv_id =
+                    create_grouped_convolution(g, params, {input_id, 0}, weights_id, bias_id, conv_info, conv_method,
+                                               fused_act_info, fast_math_hint, num_groups);
 
                 // Remove convolution node
                 g.remove_node(node->id());
 
                 // Update batch normalization node outputs
-                for(auto &driving_node : driving_nodes)
+                for (auto &driving_node : driving_nodes)
                 {
                     g.add_connection(grouped_conv_id, 0, driving_node.node_id, driving_node.index);
                 }
@@ -180,17 +190,16 @@
                 g.node(grouped_conv_id)->output(0)->set_accessor(std::move(node_accessor));
 
                 // Configure new tensors and nodes
-                std::for_each(g.tensors().begin() + latest_tid, g.tensors().end(), [](std::unique_ptr<Tensor> &t)
-                {
-                    configure_tensor(t.get());
-                });
-                std::for_each(g.nodes().begin() + latest_nid, g.nodes().end(), [&assigned_target](std::unique_ptr<INode> &n)
-                {
-                    if(n != nullptr)
-                    {
-                        n->set_assigned_target(assigned_target);
-                    }
-                });
+                std::for_each(g.tensors().begin() + latest_tid, g.tensors().end(),
+                              [](std::unique_ptr<Tensor> &t) { configure_tensor(t.get()); });
+                std::for_each(g.nodes().begin() + latest_nid, g.nodes().end(),
+                              [&assigned_target](std::unique_ptr<INode> &n)
+                              {
+                                  if (n != nullptr)
+                                  {
+                                      n->set_assigned_target(assigned_target);
+                                  }
+                              });
             }
         }
     }
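
create_grouped_convolution above shows the treatment of oversized declarations and initializations: a signature whose parameters no longer fit is broken with one parameter per line, types and names aligned, while an initialization that overflows breaks immediately after '=' so the whole right-hand side drops down a line. A hedged, self-contained sketch; make_group_name and its arguments are invented, again assuming a roughly 120-column limit:

    #include <cstdio>
    #include <string>

    // One parameter per line, with parameter types and names aligned.
    static std::string make_group_name(const std::string &base,
                                       unsigned int       group_index,
                                       unsigned int       num_groups,
                                       bool               append_total)
    {
        std::string name = base + "_g" + std::to_string(group_index);
        if (append_total)
        {
            name += "_of_" + std::to_string(num_groups);
        }
        return name;
    }

    int main()
    {
        // The initializer would overflow the column limit, so the line breaks
        // immediately after '=' and the right-hand side moves down as a whole.
        const std::string grouped_name =
            make_group_name("convolution_layer_with_a_rather_long_descriptive_name", 2, 4, true);

        std::printf("%s\n", grouped_name.c_str());
        return 0;
    }
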
diff --git a/src/graph/mutators/InPlaceOperationMutator.cpp b/src/graph/mutators/InPlaceOperationMutator.cpp
index d3ea940..a51dcc4 100644
--- a/src/graph/mutators/InPlaceOperationMutator.cpp
+++ b/src/graph/mutators/InPlaceOperationMutator.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/graph/Logger.h"
 #include "arm_compute/graph/nodes/DepthwiseConvolutionLayerNode.h"
 #include "arm_compute/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.h"
+
 #include "support/Cast.h"
 
 using namespace arm_compute::utils::cast;
@@ -48,7 +49,7 @@
     const auto input_tensor  = input_edge->tensor();
     const auto input_edge_id = input_edge->id();
 
-    if(parent_node == nullptr)
+    if (parent_node == nullptr)
     {
         return false;
     }
@@ -57,24 +58,23 @@
 
     // If the output is connected to only one edge, then computations can
     // be done in-place.
-    if(output_edges.size() == 1)
+    if (output_edges.size() == 1)
     {
         return true;
     }
 
-    return std::all_of(output_edges.begin(),
-                       output_edges.end(),
-                       [&](const EdgeID & edge_id)
-    {
-        // Skip check on current input edge
-        if(edge_id == input_edge_id)
-        {
-            return true;
-        }
+    return std::all_of(output_edges.begin(), output_edges.end(),
+                       [&](const EdgeID &edge_id)
+                       {
+                           // Skip check on current input edge
+                           if (edge_id == input_edge_id)
+                           {
+                               return true;
+                           }
 
-        auto edge = g.edge(edge_id);
-        return edge->tensor() != input_tensor;
-    });
+                           auto edge = g.edge(edge_id);
+                           return edge->tensor() != input_tensor;
+                       });
 }
 
 // If doing the calculation in-place, we need to use the new output and inherit the original output's accessor
@@ -109,12 +109,14 @@
     // Extract PadStrideInfo and depth multiplier
     PadStrideInfo conv_info{};
     unsigned int  depth_multiplier{};
-    if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer)
+    if (node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer)
     {
-        conv_info        = polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->convolution_info();
-        depth_multiplier = polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->depth_multiplier();
+        conv_info =
+            polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->convolution_info();
+        depth_multiplier =
+            polymorphic_downcast<FusedDepthwiseConvolutionBatchNormalizationNode *>(node.get())->depth_multiplier();
     }
-    else if(node->type() == NodeType::DepthwiseConvolutionLayer)
+    else if (node->type() == NodeType::DepthwiseConvolutionLayer)
     {
         conv_info        = polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node.get())->convolution_info();
         depth_multiplier = polymorphic_downcast<DepthwiseConvolutionLayerNode *>(node.get())->depth_multiplier();
@@ -126,7 +128,8 @@
     const auto out_shape = current_output_tensor->desc().shape;
     const auto qinfo_out = current_output_tensor->desc().quant_info;
 
-    bool input_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, input_shape, 0) && (qinfo_input == qinfo_out) && (input_tensor->accessor() == nullptr);
+    bool input_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, input_shape, 0) &&
+                              (qinfo_input == qinfo_out) && (input_tensor->accessor() == nullptr);
 
     // Specify conditions with which input can be in-placed
     input_can_in_place &= weight_layout == input_tensor->desc().layout && weight_layout == DataLayout::NHWC;
@@ -141,13 +144,14 @@
     input_can_in_place &= !conv_info.has_padding();
     // NOTE: Dilation should also be (1, 1). However currently dilation is not supported in the depthwise conv node
 
-    if(input_can_in_place)
+    if (input_can_in_place)
     {
         set_new_output_and_inherit_accessor(node, current_output_tensor, input_tensor);
     }
     else
     {
-        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n");
+        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor "
+                                      "or the quantization info are different.\n");
     }
 }
 
@@ -170,7 +174,7 @@
 
     const TensorShape out_shape = TensorShape::broadcast_shape(shape0, shape1);
     // Inputs are not broadcast compatible
-    if(out_shape.total_size() == 0)
+    if (out_shape.total_size() == 0)
     {
         return;
     }
@@ -181,22 +185,27 @@
     const auto qinfo_out = current_output_tensor->desc().quant_info;
 
     // Can do in place, if the input has the same shape, quantisation info and data type as the output, and the input doesn't have an accessor.
-    bool input0_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) && (qinfo0 == qinfo_out)
-                               && (input0_tensor->desc().data_type == current_output_tensor->desc().data_type) && (input0_tensor->accessor() == nullptr);
-    bool input1_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) && (qinfo1 == qinfo_out)
-                               && (input1_tensor->desc().data_type == current_output_tensor->desc().data_type) && (input1_tensor->accessor() == nullptr);
+    bool input0_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape0, 0) &&
+                               (qinfo0 == qinfo_out) &&
+                               (input0_tensor->desc().data_type == current_output_tensor->desc().data_type) &&
+                               (input0_tensor->accessor() == nullptr);
+    bool input1_can_in_place = !arm_compute::detail::have_different_dimensions(out_shape, shape1, 0) &&
+                               (qinfo1 == qinfo_out) &&
+                               (input1_tensor->desc().data_type == current_output_tensor->desc().data_type) &&
+                               (input1_tensor->accessor() == nullptr);
 
-    if(input0_can_in_place)
+    if (input0_can_in_place)
     {
         set_new_output_and_inherit_accessor(node, current_output_tensor, input0_tensor);
     }
-    else if(input1_can_in_place)
+    else if (input1_can_in_place)
     {
         set_new_output_and_inherit_accessor(node, current_output_tensor, input1_tensor);
     }
     else
     {
-        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n");
+        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor "
+                                      "or the quantization info are different.\n");
     }
 }
 } // namespace
@@ -213,33 +222,31 @@
 
 void InPlaceOperationMutator::mutate(Graph &g)
 {
-    std::set<NodeType> in_place_nodes =
-    {
-        NodeType::ActivationLayer,
-        NodeType::BatchNormalizationLayer,
-        NodeType::EltwiseLayer,
-        NodeType::UnaryEltwiseLayer,
-        NodeType::DepthwiseConvolutionLayer,
-        NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer,
-        NodeType::PrintLayer
-    };
+    std::set<NodeType> in_place_nodes = {NodeType::ActivationLayer,
+                                         NodeType::BatchNormalizationLayer,
+                                         NodeType::EltwiseLayer,
+                                         NodeType::UnaryEltwiseLayer,
+                                         NodeType::DepthwiseConvolutionLayer,
+                                         NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer,
+                                         NodeType::PrintLayer};
 
     // Not interested in the order of nodes
-    for(auto &node : g.nodes())
+    for (auto &node : g.nodes())
     {
-        if(node && in_place_nodes.find(node->type()) != std::end(in_place_nodes))
+        if (node && in_place_nodes.find(node->type()) != std::end(in_place_nodes))
         {
             // Get input edge
             Edge *input_edge = node->input_edge(0);
 
             // Check if the parent has a single output; if yes, force in-place calculation, else not
-            if((input_edge != nullptr) && output_edges_are_separate_tensors(g, input_edge))
+            if ((input_edge != nullptr) && output_edges_are_separate_tensors(g, input_edge))
             {
-                if(node->type() == NodeType::EltwiseLayer)
+                if (node->type() == NodeType::EltwiseLayer)
                 {
                     try_in_place_elementwise(node);
                 }
-                else if(node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer || node->type() == NodeType::DepthwiseConvolutionLayer)
+                else if (node->type() == NodeType::FusedDepthwiseConvolutionBatchNormalizationLayer ||
+                         node->type() == NodeType::DepthwiseConvolutionLayer)
                 {
                     try_in_place_depthwiseconv(node);
                 }
@@ -252,9 +259,11 @@
                     ARM_COMPUTE_ERROR_ON(current_output_tensor == nullptr || new_output_tensor == nullptr);
 
                     // Prevent in-place operation if there is an accessor bound to the in-place tensor or quantization info are different
-                    if(new_output_tensor->accessor() != nullptr || current_output_tensor->desc().quant_info != new_output_tensor->desc().quant_info)
+                    if (new_output_tensor->accessor() != nullptr ||
+                        current_output_tensor->desc().quant_info != new_output_tensor->desc().quant_info)
                     {
-                        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to the input tensor or the quantization info are different.\n");
+                        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented in-place operation as there is an accessor bound to "
+                                                      "the input tensor or the quantization info are different.\n");
                     }
                     else
                     {
diff --git a/src/graph/mutators/MutatorUtils.cpp b/src/graph/mutators/MutatorUtils.cpp
index c8f38f3..f47240e 100644
--- a/src/graph/mutators/MutatorUtils.cpp
+++ b/src/graph/mutators/MutatorUtils.cpp
@@ -29,14 +29,14 @@
 {
 bool is_padding_in_height_or_width(const DataLayout &layout, const PaddingList &padding_list)
 {
-    if(layout == DataLayout::NCHW || layout == DataLayout::NHWC)
+    if (layout == DataLayout::NCHW || layout == DataLayout::NHWC)
     {
         const unsigned int height_index = get_dimension_idx(layout, DataLayoutDimension::HEIGHT);
         const unsigned int width_index  = get_dimension_idx(layout, DataLayoutDimension::WIDTH);
 
-        for(unsigned int i = 0; i < padding_list.size(); ++i)
+        for (unsigned int i = 0; i < padding_list.size(); ++i)
         {
-            if(i != height_index && i != width_index && padding_list[i] != PaddingInfo(0, 0))
+            if (i != height_index && i != width_index && padding_list[i] != PaddingInfo(0, 0))
             {
                 // if the index is not either height or width, don't fuse
                 return false;
@@ -49,4 +49,4 @@
     return false;
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/mutators/NodeExecutionMethodMutator.cpp b/src/graph/mutators/NodeExecutionMethodMutator.cpp
index 09a3cf5..588befe 100644
--- a/src/graph/mutators/NodeExecutionMethodMutator.cpp
+++ b/src/graph/mutators/NodeExecutionMethodMutator.cpp
@@ -23,11 +23,11 @@
  */
 #include "arm_compute/graph/mutators/NodeExecutionMethodMutator.h"
 
+#include "arm_compute/graph/backends/BackendRegistry.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/backends/BackendRegistry.h"
 #include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/Utils.h"
 
 #include "support/Cast.h"
 
@@ -49,17 +49,17 @@
 void set_default_on_invalid_method(Graph &g, NodeType node_type, Setter &&setter)
 {
     const std::vector<NodeID> &node_ids = g.nodes(node_type);
-    for(auto &node_id : node_ids)
+    for (auto &node_id : node_ids)
     {
         INode *node = g.node(node_id);
-        if(node != nullptr)
+        if (node != nullptr)
         {
             // Validate node
             backends::IDeviceBackend &backend = backends::BackendRegistry::get().get_backend(node->assigned_target());
             Status                    status  = backend.validate_node(*node);
 
             // Set default execution method in case of failure
-            if(!bool(status))
+            if (!bool(status))
             {
                 setter(node);
             }
@@ -81,22 +81,26 @@
 void NodeExecutionMethodMutator::mutate(Graph &g)
 {
     // Convolution Layer
-    set_default_on_invalid_method(g, NodeType::ConvolutionLayer, [](INode * n)
-    {
-        ARM_COMPUTE_LOG_GRAPH_INFO("Switched ConvolutionLayer method of node with ID : "
-                                   << n->id() << " and Name: " << n->name() << std::endl);
-        auto *casted_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(n);
-        casted_node->set_convolution_method(ConvolutionMethod::Default);
-    });
+    set_default_on_invalid_method(g, NodeType::ConvolutionLayer,
+                                  [](INode *n)
+                                  {
+                                      ARM_COMPUTE_LOG_GRAPH_INFO("Switched ConvolutionLayer method of node with ID : "
+                                                                 << n->id() << " and Name: " << n->name() << std::endl);
+                                      auto *casted_node =
+                                          arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(n);
+                                      casted_node->set_convolution_method(ConvolutionMethod::Default);
+                                  });
 
     // Depthwise Convolution Layer
-    set_default_on_invalid_method(g, NodeType::DepthwiseConvolutionLayer, [](INode * n)
-    {
-        ARM_COMPUTE_LOG_GRAPH_INFO("Switched Depthwise ConvolutionLayer method of node with ID : "
-                                   << n->id() << " and Name: " << n->name() << std::endl);
-        auto *casted_node = arm_compute::utils::cast::polymorphic_downcast<DepthwiseConvolutionLayerNode *>(n);
-        casted_node->set_depthwise_convolution_method(DepthwiseConvolutionMethod::Default);
-    });
+    set_default_on_invalid_method(
+        g, NodeType::DepthwiseConvolutionLayer,
+        [](INode *n)
+        {
+            ARM_COMPUTE_LOG_GRAPH_INFO("Switched Depthwise ConvolutionLayer method of node with ID : "
+                                       << n->id() << " and Name: " << n->name() << std::endl);
+            auto *casted_node = arm_compute::utils::cast::polymorphic_downcast<DepthwiseConvolutionLayerNode *>(n);
+            casted_node->set_depthwise_convolution_method(DepthwiseConvolutionMethod::Default);
+        });
 }
 } // namespace graph
 } // namespace arm_compute
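
The include blocks in this and the preceding files are re-sorted as well. The ordering appears to be case-insensitive on the full path (backends/BackendRegistry.h now precedes Graph.h, and Utils.h follows nodes/Nodes.h), with support/ headers kept as their own group; the exact clang-format options behind this are an inference, since only their effect is visible in the diff. As a fragment mirroring the block above (it needs the library's include path to compile):

    // Case-insensitive path ordering puts "backends/..." before "Graph.h".
    #include "arm_compute/graph/backends/BackendRegistry.h"
    #include "arm_compute/graph/Graph.h"
    #include "arm_compute/graph/Logger.h"
    #include "arm_compute/graph/nodes/Nodes.h"
    #include "arm_compute/graph/Utils.h"

    // Library-support headers form a separate group after a blank line.
    #include "support/Cast.h"
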
diff --git a/src/graph/mutators/NodeFusionMutator.cpp b/src/graph/mutators/NodeFusionMutator.cpp
index 38284b9..998a4a0 100644
--- a/src/graph/mutators/NodeFusionMutator.cpp
+++ b/src/graph/mutators/NodeFusionMutator.cpp
@@ -24,15 +24,14 @@
 #include "arm_compute/graph/mutators/NodeFusionMutator.h"
 
 #include "arm_compute/core/utils/DataTypeUtils.h"
+#include "arm_compute/graph/backends/BackendRegistry.h"
 #include "arm_compute/graph/GraphBuilder.h"
 #include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Utils.h"
-#include "arm_compute/graph/backends/BackendRegistry.h"
 #include "arm_compute/graph/nodes/FusedConvolutionBatchNormalizationNode.h"
 #include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/Utils.h"
 
 #include "src/graph/mutators/MutatorUtils.h"
-
 #include "support/Cast.h"
 
 #include <list>
@@ -46,7 +45,7 @@
 {
 void transfer_driving_nodes_and_remove_old_node(Graph &g, INode *new_node, INode *old_node, bool add_output_tensor)
 {
-    if(new_node == nullptr || old_node == nullptr)
+    if (new_node == nullptr || old_node == nullptr)
     {
         return;
     }
@@ -55,7 +54,7 @@
     std::vector<NodeIdxPair> last_driving_nodes = get_driving_nodes(*old_node);
 
     // Extract last fusable node accessor if any
-    if(old_node->output(0) == nullptr)
+    if (old_node->output(0) == nullptr)
     {
         return;
     }
@@ -65,10 +64,10 @@
     g.remove_node(old_node->id());
 
     // Update fused node outputs
-    for(auto &driving_node : last_driving_nodes)
+    for (auto &driving_node : last_driving_nodes)
     {
         g.add_connection(new_node->id(), 0, driving_node.node_id, driving_node.index);
-        if(add_output_tensor)
+        if (add_output_tensor)
         {
             configure_tensor(new_node->output(0));
         }
@@ -83,19 +82,21 @@
     ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
 
     auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<ConvolutionLayerNode *>(output_edge->producer());
-    auto *bn_node   = arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer());
+    auto *bn_node =
+        arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer());
 
     // Not fusing if number of groups is greater than 1
-    if(conv_node->num_groups() > 1)
+    if (conv_node->num_groups() > 1)
     {
         return;
     }
 
-    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : " << output_edge->producer_id()
-                                  << " with BatchNormalization Layer node with ID : " << output_edge->consumer_id() << std::endl);
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing convolution node with ID : "
+                                  << output_edge->producer_id() << " with BatchNormalization Layer node with ID : "
+                                  << output_edge->consumer_id() << std::endl);
 
     // Prevent fusion if fused node has an output accessor
-    if(conv_node->output(0)->accessor() == nullptr)
+    if (conv_node->output(0)->accessor() == nullptr)
     {
         const Target assigned_target = conv_node->assigned_target();
 
@@ -115,9 +116,10 @@
         const auto epsilon = bn_node->epsilon();
 
         // Create the fused node
-        const NodeID fused_id = g.add_node<FusedConvolutionBatchNormalizationNode>(epsilon, conv_info, num_groups, conv_method, fast_math_hint, act_info);
+        const NodeID fused_id = g.add_node<FusedConvolutionBatchNormalizationNode>(
+            epsilon, conv_info, num_groups, conv_method, fast_math_hint, act_info);
 
-        if(conv_node->input_edge(2) != nullptr)
+        if (conv_node->input_edge(2) != nullptr)
         {
             auto conv_bias_id = conv_node->input_edge(2)->producer_id();
             g.add_connection(conv_bias_id, 0, fused_id, 2);
@@ -129,13 +131,13 @@
         g.add_connection(bn_mean_id, 0, fused_id, 3);
         g.add_connection(bn_var_id, 0, fused_id, 4);
 
-        if(bn_node->input_edge(3) != nullptr)
+        if (bn_node->input_edge(3) != nullptr)
         {
             const auto bn_beta_id = bn_node->input_edge(3)->producer_id();
             g.add_connection(bn_beta_id, 0, fused_id, 5);
         }
 
-        if(bn_node->input_edge(4) != nullptr)
+        if (bn_node->input_edge(4) != nullptr)
         {
             const auto bn_gamma_id = bn_node->input_edge(4)->producer_id();
             g.add_connection(bn_gamma_id, 0, fused_id, 6);
@@ -147,14 +149,15 @@
         transfer_driving_nodes_and_remove_old_node(g, fused_node, bn_node, true);
 
         fused_node->set_assigned_target(assigned_target);
-        fused_node->set_common_node_parameters(NodeParams{ conv_node->name() + "+" + bn_node_name, assigned_target });
+        fused_node->set_common_node_parameters(NodeParams{conv_node->name() + "+" + bn_node_name, assigned_target});
 
         // Remove convolution node
         g.remove_node(conv_node->id());
     }
     else
     {
-        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of convolution with batch normalization due to the presence of an output accessor\n");
+        ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+            "Prevented fusion of convolution with batch normalization due to the presence of an output accessor\n");
     }
 }
 
@@ -162,14 +165,17 @@
 {
     ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
 
-    auto *depth_conv_node = arm_compute::utils::cast::polymorphic_downcast<DepthwiseConvolutionLayerNode *>(output_edge->producer());
-    auto *bn_node         = arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer());
+    auto *depth_conv_node =
+        arm_compute::utils::cast::polymorphic_downcast<DepthwiseConvolutionLayerNode *>(output_edge->producer());
+    auto *bn_node =
+        arm_compute::utils::cast::polymorphic_downcast<BatchNormalizationLayerNode *>(output_edge->consumer());
 
-    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing depthwise convolution node with ID : " << output_edge->producer_id()
-                                  << " with BatchNormalization Layer node with ID : " << output_edge->consumer_id() << std::endl);
+    ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing depthwise convolution node with ID : "
+                                  << output_edge->producer_id() << " with BatchNormalization Layer node with ID : "
+                                  << output_edge->consumer_id() << std::endl);
 
     // Prevent fusion if fused node has an output accessor
-    if(depth_conv_node->output(0)->accessor() == nullptr)
+    if (depth_conv_node->output(0)->accessor() == nullptr)
     {
         const Target assigned_target = depth_conv_node->assigned_target();
 
@@ -189,9 +195,10 @@
         const auto epsilon     = bn_node->epsilon();
 
         // Create the fused node
-        const NodeID fused_id = g.add_node<FusedDepthwiseConvolutionBatchNormalizationNode>(epsilon, conv_info, depth_multiplier, depth_conv_method, act_info);
+        const NodeID fused_id = g.add_node<FusedDepthwiseConvolutionBatchNormalizationNode>(
+            epsilon, conv_info, depth_multiplier, depth_conv_method, act_info);
 
-        if(depth_conv_node->input_edge(2) != nullptr)
+        if (depth_conv_node->input_edge(2) != nullptr)
         {
             const auto conv_bias_id = depth_conv_node->input_edge(2)->producer_id();
             g.add_connection(conv_bias_id, 0, fused_id, 2);
@@ -211,19 +218,23 @@
         transfer_driving_nodes_and_remove_old_node(g, fused_node, bn_node, true);
 
         fused_node->set_assigned_target(assigned_target);
-        fused_node->set_common_node_parameters(NodeParams{ depth_conv_node->name() + "+" + bn_node_name, assigned_target });
+        fused_node->set_common_node_parameters(
+            NodeParams{depth_conv_node->name() + "+" + bn_node_name, assigned_target});
 
         // Remove depthwise convolution node
         g.remove_node(depth_conv_node->id());
     }
     else
     {
-        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of depthwise convolution with batch normalization due to the presence of an output accessor\n");
+        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of depthwise convolution with batch normalization due to the "
+                                      "presence of an output accessor\n");
     }
 }
 
 template <typename N>
-void fuse_node_with_activation(Graph &g, const Edge *output_edge, const std::set<Activation> &supported_fused_activations)
+void fuse_node_with_activation(Graph                      &g,
+                               const Edge                 *output_edge,
+                               const std::set<Activation> &supported_fused_activations)
 {
     ARM_COMPUTE_ERROR_ON(output_edge == nullptr);
 
@@ -233,22 +244,23 @@
     ARM_COMPUTE_ERROR_ON(act_node->output(0) == nullptr || n_node->output(0) == nullptr);
 
     // Check if activation is supported for fusion
-    if(supported_fused_activations.count(act_node->activation_info().activation()) == 0)
+    if (supported_fused_activations.count(act_node->activation_info().activation()) == 0)
     {
         return;
     }
 
     // EltwiseLayerNode can only be fused when the data type is float
-    if(n_node->type() == NodeType::EltwiseLayer && !is_data_type_float(n_node->output(0)->desc().data_type))
+    if (n_node->type() == NodeType::EltwiseLayer && !is_data_type_float(n_node->output(0)->desc().data_type))
     {
         return;
     }
 
     ARM_COMPUTE_LOG_GRAPH_VERBOSE("Fusing node with ID : " << output_edge->producer_id()
-                                  << " with Activation Layer node with ID : " << output_edge->consumer_id() << std::endl);
+                                                           << " with Activation Layer node with ID : "
+                                                           << output_edge->consumer_id() << std::endl);
 
     // Prevent fusion if fused node has an output accessor
-    if(n_node->output(0)->accessor() == nullptr)
+    if (n_node->output(0)->accessor() == nullptr)
     {
         // Set activation info to fused node
         n_node->set_fused_activation(act_node->activation_info());
@@ -257,7 +269,8 @@
     }
     else
     {
-        ARM_COMPUTE_LOG_GRAPH_VERBOSE("Prevented fusion of node with activation due to the presence of an output accessor\n");
+        ARM_COMPUTE_LOG_GRAPH_VERBOSE(
+            "Prevented fusion of node with activation due to the presence of an output accessor\n");
     }
 }
 
@@ -268,8 +281,8 @@
     auto *conv_node = arm_compute::utils::cast::polymorphic_downcast<N *>(output_edge->consumer());
 
     const Edge *input_edge = pad_node->input_edge(0);
-    if(input_edge != nullptr && input_edge->tensor() != nullptr && pad_node->output(0)->accessor() == nullptr
-       && pad_node->pad_value().get<float>() == 0.0)
+    if (input_edge != nullptr && input_edge->tensor() != nullptr && pad_node->output(0)->accessor() == nullptr &&
+        pad_node->pad_value().get<float>() == 0.0)
     {
         const DataLayout  layout       = input_edge->tensor()->desc().layout;
         const PaddingList padding_list = pad_node->padding();
@@ -280,18 +293,14 @@
         const PaddingInfo pad_w = width_index < padding_list.size() ? padding_list[width_index] : PaddingInfo(0, 0);
         const PaddingInfo pad_h = height_index < padding_list.size() ? padding_list[height_index] : PaddingInfo(0, 0);
 
-        if(is_padding_in_height_or_width(layout, padding_list))
+        if (is_padding_in_height_or_width(layout, padding_list))
         {
             // Add paddings to the convolution node
             const PadStrideInfo conv_info = conv_node->convolution_info();
-            const PadStrideInfo new_conv_info(
-                conv_info.stride().first,
-                conv_info.stride().second,
-                conv_info.pad_left() + pad_w.first,
-                conv_info.pad_right() + pad_w.second,
-                conv_info.pad_top() + pad_h.first,
-                conv_info.pad_bottom() + pad_h.second,
-                conv_info.round());
+            const PadStrideInfo new_conv_info(conv_info.stride().first, conv_info.stride().second,
+                                              conv_info.pad_left() + pad_w.first, conv_info.pad_right() + pad_w.second,
+                                              conv_info.pad_top() + pad_h.first, conv_info.pad_bottom() + pad_h.second,
+                                              conv_info.round());
             conv_node->set_convolution_info(new_conv_info);
 
             // Update drivers of the convolution node
@@ -299,7 +308,7 @@
             g.remove_node(pad_node->id());
 
             // Update fused node inputs
-            for(auto &driver_node : pad_driver_nodes)
+            for (auto &driver_node : pad_driver_nodes)
             {
                 g.add_connection(driver_node.node_id, driver_node.index, conv_node->id(), 0);
             }
@@ -308,22 +317,23 @@
 }
 
 template <typename N1, typename N2, typename F, typename... Args>
-void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&... optional_arguments)
+void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&...optional_arguments)
 {
     // Note that fused nodes may be added to the end of the node list.
     // Instead of only looping over the original list of nodes, we loop over the current node list, which could be growing.
     // This is intentional, as it probes the newly added fused nodes for further fusing opportunities.
-    for(unsigned int i = 0; i < g.nodes().size(); ++i)
+    for (unsigned int i = 0; i < g.nodes().size(); ++i)
     {
         auto node = g.node(i);
         // Check if the node is of type N1 and not a branching node
-        if(node && node->type() == N1::node_type && node->output_edges().size() == 1)
+        if (node && node->type() == N1::node_type && node->output_edges().size() == 1)
         {
             const auto output_edge_id = *node->output_edges().begin();
             const auto output_edge    = g.edge(output_edge_id);
 
             // Check if following node is a type N2 node
-            if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && (output_edge->consumer()->type() == N2::node_type) && prec(*output_edge->producer()))
+            if ((output_edge != nullptr) && (output_edge->consumer() != nullptr) &&
+                (output_edge->consumer()->type() == N2::node_type) && prec(*output_edge->producer()))
             {
                 fuse_fcn(g, output_edge, optional_arguments...);
             }
@@ -332,22 +342,22 @@
 }
 
 template <typename N1, typename F, typename... Args>
-void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&... optional_arguments)
+void fuse_layer(Graph &g, std::function<bool(INode &)> const &prec, const F fuse_fcn, Args &&...optional_arguments)
 {
     // Note that fused nodes may be added to the end of the node list.
     // Instead of only looping over the original list of nodes, we loop over the current node list, which could be growing.
     // This is intentional, as it probes the newly added fused nodes for further fusing opportunities.
-    for(unsigned int i = 0; i < g.nodes().size(); ++i)
+    for (unsigned int i = 0; i < g.nodes().size(); ++i)
     {
         auto node = g.node(i);
         // Check if the node is of type N1 and not a branching node
-        if(node && node->type() == N1::node_type && node->output_edges().size() == 1)
+        if (node && node->type() == N1::node_type && node->output_edges().size() == 1)
         {
             const auto output_edge_id = *node->output_edges().begin();
             const auto output_edge    = g.edge(output_edge_id);
 
             // Check if it's the correct target
-            if((output_edge != nullptr) && (output_edge->consumer() != nullptr) && prec(*output_edge->producer()))
+            if ((output_edge != nullptr) && (output_edge->consumer() != nullptr) && prec(*output_edge->producer()))
             {
                 fuse_fcn(g, output_edge, i, optional_arguments...);
             }
@@ -369,30 +379,24 @@
 void NodeFusionMutator::mutate(Graph &g)
 {
     // Supported activations when fusing
-    const std::set<Activation> supported_fused_activations = { Activation::ABS, Activation::BOUNDED_RELU, Activation::ELU,
-                                                               Activation::HARD_SWISH, Activation::IDENTITY, Activation::LEAKY_RELU,
-                                                               Activation::LINEAR, Activation::LOGISTIC, Activation::LU_BOUNDED_RELU,
-                                                               Activation::RELU, Activation::SOFT_RELU, Activation::SQRT,
-                                                               Activation::SQUARE, Activation::TANH
-                                                             };
+    const std::set<Activation> supported_fused_activations = {
+        Activation::ABS,        Activation::BOUNDED_RELU, Activation::ELU,
+        Activation::HARD_SWISH, Activation::IDENTITY,     Activation::LEAKY_RELU,
+        Activation::LINEAR,     Activation::LOGISTIC,     Activation::LU_BOUNDED_RELU,
+        Activation::RELU,       Activation::SOFT_RELU,    Activation::SQRT,
+        Activation::SQUARE,     Activation::TANH};
 
     // Preconditions
-    auto empty_prec = [](INode &)
-    {
-        return true;
-    };
-    auto cl_target_prec = [](INode & n)
-    {
-        return n.assigned_target() == Target::CL;
-    };
-    auto qs8_prec = [&g](INode & n)
+    auto empty_prec     = [](INode &) { return true; };
+    auto cl_target_prec = [](INode &n) { return n.assigned_target() == Target::CL; };
+    auto qs8_prec       = [&g](INode &n)
     {
         ARM_COMPUTE_ERROR_ON(n.output(0) == nullptr);
 
         const auto output_edge_id = *n.output_edges().begin();
         const auto output_edge    = g.edge(output_edge_id);
         // To perform fusion, the two nodes must have the same output quantization information
-        const bool same_qinfo     = n.output(0)->desc().quant_info == output_edge->producer()->output(0)->desc().quant_info;
+        const bool same_qinfo = n.output(0)->desc().quant_info == output_edge->producer()->output(0)->desc().quant_info;
         const bool output_qasymm8 = n.output(0)->desc().data_type == DataType::QASYMM8;
 
         return (output_qasymm8 && same_qinfo) || !output_qasymm8;
@@ -400,16 +404,25 @@
 
     // Fusion mutations
 
-    detail::fuse_layer<PadLayerNode, ConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<ConvolutionLayerNode>);
-    detail::fuse_layer<PadLayerNode, DepthwiseConvolutionLayerNode>(g, empty_prec, detail::fuse_pad_with_convolution<DepthwiseConvolutionLayerNode>);
-    detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations);
-    detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
-    detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations);
-    detail::fuse_layer<FullyConnectedLayerNode, ActivationLayerNode>(g, empty_prec, detail::fuse_node_with_activation<FullyConnectedLayerNode>, supported_fused_activations);
-    detail::fuse_layer<EltwiseLayerNode, ActivationLayerNode>(g, cl_target_prec, detail::fuse_node_with_activation<EltwiseLayerNode>, supported_fused_activations);
+    detail::fuse_layer<PadLayerNode, ConvolutionLayerNode>(g, empty_prec,
+                                                           detail::fuse_pad_with_convolution<ConvolutionLayerNode>);
+    detail::fuse_layer<PadLayerNode, DepthwiseConvolutionLayerNode>(
+        g, empty_prec, detail::fuse_pad_with_convolution<DepthwiseConvolutionLayerNode>);
+    detail::fuse_layer<BatchNormalizationLayerNode, ActivationLayerNode>(
+        g, empty_prec, detail::fuse_node_with_activation<BatchNormalizationLayerNode>, supported_fused_activations);
+    detail::fuse_layer<ConvolutionLayerNode, ActivationLayerNode>(
+        g, empty_prec, detail::fuse_node_with_activation<ConvolutionLayerNode>, supported_fused_activations);
+    detail::fuse_layer<DepthwiseConvolutionLayerNode, ActivationLayerNode>(
+        g, qs8_prec, detail::fuse_node_with_activation<DepthwiseConvolutionLayerNode>, supported_fused_activations);
+    detail::fuse_layer<FullyConnectedLayerNode, ActivationLayerNode>(
+        g, empty_prec, detail::fuse_node_with_activation<FullyConnectedLayerNode>, supported_fused_activations);
+    detail::fuse_layer<EltwiseLayerNode, ActivationLayerNode>(
+        g, cl_target_prec, detail::fuse_node_with_activation<EltwiseLayerNode>, supported_fused_activations);
     // The fusion of BatchNormalizationLayer must occur after the fusion of ActivationLayer, because FusedConvolutionBatchNormalizationNode assumes the BatchNormalization is already fused with the activation, if any.
-    detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_convolution_with_batch_normalization);
-    detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>(g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization);
+    detail::fuse_layer<ConvolutionLayerNode, BatchNormalizationLayerNode>(
+        g, empty_prec, detail::fuse_convolution_with_batch_normalization);
+    detail::fuse_layer<DepthwiseConvolutionLayerNode, BatchNormalizationLayerNode>(
+        g, empty_prec, detail::fuse_depthwise_convolution_with_batch_normalization);
 }
 } // namespace graph
 } // namespace arm_compute
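
For context on the fuse_layer traversal reformatted above: it iterates by index and re-reads g.nodes().size() on each pass because fused nodes are appended to the node list during the loop. A minimal standalone sketch of that pattern (hypothetical names, not ACL code) showing why index-based iteration survives growth while iterators would be invalidated on reallocation:

#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> nodes = {1, 2, 3};
    // size() is re-evaluated every iteration, so elements appended below
    // are visited too; an iterator loop would be invalid after reallocation.
    for (std::size_t i = 0; i < nodes.size(); ++i)
    {
        if (nodes[i] == 2)
        {
            nodes.push_back(4); // a "fused node" appended mid-traversal
        }
        std::printf("visit %d\n", nodes[i]);
    }
    return 0;
}
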
diff --git a/src/graph/mutators/SplitLayerSubTensorMutator.cpp b/src/graph/mutators/SplitLayerSubTensorMutator.cpp
index 2c28a1a..533f894 100644
--- a/src/graph/mutators/SplitLayerSubTensorMutator.cpp
+++ b/src/graph/mutators/SplitLayerSubTensorMutator.cpp
@@ -23,12 +23,12 @@
  */
 #include "arm_compute/graph/mutators/SplitLayerSubTensorMutator.h"
 
-#include "arm_compute/graph/Graph.h"
-#include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Utils.h"
 #include "arm_compute/graph/algorithms/TopologicalSort.h"
 #include "arm_compute/graph/backends/BackendRegistry.h"
+#include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/Logger.h"
 #include "arm_compute/graph/nodes/SplitLayerNode.h"
+#include "arm_compute/graph/Utils.h"
 
 #include "support/Cast.h"
 #include "support/Iterable.h"
@@ -50,7 +50,7 @@
 void SplitLayerSubTensorMutator::mutate(Graph &g)
 {
     // Early exit if no Split layers exist in graph
-    if(g.nodes(NodeType::SplitLayer).empty())
+    if (g.nodes(NodeType::SplitLayer).empty())
     {
         return;
     }
@@ -59,23 +59,23 @@
     std::vector<NodeID> topological_sorted_node_ids = dfs(g);
 
     // Should be in reverse order of execution
-    for(auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids))
+    for (auto &node_id : arm_compute::utils::iterable::reverse_iterate(topological_sorted_node_ids))
     {
         INode *node = g.node(node_id);
-        if(node != nullptr && node->type() == NodeType::SplitLayer && node->input(0) != nullptr)
+        if (node != nullptr && node->type() == NodeType::SplitLayer && node->input(0) != nullptr)
         {
             // Get input tensor
             Tensor *input_tensor = node->input(0);
 
             // Check that all output tensors are valid and have the same target as the input
             bool is_valid = std::all_of(node->outputs().cbegin(), node->outputs().cend(),
-                                        [&](const TensorID & tid)
-            {
-                return (g.tensor(tid) != nullptr) && (g.tensor(tid)->desc().target == input_tensor->desc().target);
-            });
+                                        [&](const TensorID &tid) {
+                                            return (g.tensor(tid) != nullptr) &&
+                                                   (g.tensor(tid)->desc().target == input_tensor->desc().target);
+                                        });
 
             // Create subtensors
-            if(is_valid && is_target_supported(input_tensor->desc().target))
+            if (is_valid && is_target_supported(input_tensor->desc().target))
             {
                 ARM_COMPUTE_LOG_GRAPH_VERBOSE("Using sub-tensors for the node with ID : "
                                               << node->id() << " and name : " << node->name() << std::endl);
@@ -87,15 +87,18 @@
                 const bool         extend_parent = (axis < 2);
 
                 // Create sub-tensor handles
-                for(unsigned int i = 0; i < node->outputs().size(); ++i)
+                for (unsigned int i = 0; i < node->outputs().size(); ++i)
                 {
                     Tensor           *output_tensor = node->output(i);
                     const TensorShape output_shape  = output_tensor->desc().shape;
                     Coordinates       coords;
-                    std::tie(std::ignore, coords) = split_node->compute_output_descriptor(input_tensor->desc(), num_splits, axis, i);
+                    std::tie(std::ignore, coords) =
+                        split_node->compute_output_descriptor(input_tensor->desc(), num_splits, axis, i);
 
-                    backends::IDeviceBackend      &backend = backends::BackendRegistry::get().get_backend(output_tensor->desc().target);
-                    std::unique_ptr<ITensorHandle> handle  = backend.create_subtensor(input_tensor->handle(), output_shape, coords, extend_parent);
+                    backends::IDeviceBackend &backend =
+                        backends::BackendRegistry::get().get_backend(output_tensor->desc().target);
+                    std::unique_ptr<ITensorHandle> handle =
+                        backend.create_subtensor(input_tensor->handle(), output_shape, coords, extend_parent);
                     output_tensor->set_handle(std::move(handle));
                 }
             }
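
For context on the sub-tensor handles created above: each Split output is backed by a view into the parent tensor's allocation (via backend.create_subtensor), so the split itself copies no data. A minimal sketch of that idea under stated assumptions; SubView and make_subview are hypothetical stand-ins, not the ITensorHandle API:

#include <cassert>
#include <cstddef>
#include <vector>

// A sub-tensor as an (offset, length) view into the parent buffer.
struct SubView
{
    float      *data; // points into the parent allocation
    std::size_t size;
};

SubView make_subview(std::vector<float> &parent, std::size_t offset, std::size_t size)
{
    assert(offset + size <= parent.size());
    return SubView{parent.data() + offset, size};
}

int main()
{
    std::vector<float> parent(12, 0.f);
    // Conceptually splitting into 3 equal parts along one axis.
    SubView s0 = make_subview(parent, 0, 4);
    SubView s1 = make_subview(parent, 4, 4);
    SubView s2 = make_subview(parent, 8, 4);
    (void)s0;
    (void)s2;
    s1.data[0] = 1.f; // writes land directly in the parent allocation
    return parent[4] == 1.f ? 0 : 1;
}
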
diff --git a/src/graph/mutators/SyntheticDataTypeMutator.cpp b/src/graph/mutators/SyntheticDataTypeMutator.cpp
index 74d040b..3dc2480 100644
--- a/src/graph/mutators/SyntheticDataTypeMutator.cpp
+++ b/src/graph/mutators/SyntheticDataTypeMutator.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/graph/GraphBuilder.h"
 #include "arm_compute/graph/ITensorAccessor.h"
 #include "arm_compute/graph/Logger.h"
-#include "arm_compute/graph/Utils.h"
 #include "arm_compute/graph/nodes/Nodes.h"
+#include "arm_compute/graph/Utils.h"
 
 #include "support/Cast.h"
 
@@ -62,14 +62,12 @@
  */
 bool is_mutation_supported(Graph &g)
 {
-    const std::set<NodeType> unsupported_node_types = { NodeType::DetectionOutputLayer,
-                                                        NodeType::NormalizationLayer,
-                                                        NodeType::PriorBoxLayer
-                                                      };
+    const std::set<NodeType> unsupported_node_types = {NodeType::DetectionOutputLayer, NodeType::NormalizationLayer,
+                                                       NodeType::PriorBoxLayer};
 
-    for(const auto &utype : unsupported_node_types)
+    for (const auto &utype : unsupported_node_types)
     {
-        if(!g.nodes(utype).empty())
+        if (!g.nodes(utype).empty())
         {
             return false;
         }
@@ -83,12 +81,12 @@
  */
 void remove_optimized_nodes(Graph &g)
 {
-    const std::set<NodeType> optimized_node_types = { NodeType::BatchNormalizationLayer };
+    const std::set<NodeType> optimized_node_types = {NodeType::BatchNormalizationLayer};
 
-    for(const auto &opt_type : optimized_node_types)
+    for (const auto &opt_type : optimized_node_types)
     {
         const std::vector<NodeID> opt_nodes_ids = g.nodes(opt_type);
-        for(const auto &node_id : opt_nodes_ids)
+        for (const auto &node_id : opt_nodes_ids)
         {
             INode *node = g.node(node_id);
 
@@ -108,7 +106,7 @@
             g.remove_node(node->id());
 
             // Update connections
-            for(auto &driving_node : driving_nodes)
+            for (auto &driving_node : driving_nodes)
             {
                 g.add_connection(producer->id(), producer_edge_id, driving_node.node_id, driving_node.index);
             }
@@ -123,11 +121,11 @@
 void convert_tensors(Graph &g, DataType data_type)
 {
     auto &tensors = g.tensors();
-    for(auto &tensor : tensors)
+    for (auto &tensor : tensors)
     {
-        if(tensor != nullptr)
+        if (tensor != nullptr)
         {
-            switch(data_type)
+            switch (data_type)
             {
                 case DataType::QASYMM8:
                 case DataType::QASYMM8_SIGNED:
@@ -156,7 +154,7 @@
 void convert_special_node(Graph &g, std::function<bool(INode *, Tensor *)> const &f)
 {
     const std::vector<NodeID> nodes_ids = g.nodes(NT::node_type);
-    for(const auto &nodes_id : nodes_ids)
+    for (const auto &nodes_id : nodes_ids)
     {
         INode *node = arm_compute::utils::cast::polymorphic_downcast<NT *>(g.node(nodes_id));
         ARM_COMPUTE_ERROR_ON(node == nullptr);
@@ -174,41 +172,41 @@
  */
 void convert_special_tensors(Graph &g)
 {
-    auto softmax_func = [](INode * node, Tensor * tensor)
+    auto softmax_func = [](INode *node, Tensor *tensor)
     {
         ARM_COMPUTE_UNUSED(node);
-        if(tensor->desc().data_type == DataType::QASYMM8)
+        if (tensor->desc().data_type == DataType::QASYMM8)
         {
             tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, 0);
         }
-        else if(tensor->desc().data_type == DataType::QASYMM8_SIGNED)
+        else if (tensor->desc().data_type == DataType::QASYMM8_SIGNED)
         {
             tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, -128);
         }
         return true;
     };
 
-    auto act_func = [](INode * node, Tensor * tensor)
+    auto act_func = [](INode *node, Tensor *tensor)
     {
         auto *act_node = arm_compute::utils::cast::polymorphic_downcast<ActivationLayerNode *>(node);
-        if(tensor->desc().data_type == DataType::QASYMM8)
+        if (tensor->desc().data_type == DataType::QASYMM8)
         {
-            if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH)
+            if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH)
             {
                 tensor->desc().quant_info = QuantizationInfo(1.f / 128.f, 128);
             }
-            else if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            else if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC)
             {
                 tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, 0);
             }
         }
-        else if(tensor->desc().data_type == DataType::QASYMM8_SIGNED)
+        else if (tensor->desc().data_type == DataType::QASYMM8_SIGNED)
         {
-            if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH)
+            if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::TANH)
             {
                 tensor->desc().quant_info = QuantizationInfo(1.f / 128.f, 0);
             }
-            else if(act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC)
+            else if (act_node->activation_info().activation() == ActivationLayerInfo::ActivationFunction::LOGISTIC)
             {
                 tensor->desc().quant_info = QuantizationInfo(1.f / 256.f, -128);
             }
@@ -228,22 +226,19 @@
  */
 void handle_nodes_with_bias(Graph &g)
 {
-    const std::set<NodeType> special_node_types = { NodeType::ConvolutionLayer,
-                                                    NodeType::DeconvolutionLayer,
-                                                    NodeType::DepthwiseConvolutionLayer,
-                                                    NodeType::FullyConnectedLayer
-                                                  };
+    const std::set<NodeType> special_node_types = {NodeType::ConvolutionLayer, NodeType::DeconvolutionLayer,
+                                                   NodeType::DepthwiseConvolutionLayer, NodeType::FullyConnectedLayer};
 
-    for(const auto &spc_type : special_node_types)
+    for (const auto &spc_type : special_node_types)
     {
         const std::vector<NodeID> scp_nodes_ids = g.nodes(spc_type);
-        for(const auto &node_id : scp_nodes_ids)
+        for (const auto &node_id : scp_nodes_ids)
         {
             INode *node = g.node(node_id);
-            if(node != nullptr)
+            if (node != nullptr)
             {
                 Tensor *tensor = node->input(2);
-                if(tensor != nullptr)
+                if (tensor != nullptr)
                 {
                     tensor->desc().data_type = DataType::S32;
                 }
@@ -253,8 +248,8 @@
                     params.name = params.name.empty() ? "" : params.name + "Bias";
 
                     TensorDescriptor b_desc = node->input(1)->desc();
-                    auto             depth  = b_desc.shape[get_dimension_idx(b_desc.layout, DataLayoutDimension::BATCHES)];
-                    b_desc.shape            = TensorShape(depth);
+                    auto depth   = b_desc.shape[get_dimension_idx(b_desc.layout, DataLayoutDimension::BATCHES)];
+                    b_desc.shape = TensorShape(depth);
 
                     auto accessor = std::make_unique<EmptyAccessor>();
                     auto b_nid    = GraphBuilder::add_const_node(g, params, b_desc, std::move(accessor));
@@ -266,8 +261,7 @@
 }
 } // namespace
 
-SyntheticDataTypeMutator::SyntheticDataTypeMutator(DataType mutate_type)
-    : _mutate_type{ mutate_type }
+SyntheticDataTypeMutator::SyntheticDataTypeMutator(DataType mutate_type) : _mutate_type{mutate_type}
 {
 }
 
@@ -283,7 +277,7 @@
 
 void SyntheticDataTypeMutator::mutate(Graph &g)
 {
-    if(is_mutation_supported(g))
+    if (is_mutation_supported(g))
     {
         // Remove nodes that get optimized out (e.g. BatchNorm)
         remove_optimized_nodes(g);
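
For context on the fixed QuantizationInfo values assigned in convert_special_tensors above: they follow the usual affine rule q = round(x / scale) + offset, with scale and offset picked to cover the known output range of each function (softmax in [0, 1), tanh in [-1, 1], shifted by -128 for QASYMM8_SIGNED). A small self-contained sketch of that arithmetic, illustrative only:

#include <algorithm>
#include <cmath>
#include <cstdio>

// Affine quantization: q = round(x / scale) + offset, clamped to [qmin, qmax].
int quantize(float x, float scale, int offset, int qmin, int qmax)
{
    const int q = static_cast<int>(std::lround(x / scale)) + offset;
    return std::min(std::max(q, qmin), qmax);
}

int main()
{
    // Softmax output lies in [0, 1): scale 1/256, offset 0 spans it in QASYMM8.
    std::printf("softmax(1.0) -> %d\n", quantize(1.f, 1.f / 256.f, 0, 0, 255));
    // Tanh output lies in [-1, 1]: scale 1/128, offset 128 spans it in QASYMM8.
    std::printf("tanh(-1.0)   -> %d\n", quantize(-1.f, 1.f / 128.f, 128, 0, 255));
    return 0;
}
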
diff --git a/src/graph/nodes/ActivationLayerNode.cpp b/src/graph/nodes/ActivationLayerNode.cpp
index cf65d83..1773afc 100644
--- a/src/graph/nodes/ActivationLayerNode.cpp
+++ b/src/graph/nodes/ActivationLayerNode.cpp
@@ -44,7 +44,7 @@
 
 bool ActivationLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -63,7 +63,7 @@
     ARM_COMPUTE_ERROR_ON(src == nullptr);
 
     TensorDescriptor output_info = src->desc();
-    if(!_out_quant_info.empty())
+    if (!_out_quant_info.empty())
     {
         output_info.quant_info = _out_quant_info;
     }
diff --git a/src/graph/nodes/ArgMinMaxLayerNode.cpp b/src/graph/nodes/ArgMinMaxLayerNode.cpp
index 63163b9..5adebc9 100644
--- a/src/graph/nodes/ArgMinMaxLayerNode.cpp
+++ b/src/graph/nodes/ArgMinMaxLayerNode.cpp
@@ -23,16 +23,18 @@
  */
 #include "arm_compute/graph/nodes/ArgMinMaxLayerNode.h"
 
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/INodeVisitor.h"
 
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
 namespace arm_compute
 {
 namespace graph
 {
-ArgMinMaxLayerNode::ArgMinMaxLayerNode(ReductionOperation op, unsigned int axis, DataType out_data_type, QuantizationInfo out_quant_info)
+ArgMinMaxLayerNode::ArgMinMaxLayerNode(ReductionOperation op,
+                                       unsigned int       axis,
+                                       DataType           out_data_type,
+                                       QuantizationInfo   out_quant_info)
     : _op(op), _axis(axis), _out_data_type(out_data_type), _out_quant_info(std::move(out_quant_info))
 {
     _input_edges.resize(1, EmptyEdgeID);
@@ -56,7 +58,7 @@
 
 bool ArgMinMaxLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -75,17 +77,18 @@
     ARM_COMPUTE_ERROR_ON(src == nullptr);
 
     TensorDescriptor output_info = src->desc();
-    if(!_out_quant_info.empty())
+    if (!_out_quant_info.empty())
     {
         output_info.quant_info = _out_quant_info;
     }
 
-    if(_out_data_type != DataType::UNKNOWN)
+    if (_out_data_type != DataType::UNKNOWN)
     {
         output_info.data_type = _out_data_type;
     }
 
-    TensorShape output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, false);
+    TensorShape output_shape =
+        arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, false);
     output_info.set_shape(output_shape);
 
     return output_info;
diff --git a/src/graph/nodes/BatchNormalizationLayerNode.cpp b/src/graph/nodes/BatchNormalizationLayerNode.cpp
index ceca0e2..c317123 100644
--- a/src/graph/nodes/BatchNormalizationLayerNode.cpp
+++ b/src/graph/nodes/BatchNormalizationLayerNode.cpp
@@ -55,7 +55,7 @@
 
 bool BatchNormalizationLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -86,4 +86,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/BoundingBoxTransformLayerNode.cpp b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp
index f3f4f91..8e52174 100644
--- a/src/graph/nodes/BoundingBoxTransformLayerNode.cpp
+++ b/src/graph/nodes/BoundingBoxTransformLayerNode.cpp
@@ -23,17 +23,15 @@
  */
 #include "arm_compute/graph/nodes/BoundingBoxTransformLayerNode.h"
 
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/INodeVisitor.h"
 
-#include "arm_compute/core/Helpers.h"
-
 namespace arm_compute
 {
 namespace graph
 {
-BoundingBoxTransformLayerNode::BoundingBoxTransformLayerNode(BoundingBoxTransformInfo &info)
-    : _bbox_info(info)
+BoundingBoxTransformLayerNode::BoundingBoxTransformLayerNode(BoundingBoxTransformInfo &info) : _bbox_info(info)
 {
     _input_edges.resize(2, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -46,7 +44,7 @@
 
 bool BoundingBoxTransformLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/ChannelShuffleLayerNode.cpp b/src/graph/nodes/ChannelShuffleLayerNode.cpp
index 5102e4b..3cb9e23 100644
--- a/src/graph/nodes/ChannelShuffleLayerNode.cpp
+++ b/src/graph/nodes/ChannelShuffleLayerNode.cpp
@@ -30,8 +30,7 @@
 {
 namespace graph
 {
-ChannelShuffleLayerNode::ChannelShuffleLayerNode(unsigned int num_groups)
-    : _num_groups(num_groups)
+ChannelShuffleLayerNode::ChannelShuffleLayerNode(unsigned int num_groups) : _num_groups(num_groups)
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -44,7 +43,7 @@
 
 bool ChannelShuffleLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -75,4 +74,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/ConcatenateLayerNode.cpp b/src/graph/nodes/ConcatenateLayerNode.cpp
index 3f3c70f..8e5393a 100644
--- a/src/graph/nodes/ConcatenateLayerNode.cpp
+++ b/src/graph/nodes/ConcatenateLayerNode.cpp
@@ -24,17 +24,17 @@
 #include "arm_compute/graph/nodes/ConcatenateLayerNode.h"
 
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/INodeVisitor.h"
 #include "arm_compute/graph/Utils.h"
 
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
 namespace arm_compute
 {
 namespace graph
 {
-ConcatenateLayerNode::ConcatenateLayerNode(unsigned int total_nodes, descriptors::ConcatLayerDescriptor concat_descriptor)
+ConcatenateLayerNode::ConcatenateLayerNode(unsigned int                       total_nodes,
+                                           descriptors::ConcatLayerDescriptor concat_descriptor)
     : _total_nodes(total_nodes), _concat_descriptor(std::move(concat_descriptor)), _is_enabled(true)
 {
     _input_edges.resize(_total_nodes, EmptyEdgeID);
@@ -73,7 +73,7 @@
     // Extract shapes
     std::vector<const TensorShape *> shapes;
     shapes.reserve(input_descriptors.size());
-    for(auto &input_descriptor : input_descriptors)
+    for (auto &input_descriptor : input_descriptors)
     {
         shapes.emplace_back(&input_descriptor.shape);
     }
@@ -85,7 +85,7 @@
 
 bool ConcatenateLayerNode::forward_descriptors()
 {
-    if(_outputs[0] != NullTensorID)
+    if (_outputs[0] != NullTensorID)
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -101,24 +101,22 @@
     ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
 
     // Check if all input tensors are set
-    bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid)
-    {
-        return eid != EmptyEdgeID;
-    });
+    bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges),
+                                          [](const EdgeID &eid) { return eid != EmptyEdgeID; });
 
     TensorDescriptor output_info = {};
 
-    if(are_all_inputs_set)
+    if (are_all_inputs_set)
     {
         std::vector<TensorDescriptor> inputs_descriptors;
-        for(unsigned int i = 0; i < _input_edges.size(); ++i)
+        for (unsigned int i = 0; i < _input_edges.size(); ++i)
         {
             const Tensor *t = _graph->tensor(input_id(i));
             ARM_COMPUTE_ERROR_ON(t == nullptr);
             inputs_descriptors.push_back(t->desc());
         }
         output_info = compute_output_descriptor(inputs_descriptors, _concat_descriptor.axis);
-        if(!_concat_descriptor.output_qinfo.empty())
+        if (!_concat_descriptor.output_qinfo.empty())
         {
             output_info.quant_info = _concat_descriptor.output_qinfo;
         }
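
For context on compute_output_descriptor above: concatenation leaves every dimension unchanged except the concatenation axis, whose output extent is the sum of the input extents. A minimal sketch of that shape rule (concat_shape is a hypothetical helper, not the ShapeCalculator API):

#include <cassert>
#include <cstddef>
#include <vector>

std::vector<std::size_t> concat_shape(const std::vector<std::vector<std::size_t>> &shapes, std::size_t axis)
{
    std::vector<std::size_t> out = shapes.front();
    for (std::size_t i = 1; i < shapes.size(); ++i)
    {
        for (std::size_t d = 0; d < out.size(); ++d)
        {
            assert(d == axis || shapes[i][d] == out[d]); // non-axis dims must match
        }
        out[axis] += shapes[i][axis];
    }
    return out;
}

int main()
{
    // Two channel-first shapes concatenated along axis 0: 3 + 5 = 8 channels.
    const std::vector<std::size_t> out = concat_shape({{3, 8, 8}, {5, 8, 8}}, 0);
    return out[0] == 8 ? 0 : 1;
}
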
diff --git a/src/graph/nodes/ConstNode.cpp b/src/graph/nodes/ConstNode.cpp
index eb96d63..6e8fbff 100644
--- a/src/graph/nodes/ConstNode.cpp
+++ b/src/graph/nodes/ConstNode.cpp
@@ -30,15 +30,14 @@
 {
 namespace graph
 {
-ConstNode::ConstNode(TensorDescriptor desc)
-    : _desc(std::move(desc))
+ConstNode::ConstNode(TensorDescriptor desc) : _desc(std::move(desc))
 {
     _outputs.resize(1, NullTensorID);
 }
 
 bool ConstNode::forward_descriptors()
 {
-    if(output_id(0) != NullTensorID)
+    if (output_id(0) != NullTensorID)
     {
         Tensor *t = output(0);
         ARM_COMPUTE_ERROR_ON(t == nullptr);
diff --git a/src/graph/nodes/ConvolutionLayerNode.cpp b/src/graph/nodes/ConvolutionLayerNode.cpp
index ee9dde9..f0263fc 100644
--- a/src/graph/nodes/ConvolutionLayerNode.cpp
+++ b/src/graph/nodes/ConvolutionLayerNode.cpp
@@ -37,7 +37,12 @@
                                            ConvolutionMethod method,
                                            FastMathHint      fast_math_hint,
                                            QuantizationInfo  out_quant_info)
-    : _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _out_quant_info(std::move(out_quant_info)), _fused_activation()
+    : _info(std::move(info)),
+      _num_groups(num_groups),
+      _method(method),
+      _fast_math_hint(fast_math_hint),
+      _out_quant_info(std::move(out_quant_info)),
+      _fused_activation()
 {
     _input_edges.resize(3, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -100,20 +105,22 @@
     const unsigned int kernel_width  = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
     const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
 
-    std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+    std::tie(output_width, output_height) =
+        scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
 
     const DataLayout data_layout       = input_descriptor.layout;
     TensorDescriptor output_descriptor = input_descriptor;
     output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
     output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
-    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL),
+                                weights_descriptor.shape[3]);
 
     return output_descriptor;
 }
 
 bool ConvolutionLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -132,7 +139,7 @@
     ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
 
     TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info);
-    if(!_out_quant_info.empty())
+    if (!_out_quant_info.empty())
     {
         output_info.quant_info = _out_quant_info;
     }
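
For context on the scaled_dimensions call reformatted above: in the default floor-rounding case it reduces to the standard convolution output-size formula below; PadStrideInfo can select other rounding behaviour, which this sketch (conv_out_dim is a hypothetical helper) deliberately omits:

#include <cstdio>

// out = floor((in + pad_before + pad_after - kernel) / stride) + 1
unsigned int conv_out_dim(
    unsigned int in, unsigned int kernel, unsigned int pad_before, unsigned int pad_after, unsigned int stride)
{
    return (in + pad_before + pad_after - kernel) / stride + 1;
}

int main()
{
    // 224x224 input, 7x7 kernel, stride 2, padding 3 on each side -> 112.
    std::printf("%u\n", conv_out_dim(224, 7, 3, 3, 2));
    return 0;
}
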
diff --git a/src/graph/nodes/DeconvolutionLayerNode.cpp b/src/graph/nodes/DeconvolutionLayerNode.cpp
index 3542d5a..2058ab2 100644
--- a/src/graph/nodes/DeconvolutionLayerNode.cpp
+++ b/src/graph/nodes/DeconvolutionLayerNode.cpp
@@ -56,20 +56,22 @@
     const unsigned int kernel_width  = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
     const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
 
-    std::tie(output_width, output_height) = deconvolution_output_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+    std::tie(output_width, output_height) =
+        deconvolution_output_dimensions(input_width, input_height, kernel_width, kernel_height, info);
 
     const DataLayout data_layout       = input_descriptor.layout;
     TensorDescriptor output_descriptor = input_descriptor;
     output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
     output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
-    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL),
+                                weights_descriptor.shape[3]);
 
     return output_descriptor;
 }
 
 bool DeconvolutionLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -89,7 +91,7 @@
 
     TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), descriptor.info);
 
-    if(!descriptor.out_quant_info.empty())
+    if (!descriptor.out_quant_info.empty())
     {
         output_info.set_quantization_info(descriptor.out_quant_info);
     }
diff --git a/src/graph/nodes/DepthToSpaceLayerNode.cpp b/src/graph/nodes/DepthToSpaceLayerNode.cpp
index b70ac56..0b914a0 100644
--- a/src/graph/nodes/DepthToSpaceLayerNode.cpp
+++ b/src/graph/nodes/DepthToSpaceLayerNode.cpp
@@ -32,8 +32,7 @@
 {
 namespace graph
 {
-DepthToSpaceLayerNode::DepthToSpaceLayerNode(int block_shape)
-    : _block_shape(block_shape)
+DepthToSpaceLayerNode::DepthToSpaceLayerNode(int block_shape) : _block_shape(block_shape)
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -44,7 +43,8 @@
     return _block_shape;
 }
 
-TensorDescriptor DepthToSpaceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor, int block_shape)
+TensorDescriptor DepthToSpaceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+                                                                  int                     block_shape)
 {
     using namespace arm_compute::helpers::tensor_transform;
 
@@ -53,14 +53,15 @@
 
     // Set descriptor shape
     TensorDescriptor output_descriptor = input_descriptor;
-    output_descriptor.shape            = misc::shape_calculator::compute_depth_to_space_shape(input_shape, data_layout, block_shape);
+    output_descriptor.shape =
+        misc::shape_calculator::compute_depth_to_space_shape(input_shape, data_layout, block_shape);
 
     return output_descriptor;
 }
 
 bool DepthToSpaceLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
index 7de2016..92d7266 100644
--- a/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
+++ b/src/graph/nodes/DepthwiseConvolutionLayerNode.cpp
@@ -32,9 +32,15 @@
 {
 namespace graph
 {
-DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo info, int depth_multiplier, DepthwiseConvolutionMethod method,
-                                                             QuantizationInfo out_quant_info)
-    : _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _out_quant_info(std::move(out_quant_info)), _fused_activation()
+DepthwiseConvolutionLayerNode::DepthwiseConvolutionLayerNode(PadStrideInfo              info,
+                                                             int                        depth_multiplier,
+                                                             DepthwiseConvolutionMethod method,
+                                                             QuantizationInfo           out_quant_info)
+    : _info(std::move(info)),
+      _depth_multiplier(depth_multiplier),
+      _method(method),
+      _out_quant_info(std::move(out_quant_info)),
+      _fused_activation()
 {
     _input_edges.resize(3, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -89,20 +95,22 @@
     const unsigned int kernel_width   = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
     const unsigned int kernel_height  = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
 
-    std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+    std::tie(output_width, output_height) =
+        scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
 
     const DataLayout data_layout       = input_descriptor.layout;
     TensorDescriptor output_descriptor = input_descriptor;
     output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
     output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
-    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL),
+                                input_channels * depth_multiplier);
 
     return output_descriptor;
 }
 
 bool DepthwiseConvolutionLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -121,7 +129,7 @@
     ARM_COMPUTE_ERROR_ON(src == nullptr || weights == nullptr);
 
     TensorDescriptor output_info = compute_output_descriptor(src->desc(), weights->desc(), _info, _depth_multiplier);
-    if(!_out_quant_info.empty())
+    if (!_out_quant_info.empty())
     {
         output_info.quant_info = _out_quant_info;
     }
@@ -139,4 +147,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
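
For context on the channel dimension set in compute_output_descriptor above: a depthwise convolution emits input_channels * depth_multiplier output channels, while the spatial dimensions follow the usual convolution arithmetic. A one-line sketch of the channel rule (depthwise_out_channels is a hypothetical helper):

#include <cassert>

unsigned int depthwise_out_channels(unsigned int input_channels, unsigned int depth_multiplier)
{
    return input_channels * depth_multiplier;
}

int main()
{
    // 32 input channels with depth multiplier 2 -> 64 output channels.
    assert(depthwise_out_channels(32, 2) == 64);
    return 0;
}
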
diff --git a/src/graph/nodes/DequantizationLayerNode.cpp b/src/graph/nodes/DequantizationLayerNode.cpp
index 14c4752..3ea0008 100644
--- a/src/graph/nodes/DequantizationLayerNode.cpp
+++ b/src/graph/nodes/DequantizationLayerNode.cpp
@@ -40,7 +40,7 @@
 
 bool DequantizationLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -74,4 +74,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/DetectionOutputLayerNode.cpp b/src/graph/nodes/DetectionOutputLayerNode.cpp
index fc6f531..65ddd2f 100644
--- a/src/graph/nodes/DetectionOutputLayerNode.cpp
+++ b/src/graph/nodes/DetectionOutputLayerNode.cpp
@@ -32,8 +32,7 @@
 {
 namespace graph
 {
-DetectionOutputLayerNode::DetectionOutputLayerNode(DetectionOutputLayerInfo detection_info)
-    : _info(detection_info)
+DetectionOutputLayerNode::DetectionOutputLayerNode(DetectionOutputLayerInfo detection_info) : _info(detection_info)
 {
     _input_edges.resize(3, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -47,7 +46,8 @@
 TensorDescriptor DetectionOutputLayerNode::compute_output_descriptor(const TensorDescriptor         &input_descriptor,
                                                                      const DetectionOutputLayerInfo &info)
 {
-    const unsigned int max_size = info.keep_top_k() * ((input_descriptor.shape.num_dimensions() > 1) ? input_descriptor.shape[1] : 1);
+    const unsigned int max_size =
+        info.keep_top_k() * ((input_descriptor.shape.num_dimensions() > 1) ? input_descriptor.shape[1] : 1);
 
     TensorDescriptor output_descriptor = input_descriptor;
     output_descriptor.shape.set(0, detection_size);
@@ -58,7 +58,8 @@
 
 bool DetectionOutputLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) &&
+        (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/DetectionPostProcessLayerNode.cpp b/src/graph/nodes/DetectionPostProcessLayerNode.cpp
index 2c5005a..af3fc03 100644
--- a/src/graph/nodes/DetectionPostProcessLayerNode.cpp
+++ b/src/graph/nodes/DetectionPostProcessLayerNode.cpp
@@ -46,10 +46,11 @@
 
 bool DetectionPostProcessLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID)
-       && (output_id(2) != NullTensorID) && (output_id(3) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) &&
+        (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID) && (output_id(2) != NullTensorID) &&
+        (output_id(3) != NullTensorID))
     {
-        for(unsigned int i = 0; i < 4; ++i)
+        for (unsigned int i = 0; i < 4; ++i)
         {
             Tensor *dst = output(i);
             ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -68,7 +69,7 @@
     TensorDescriptor   output_desc;
     const unsigned int num_detected_box = _info.max_detections() * _info.max_classes_per_detection();
 
-    switch(idx)
+    switch (idx)
     {
         case 0:
             // Configure boxes output
@@ -101,4 +102,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/DummyNode.cpp b/src/graph/nodes/DummyNode.cpp
index 6fa9fba..b5f37bd 100644
--- a/src/graph/nodes/DummyNode.cpp
+++ b/src/graph/nodes/DummyNode.cpp
@@ -32,8 +32,7 @@
 {
 namespace graph
 {
-DummyNode::DummyNode(TensorShape shape)
-    : _shape(shape)
+DummyNode::DummyNode(TensorShape shape) : _shape(shape)
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -41,7 +40,7 @@
 
 bool DummyNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -75,4 +74,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/EltwiseLayerNode.cpp b/src/graph/nodes/EltwiseLayerNode.cpp
index 4426e95..3f7a08e 100644
--- a/src/graph/nodes/EltwiseLayerNode.cpp
+++ b/src/graph/nodes/EltwiseLayerNode.cpp
@@ -31,8 +31,7 @@
 {
 namespace graph
 {
-EltwiseLayerNode::EltwiseLayerNode(const descriptors::EltwiseLayerDescriptor &descriptor)
-    : descriptor(descriptor)
+EltwiseLayerNode::EltwiseLayerNode(const descriptors::EltwiseLayerDescriptor &descriptor) : descriptor(descriptor)
 {
     _input_edges.resize(2, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -70,7 +69,7 @@
 
 bool EltwiseLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -97,7 +96,7 @@
 
     output_info.set_shape(out_shape);
 
-    if(!descriptor.out_quant_info.empty())
+    if (!descriptor.out_quant_info.empty())
     {
         output_info.set_quantization_info(descriptor.out_quant_info);
     }
@@ -134,7 +133,7 @@
 
 bool UnaryEltwiseLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -153,7 +152,7 @@
 
     auto output_info = src->desc();
 
-    if(!descriptor.out_quant_info.empty())
+    if (!descriptor.out_quant_info.empty())
     {
         output_info.set_quantization_info(descriptor.out_quant_info);
     }
diff --git a/src/graph/nodes/FlattenLayerNode.cpp b/src/graph/nodes/FlattenLayerNode.cpp
index 48519a1..952df2f 100644
--- a/src/graph/nodes/FlattenLayerNode.cpp
+++ b/src/graph/nodes/FlattenLayerNode.cpp
@@ -38,7 +38,7 @@
 
 bool FlattenLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -72,4 +72,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/FullyConnectedLayer.cpp b/src/graph/nodes/FullyConnectedLayer.cpp
index 6278227..1eed69d 100644
--- a/src/graph/nodes/FullyConnectedLayer.cpp
+++ b/src/graph/nodes/FullyConnectedLayer.cpp
@@ -21,18 +21,23 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/nodes/FullyConnectedLayerNode.h"
-
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/nodes/FullyConnectedLayerNode.h"
 
 namespace arm_compute
 {
 namespace graph
 {
-FullyConnectedLayerNode::FullyConnectedLayerNode(unsigned int num_outputs, QuantizationInfo out_quant_info, FullyConnectedLayerInfo fc_info, FastMathHint fast_math_hint)
-    : _num_outputs(num_outputs), _out_quant_info(std::move(out_quant_info)), _info(fc_info), _fast_math_hint(fast_math_hint)
+FullyConnectedLayerNode::FullyConnectedLayerNode(unsigned int            num_outputs,
+                                                 QuantizationInfo        out_quant_info,
+                                                 FullyConnectedLayerInfo fc_info,
+                                                 FastMathHint            fast_math_hint)
+    : _num_outputs(num_outputs),
+      _out_quant_info(std::move(out_quant_info)),
+      _info(fc_info),
+      _fast_math_hint(fast_math_hint)
 {
     _input_edges.resize(3, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -60,11 +65,11 @@
     unsigned int num_weights    = 1;
     unsigned int num_dimensions = input_descriptor.shape.num_dimensions();
     // Ignore the batch dimension if there is one:
-    if(num_dimensions == 2 || num_dimensions == 4)
+    if (num_dimensions == 2 || num_dimensions == 4)
     {
         num_dimensions--;
     }
-    for(unsigned int i = 0; i < num_dimensions; i++)
+    for (unsigned int i = 0; i < num_dimensions; i++)
     {
         num_weights *= input_descriptor.shape[i];
     }
@@ -73,13 +78,13 @@
     weights_descriptor.shape            = TensorShape(num_weights, num_outputs);
 
     // If weights are transposed, use transposed shape
-    if(!fc_info.transpose_weights)
+    if (!fc_info.transpose_weights)
     {
         weights_descriptor.shape = TensorShape(num_outputs, num_weights);
     }
 
     // Set quantization info if present
-    if(!weights_quant_info.empty())
+    if (!weights_quant_info.empty())
     {
         weights_descriptor.quant_info = weights_quant_info;
     }
@@ -93,7 +98,7 @@
 {
     // Note: Only 1D batch space is supported at the moment
     unsigned int batches = input_descriptor.shape[1];
-    if(input_descriptor.shape.num_dimensions() > 2)
+    if (input_descriptor.shape.num_dimensions() > 2)
     {
         batches = input_descriptor.shape[3];
     }
@@ -103,7 +108,7 @@
     output_descriptor.shape            = TensorShape(num_outputs, batches);
 
     // Set quantization info if present
-    if(!out_quant_info.empty())
+    if (!out_quant_info.empty())
     {
         output_descriptor.quant_info = out_quant_info;
     }
@@ -118,7 +123,7 @@
 
 bool FullyConnectedLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -147,4 +152,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
index de995eb..9d37e84 100644
--- a/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
+++ b/src/graph/nodes/FusedConvolutionBatchNormalizationNode.cpp
@@ -32,12 +32,18 @@
 {
 namespace graph
 {
-FusedConvolutionBatchNormalizationNode::FusedConvolutionBatchNormalizationNode(float epsilon, PadStrideInfo info,
-                                                                               unsigned int      num_groups,
-                                                                               ConvolutionMethod method,
-                                                                               FastMathHint      fast_math_hint,
+FusedConvolutionBatchNormalizationNode::FusedConvolutionBatchNormalizationNode(float               epsilon,
+                                                                               PadStrideInfo       info,
+                                                                               unsigned int        num_groups,
+                                                                               ConvolutionMethod   method,
+                                                                               FastMathHint        fast_math_hint,
                                                                                ActivationLayerInfo fused_activation)
-    : _epsilon(epsilon), _info(std::move(info)), _num_groups(num_groups), _method(method), _fast_math_hint(fast_math_hint), _fused_activation(fused_activation)
+    : _epsilon(epsilon),
+      _info(std::move(info)),
+      _num_groups(num_groups),
+      _method(method),
+      _fast_math_hint(fast_math_hint),
+      _fused_activation(fused_activation)
 {
     _input_edges.resize(7, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -88,9 +94,8 @@
     _fused_activation = fused_activation;
 }
 
-TensorDescriptor FusedConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
-                                                                                   const TensorDescriptor &weights_descriptor,
-                                                                                   const PadStrideInfo    &info)
+TensorDescriptor FusedConvolutionBatchNormalizationNode::compute_output_descriptor(
+    const TensorDescriptor &input_descriptor, const TensorDescriptor &weights_descriptor, const PadStrideInfo &info)
 {
     unsigned int output_width  = 0;
     unsigned int output_height = 0;
@@ -100,20 +105,22 @@
     const unsigned int kernel_width  = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
     const unsigned int kernel_height = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
 
-    std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+    std::tie(output_width, output_height) =
+        scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
 
     const DataLayout data_layout       = input_descriptor.layout;
     TensorDescriptor output_descriptor = input_descriptor;
     output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), output_width);
     output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), output_height);
-    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), weights_descriptor.shape[3]);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL),
+                                weights_descriptor.shape[3]);
 
     return output_descriptor;
 }
 
 bool FusedConvolutionBatchNormalizationNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
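
Note: the wrapped scaled_dimensions() assignments compute the convolved output size. A worked example of the arithmetic, as a sketch: it assumes the default FLOOR rounding and the declaration in arm_compute/core/Utils.h, and the sizes are illustrative:

    #include <tuple>

    #include "arm_compute/core/Types.h"
    #include "arm_compute/core/Utils.h"

    void output_size_example()
    {
        // out = floor((in + pad_left + pad_right - kernel) / stride) + 1
        // 224x224 input, 3x3 kernel, stride 2, padding 1: (224 + 1 + 1 - 3) / 2 + 1 = 112
        unsigned int out_w = 0;
        unsigned int out_h = 0;
        std::tie(out_w, out_h) =
            arm_compute::scaled_dimensions(224, 224, 3, 3, arm_compute::PadStrideInfo(2, 2, 1, 1));
        // out_w == 112 and out_h == 112
    }
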
diff --git a/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp b/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp
index c022450..c51641d 100644
--- a/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp
+++ b/src/graph/nodes/FusedDepthwiseConvolutionBatchNormalizationNode.cpp
@@ -32,18 +32,24 @@
 {
 namespace graph
 {
-FusedDepthwiseConvolutionBatchNormalizationNode::FusedDepthwiseConvolutionBatchNormalizationNode(float                      epsilon,
-                                                                                                 PadStrideInfo              info,
-                                                                                                 unsigned int               depth_multiplier,
-                                                                                                 DepthwiseConvolutionMethod method,
-                                                                                                 ActivationLayerInfo        fused_activation)
-    : _epsilon(epsilon), _info(std::move(info)), _depth_multiplier(depth_multiplier), _method(method), _fused_activation(fused_activation)
+FusedDepthwiseConvolutionBatchNormalizationNode::FusedDepthwiseConvolutionBatchNormalizationNode(
+    float                      epsilon,
+    PadStrideInfo              info,
+    unsigned int               depth_multiplier,
+    DepthwiseConvolutionMethod method,
+    ActivationLayerInfo        fused_activation)
+    : _epsilon(epsilon),
+      _info(std::move(info)),
+      _depth_multiplier(depth_multiplier),
+      _method(method),
+      _fused_activation(fused_activation)
 {
     _input_edges.resize(7, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
 }
 
-void FusedDepthwiseConvolutionBatchNormalizationNode::set_depthwise_convolution_method(DepthwiseConvolutionMethod method)
+void FusedDepthwiseConvolutionBatchNormalizationNode::set_depthwise_convolution_method(
+    DepthwiseConvolutionMethod method)
 {
     _method = method;
 }
@@ -78,10 +84,11 @@
     _fused_activation = fused_activation;
 }
 
-TensorDescriptor FusedDepthwiseConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
-                                                                                            const TensorDescriptor &weights_descriptor,
-                                                                                            const PadStrideInfo    &info,
-                                                                                            int                     depth_multiplier)
+TensorDescriptor
+FusedDepthwiseConvolutionBatchNormalizationNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+                                                                           const TensorDescriptor &weights_descriptor,
+                                                                           const PadStrideInfo    &info,
+                                                                           int                     depth_multiplier)
 {
     unsigned int output_width  = 0;
     unsigned int output_height = 0;
@@ -92,19 +99,22 @@
     const unsigned int kernel_width   = get_dimension_size(weights_descriptor, DataLayoutDimension::WIDTH);
     const unsigned int kernel_height  = get_dimension_size(weights_descriptor, DataLayoutDimension::HEIGHT);
 
-    std::tie(output_width, output_height) = scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
+    std::tie(output_width, output_height) =
+        scaled_dimensions(input_width, input_height, kernel_width, kernel_height, info);
 
     TensorDescriptor output_descriptor = input_descriptor;
     output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::WIDTH), output_width);
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::HEIGHT), output_height);
-    output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::CHANNEL), input_channels * depth_multiplier);
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::HEIGHT),
+                                output_height);
+    output_descriptor.shape.set(get_dimension_idx(output_descriptor.layout, DataLayoutDimension::CHANNEL),
+                                input_channels * depth_multiplier);
 
     return output_descriptor;
 }
 
 bool FusedDepthwiseConvolutionBatchNormalizationNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/GenerateProposalsLayerNode.cpp b/src/graph/nodes/GenerateProposalsLayerNode.cpp
index 9f36862..1671a47 100644
--- a/src/graph/nodes/GenerateProposalsLayerNode.cpp
+++ b/src/graph/nodes/GenerateProposalsLayerNode.cpp
@@ -23,17 +23,15 @@
  */
 #include "arm_compute/graph/nodes/GenerateProposalsLayerNode.h"
 
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/INodeVisitor.h"
 
-#include "arm_compute/core/Helpers.h"
-
 namespace arm_compute
 {
 namespace graph
 {
-GenerateProposalsLayerNode::GenerateProposalsLayerNode(GenerateProposalsInfo &info)
-    : _info(info)
+GenerateProposalsLayerNode::GenerateProposalsLayerNode(GenerateProposalsInfo &info) : _info(info)
 {
     _input_edges.resize(3, EmptyEdgeID);
     _outputs.resize(3, NullTensorID);
@@ -46,10 +44,10 @@
 
 bool GenerateProposalsLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) && (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID)
-       && (output_id(2) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (input_id(2) != NullTensorID) &&
+        (output_id(0) != NullTensorID) && (output_id(1) != NullTensorID) && (output_id(2) != NullTensorID))
     {
-        for(unsigned int i = 0; i < 3; ++i)
+        for (unsigned int i = 0; i < 3; ++i)
         {
             Tensor *dst = output(i);
             ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -68,7 +66,7 @@
     ARM_COMPUTE_ERROR_ON(src == nullptr);
     TensorDescriptor output_desc = src->desc();
 
-    switch(idx)
+    switch (idx)
     {
         case 0:
             // Configure proposals output
diff --git a/src/graph/nodes/InputNode.cpp b/src/graph/nodes/InputNode.cpp
index 072281f..7408bc2 100644
--- a/src/graph/nodes/InputNode.cpp
+++ b/src/graph/nodes/InputNode.cpp
@@ -30,15 +30,14 @@
 {
 namespace graph
 {
-InputNode::InputNode(TensorDescriptor desc)
-    : _desc(std::move(desc))
+InputNode::InputNode(TensorDescriptor desc) : _desc(std::move(desc))
 {
     _outputs.resize(1, NullTensorID);
 }
 
 bool InputNode::forward_descriptors()
 {
-    if(output_id(0) != NullTensorID)
+    if (output_id(0) != NullTensorID)
     {
         Tensor *t = output(0);
         ARM_COMPUTE_ERROR_ON(t == nullptr);
diff --git a/src/graph/nodes/L2NormalizeLayerNode.cpp b/src/graph/nodes/L2NormalizeLayerNode.cpp
index 0c35a33..1a57cf0 100644
--- a/src/graph/nodes/L2NormalizeLayerNode.cpp
+++ b/src/graph/nodes/L2NormalizeLayerNode.cpp
@@ -30,18 +30,15 @@
 {
 namespace graph
 {
-L2NormalizeLayerNode::L2NormalizeLayerNode()
-    : L2NormalizeLayerNode(0, 1e-12f)
+L2NormalizeLayerNode::L2NormalizeLayerNode() : L2NormalizeLayerNode(0, 1e-12f)
 {
 }
 
-L2NormalizeLayerNode::L2NormalizeLayerNode(int axis)
-    : L2NormalizeLayerNode(axis, 1e-12f)
+L2NormalizeLayerNode::L2NormalizeLayerNode(int axis) : L2NormalizeLayerNode(axis, 1e-12f)
 {
 }
 
-L2NormalizeLayerNode::L2NormalizeLayerNode(int axis, float epsilon)
-    : _axis(axis), _epsilon(epsilon)
+L2NormalizeLayerNode::L2NormalizeLayerNode(int axis, float epsilon) : _axis(axis), _epsilon(epsilon)
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -49,7 +46,7 @@
 
 bool L2NormalizeLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -92,4 +89,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/NormalizationLayerNode.cpp b/src/graph/nodes/NormalizationLayerNode.cpp
index eaa1bcf..b18bb7d 100644
--- a/src/graph/nodes/NormalizationLayerNode.cpp
+++ b/src/graph/nodes/NormalizationLayerNode.cpp
@@ -31,8 +31,7 @@
 {
 namespace graph
 {
-NormalizationLayerNode::NormalizationLayerNode(NormalizationLayerInfo norm_info)
-    : _info(norm_info)
+NormalizationLayerNode::NormalizationLayerNode(NormalizationLayerInfo norm_info) : _info(norm_info)
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -45,7 +44,7 @@
 
 bool NormalizationLayerNode::forward_descriptors()
 {
-    if(input_id(0) != NullTensorID && (output_id(0) != NullTensorID))
+    if (input_id(0) != NullTensorID && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -76,4 +75,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp
index 113d0a5..cac9660 100644
--- a/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp
+++ b/src/graph/nodes/NormalizePlanarYUVLayerNode.cpp
@@ -39,7 +39,7 @@
 
 bool NormalizePlanarYUVLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/PReluLayerNode.cpp b/src/graph/nodes/PReluLayerNode.cpp
index 378c18e..2b50fe9 100644
--- a/src/graph/nodes/PReluLayerNode.cpp
+++ b/src/graph/nodes/PReluLayerNode.cpp
@@ -38,7 +38,7 @@
 
 bool PReluLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/PadLayerNode.cpp b/src/graph/nodes/PadLayerNode.cpp
index 6424370..336e7de 100644
--- a/src/graph/nodes/PadLayerNode.cpp
+++ b/src/graph/nodes/PadLayerNode.cpp
@@ -23,17 +23,15 @@
  */
 #include "arm_compute/graph/nodes/PadLayerNode.h"
 
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/INodeVisitor.h"
 
-#include "arm_compute/core/Helpers.h"
-
 namespace arm_compute
 {
 namespace graph
 {
-PadLayerNode::PadLayerNode(const PaddingList &padding, PixelValue pad_value)
-    : _padding(padding), _pad_value(pad_value)
+PadLayerNode::PadLayerNode(const PaddingList &padding, PixelValue pad_value) : _padding(padding), _pad_value(pad_value)
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -51,7 +49,7 @@
 
 bool PadLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -71,7 +69,7 @@
 
     TensorDescriptor  output_desc = src->desc();
     const TensorShape input_shape = src->desc().shape;
-    for(size_t dim = 0; dim < _padding.size(); ++dim)
+    for (size_t dim = 0; dim < _padding.size(); ++dim)
     {
         output_desc.shape.set(dim, _padding[dim].first + input_shape[dim] + _padding[dim].second);
     }
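
Note: PadLayerNode grows each padded dimension by pad.first + pad.second and leaves the remaining dimensions alone. A worked sketch with illustrative values, assuming the PaddingList and TensorShape types from arm_compute/core/Types.h:

    #include "arm_compute/core/TensorShape.h"
    #include "arm_compute/core/Types.h"

    void pad_shape_example()
    {
        const arm_compute::PaddingList padding{{1, 1}, {2, 2}};
        arm_compute::TensorShape       shape(224U, 224U, 3U);
        for (size_t dim = 0; dim < padding.size(); ++dim)
        {
            shape.set(dim, padding[dim].first + shape[dim] + padding[dim].second);
        }
        // dim 0: 1 + 224 + 1 = 226, dim 1: 2 + 224 + 2 = 228, dim 2 untouched,
        // so shape is now (226, 228, 3).
    }
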
diff --git a/src/graph/nodes/PermuteLayerNode.cpp b/src/graph/nodes/PermuteLayerNode.cpp
index b311ee1..db53722 100644
--- a/src/graph/nodes/PermuteLayerNode.cpp
+++ b/src/graph/nodes/PermuteLayerNode.cpp
@@ -23,17 +23,15 @@
  */
 #include "arm_compute/graph/nodes/PermuteLayerNode.h"
 
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/INodeVisitor.h"
 
-#include "arm_compute/core/Helpers.h"
-
 namespace arm_compute
 {
 namespace graph
 {
-PermuteLayerNode::PermuteLayerNode(PermutationVector perm, DataLayout layout)
-    : _perm(perm), _layout(layout)
+PermuteLayerNode::PermuteLayerNode(PermutationVector perm, DataLayout layout) : _perm(perm), _layout(layout)
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -46,7 +44,7 @@
 
 bool PermuteLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -66,7 +64,7 @@
 
     TensorDescriptor output_desc = src->desc();
     permute(output_desc.shape, _perm);
-    if(_layout != DataLayout::UNKNOWN)
+    if (_layout != DataLayout::UNKNOWN)
     {
         output_desc.layout = _layout;
     }
@@ -84,4 +82,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/PoolingLayerNode.cpp b/src/graph/nodes/PoolingLayerNode.cpp
index 4ecf924..ac954ac 100644
--- a/src/graph/nodes/PoolingLayerNode.cpp
+++ b/src/graph/nodes/PoolingLayerNode.cpp
@@ -32,8 +32,7 @@
 {
 namespace graph
 {
-PoolingLayerNode::PoolingLayerNode(PoolingLayerInfo pool_info)
-    : _info(std::move(pool_info))
+PoolingLayerNode::PoolingLayerNode(PoolingLayerInfo pool_info) : _info(std::move(pool_info))
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -55,7 +54,8 @@
     const unsigned int pool_size_x  = info.is_global_pooling ? input_width : info.pool_size.width;
     const unsigned int pool_size_y  = info.is_global_pooling ? input_height : info.pool_size.height;
 
-    std::tie(pooled_width, pooled_height) = scaled_dimensions(input_width, input_height, pool_size_x, pool_size_y, info.pad_stride_info);
+    std::tie(pooled_width, pooled_height) =
+        scaled_dimensions(input_width, input_height, pool_size_x, pool_size_y, info.pad_stride_info);
 
     const DataLayout data_layout       = input_descriptor.layout;
     TensorDescriptor output_descriptor = input_descriptor;
@@ -67,7 +67,7 @@
 
 bool PoolingLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -98,4 +98,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/PrintLayerNode.cpp b/src/graph/nodes/PrintLayerNode.cpp
index da408d8..82a3400 100644
--- a/src/graph/nodes/PrintLayerNode.cpp
+++ b/src/graph/nodes/PrintLayerNode.cpp
@@ -32,7 +32,9 @@
 {
 namespace graph
 {
-PrintLayerNode::PrintLayerNode(std::ostream &stream, const IOFormatInfo &format_info, const std::function<ITensor *(ITensor *)> transform)
+PrintLayerNode::PrintLayerNode(std::ostream                             &stream,
+                               const IOFormatInfo                       &format_info,
+                               const std::function<ITensor *(ITensor *)> transform)
     : _stream(stream), _format_info(format_info), _transform(transform)
 {
     _input_edges.resize(1, EmptyEdgeID);
@@ -56,7 +58,7 @@
 
 bool PrintLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -88,4 +90,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/PriorBoxLayerNode.cpp b/src/graph/nodes/PriorBoxLayerNode.cpp
index f017ead..5ffb173 100644
--- a/src/graph/nodes/PriorBoxLayerNode.cpp
+++ b/src/graph/nodes/PriorBoxLayerNode.cpp
@@ -32,8 +32,7 @@
 {
 namespace graph
 {
-PriorBoxLayerNode::PriorBoxLayerNode(PriorBoxLayerInfo prior_info)
-    : _info(std::move(prior_info))
+PriorBoxLayerNode::PriorBoxLayerNode(PriorBoxLayerInfo prior_info) : _info(std::move(prior_info))
 {
     _input_edges.resize(2, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -44,7 +43,7 @@
     return _info;
 }
 
-TensorDescriptor PriorBoxLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
+TensorDescriptor PriorBoxLayerNode::compute_output_descriptor(const TensorDescriptor  &input_descriptor,
                                                               const PriorBoxLayerInfo &info)
 {
     const unsigned int layer_width  = get_dimension_size(input_descriptor, DataLayoutDimension::WIDTH);
@@ -61,7 +60,7 @@
 
 bool PriorBoxLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/QuantizationLayerNode.cpp b/src/graph/nodes/QuantizationLayerNode.cpp
index 4906808..0dd2da9 100644
--- a/src/graph/nodes/QuantizationLayerNode.cpp
+++ b/src/graph/nodes/QuantizationLayerNode.cpp
@@ -47,7 +47,7 @@
 
 bool QuantizationLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/ROIAlignLayerNode.cpp b/src/graph/nodes/ROIAlignLayerNode.cpp
index 6289181..5909335 100644
--- a/src/graph/nodes/ROIAlignLayerNode.cpp
+++ b/src/graph/nodes/ROIAlignLayerNode.cpp
@@ -24,17 +24,15 @@
 
 #include "arm_compute/graph/nodes/ROIAlignLayerNode.h"
 
+#include "arm_compute/core/Helpers.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/INodeVisitor.h"
 
-#include "arm_compute/core/Helpers.h"
-
 namespace arm_compute
 {
 namespace graph
 {
-ROIAlignLayerNode::ROIAlignLayerNode(ROIPoolingLayerInfo &pool_info)
-    : _pool_info(pool_info)
+ROIAlignLayerNode::ROIAlignLayerNode(ROIPoolingLayerInfo &pool_info) : _pool_info(pool_info)
 {
     _input_edges.resize(2, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -47,7 +45,7 @@
 
 bool ROIAlignLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (input_id(1) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -92,4 +90,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/ReductionLayerNode.cpp b/src/graph/nodes/ReductionLayerNode.cpp
index 0e93039..965c1ba 100644
--- a/src/graph/nodes/ReductionLayerNode.cpp
+++ b/src/graph/nodes/ReductionLayerNode.cpp
@@ -56,7 +56,7 @@
 
 bool ReductionLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -74,8 +74,9 @@
     const Tensor *src = input(0);
     ARM_COMPUTE_ERROR_ON(src == nullptr);
 
-    TensorDescriptor output_info  = src->desc();
-    TensorShape      output_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, _keep_dims);
+    TensorDescriptor output_info = src->desc();
+    TensorShape      output_shape =
+        arm_compute::misc::shape_calculator::compute_reduced_shape(output_info.shape, _axis, _keep_dims);
     output_info.set_shape(output_shape);
 
     return output_info;
@@ -91,4 +92,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/ReorgLayerNode.cpp b/src/graph/nodes/ReorgLayerNode.cpp
index e693e4b..251a4ea 100644
--- a/src/graph/nodes/ReorgLayerNode.cpp
+++ b/src/graph/nodes/ReorgLayerNode.cpp
@@ -31,8 +31,7 @@
 {
 namespace graph
 {
-ReorgLayerNode::ReorgLayerNode(int stride)
-    : _stride(stride)
+ReorgLayerNode::ReorgLayerNode(int stride) : _stride(stride)
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -51,20 +50,22 @@
 
     ARM_COMPUTE_ERROR_ON(stride <= 0);
     ARM_COMPUTE_ERROR_ON_MSG((input_width % stride != 0), "The width of the input tensor must be a multiple of stride");
-    ARM_COMPUTE_ERROR_ON_MSG((input_height % stride != 0), "The height of the input tensor must be a multiple of stride");
+    ARM_COMPUTE_ERROR_ON_MSG((input_height % stride != 0),
+                             "The height of the input tensor must be a multiple of stride");
 
     const DataLayout data_layout       = input_descriptor.layout;
     TensorDescriptor output_descriptor = input_descriptor;
     output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::WIDTH), input_width / stride);
     output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::HEIGHT), input_height / stride);
-    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL), input_channel * stride * stride);
+    output_descriptor.shape.set(get_dimension_idx(data_layout, DataLayoutDimension::CHANNEL),
+                                input_channel * stride * stride);
 
     return output_descriptor;
 }
 
 bool ReorgLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -95,4 +96,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/ReshapeLayer.cpp b/src/graph/nodes/ReshapeLayer.cpp
index a6354d0..ce6bf9b 100644
--- a/src/graph/nodes/ReshapeLayer.cpp
+++ b/src/graph/nodes/ReshapeLayer.cpp
@@ -21,17 +21,15 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/graph/nodes/ReshapeLayerNode.h"
-
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/INodeVisitor.h"
+#include "arm_compute/graph/nodes/ReshapeLayerNode.h"
 
 namespace arm_compute
 {
 namespace graph
 {
-ReshapeLayerNode::ReshapeLayerNode(TensorShape shape)
-    : _shape(shape)
+ReshapeLayerNode::ReshapeLayerNode(TensorShape shape) : _shape(shape)
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -39,7 +37,7 @@
 
 bool ReshapeLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -73,4 +71,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/ResizeLayerNode.cpp b/src/graph/nodes/ResizeLayerNode.cpp
index 2a94bf6..292b2c6 100644
--- a/src/graph/nodes/ResizeLayerNode.cpp
+++ b/src/graph/nodes/ResizeLayerNode.cpp
@@ -50,7 +50,7 @@
 
 bool ResizeLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -88,4 +88,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/SliceLayerNode.cpp b/src/graph/nodes/SliceLayerNode.cpp
index b7655b9..eb877d9 100644
--- a/src/graph/nodes/SliceLayerNode.cpp
+++ b/src/graph/nodes/SliceLayerNode.cpp
@@ -32,8 +32,7 @@
 {
 namespace graph
 {
-SliceLayerNode::SliceLayerNode(const Coordinates &starts, const Coordinates &ends)
-    : _starts(starts), _ends(ends)
+SliceLayerNode::SliceLayerNode(const Coordinates &starts, const Coordinates &ends) : _starts(starts), _ends(ends)
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -50,19 +49,20 @@
 }
 
 TensorDescriptor SliceLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
-                                                           const Coordinates &starts, const Coordinates &ends)
+                                                           const Coordinates      &starts,
+                                                           const Coordinates      &ends)
 {
     using namespace arm_compute::helpers::tensor_transform;
 
     TensorDescriptor output_desc = input_descriptor;
-    output_desc.shape            = arm_compute::misc::shape_calculator::compute_slice_shape(input_descriptor.shape, starts, ends);
+    output_desc.shape = arm_compute::misc::shape_calculator::compute_slice_shape(input_descriptor.shape, starts, ends);
 
     return output_desc;
 }
 
 bool SliceLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/nodes/SoftmaxLayerNode.cpp b/src/graph/nodes/SoftmaxLayerNode.cpp
index 0311669..4beac81 100644
--- a/src/graph/nodes/SoftmaxLayerNode.cpp
+++ b/src/graph/nodes/SoftmaxLayerNode.cpp
@@ -31,8 +31,7 @@
 {
 namespace graph
 {
-SoftmaxLayerNode::SoftmaxLayerNode(float beta)
-    : _beta(beta)
+SoftmaxLayerNode::SoftmaxLayerNode(float beta) : _beta(beta)
 {
     _input_edges.resize(1, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -45,7 +44,7 @@
 
 bool SoftmaxLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -79,4 +78,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/graph/nodes/SplitLayerNode.cpp b/src/graph/nodes/SplitLayerNode.cpp
index 31931c3..dfb6624 100644
--- a/src/graph/nodes/SplitLayerNode.cpp
+++ b/src/graph/nodes/SplitLayerNode.cpp
@@ -49,8 +49,8 @@
     return _axis;
 }
 
-std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descriptor(const TensorDescriptor &input_descriptor,
-                                                                                   unsigned int num_splits, int axis, unsigned int idx)
+std::pair<TensorDescriptor, Coordinates> SplitLayerNode::compute_output_descriptor(
+    const TensorDescriptor &input_descriptor, unsigned int num_splits, int axis, unsigned int idx)
 {
     // Handle negative axis; a negative index specifies the axis from the end (e.g. -1 for the last axis).
     int              num_dimension = static_cast<int32_t>(input_descriptor.shape.num_dimensions());
@@ -58,7 +58,7 @@
     Coordinates      coords;
     TensorDescriptor output_descriptor = input_descriptor;
     int              split_size        = input_descriptor.shape[tmp_axis] / num_splits;
-    if(_size_splits.empty())
+    if (_size_splits.empty())
     {
         output_descriptor.shape.set(tmp_axis, split_size);
         coords.set(tmp_axis, idx * split_size);
@@ -66,15 +66,15 @@
     else
     {
         int split_size = _size_splits[idx];
-        if(split_size == -1)
+        if (split_size == -1)
         {
             split_size = input_descriptor.shape[tmp_axis];
-            for(unsigned int i = 0; i < _size_splits.size() - 1; ++i)
+            for (unsigned int i = 0; i < _size_splits.size() - 1; ++i)
                 split_size -= _size_splits[i];
         }
         output_descriptor.shape.set(tmp_axis, split_size);
         int coord_value = 0;
-        for(unsigned int i = 0; i < idx; ++i)
+        for (unsigned int i = 0; i < idx; ++i)
             coord_value += _size_splits[i];
         coords.set(tmp_axis, coord_value);
     }
@@ -84,12 +84,12 @@
 
 bool SplitLayerNode::forward_descriptors()
 {
-    if(input_id(0) != NullTensorID)
+    if (input_id(0) != NullTensorID)
     {
         validate();
-        for(unsigned int i = 0; i < _outputs.size(); ++i)
+        for (unsigned int i = 0; i < _outputs.size(); ++i)
         {
-            if(output_id(i) != NullTensorID)
+            if (output_id(i) != NullTensorID)
             {
                 Tensor *dst_i = output(i);
                 ARM_COMPUTE_ERROR_ON(dst_i == nullptr);
@@ -117,10 +117,10 @@
     int tmp_axis      = wrap_around(_axis, num_dimension);
 
     int split_size = (_size_splits.empty()) ? (input_descriptor.shape[tmp_axis] / _num_splits) : _size_splits[idx];
-    if(split_size == -1)
+    if (split_size == -1)
     {
         split_size = input_descriptor.shape[tmp_axis];
-        for(unsigned int i = 0; i < _size_splits.size() - 1; ++i)
+        for (unsigned int i = 0; i < _size_splits.size() - 1; ++i)
             split_size -= _size_splits[i];
     }
     output_descriptor.shape.set(tmp_axis, split_size);
@@ -138,7 +138,7 @@
     // Handle negative axis; a negative index specifies the axis from the end (e.g. -1 for the last axis).
     int tmp_axis = wrap_around(_axis, num_dimension);
 
-    if(_size_splits.empty())
+    if (_size_splits.empty())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->desc().shape[tmp_axis] % _num_splits, "Split should be exact");
     }
@@ -156,4 +156,4 @@
     v.visit(*this);
 }
 } // namespace graph
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
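
Note: SplitLayerNode resolves a negative axis through wrap_around() and allows one size_splits entry to be -1, meaning "whatever remains of the axis". A sketch of both rules, with hypothetical stand-ins for the library helpers:

    #include <cstddef>
    #include <vector>

    // Stand-in for wrap_around(): maps a negative axis onto [0, num_dims).
    int wrap_axis(int axis, int num_dims)
    {
        return (axis % num_dims + num_dims) % num_dims; // wrap_axis(-1, 2) == 1
    }

    // Resolves a trailing -1 entry: {16, 16, -1} on an axis of length 64 gives 32.
    int resolve_last_split(const std::vector<int> &size_splits, int axis_len)
    {
        int remaining = axis_len;
        for (std::size_t i = 0; i < size_splits.size() - 1; ++i)
        {
            remaining -= size_splits[i];
        }
        return remaining;
    }

So a (8, 64) tensor split along axis -1 with size_splits {16, 16, -1} yields outputs of shape (8, 16), (8, 16) and (8, 32).
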
diff --git a/src/graph/nodes/StackLayerNode.cpp b/src/graph/nodes/StackLayerNode.cpp
index f292b33..031d8fc 100644
--- a/src/graph/nodes/StackLayerNode.cpp
+++ b/src/graph/nodes/StackLayerNode.cpp
@@ -25,18 +25,16 @@
 
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/graph/Graph.h"
 #include "arm_compute/graph/INodeVisitor.h"
 #include "arm_compute/graph/Utils.h"
 
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-
 namespace arm_compute
 {
 namespace graph
 {
-StackLayerNode::StackLayerNode(unsigned int total_nodes, int axis)
-    : _total_nodes(total_nodes), _axis(axis)
+StackLayerNode::StackLayerNode(unsigned int total_nodes, int axis) : _total_nodes(total_nodes), _axis(axis)
 {
     _input_edges.resize(_total_nodes, EmptyEdgeID);
     _outputs.resize(1, NullTensorID);
@@ -64,7 +62,7 @@
 
 bool StackLayerNode::forward_descriptors()
 {
-    if(_outputs[0] != NullTensorID)
+    if (_outputs[0] != NullTensorID)
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
@@ -80,17 +78,15 @@
     ARM_COMPUTE_ERROR_ON(idx >= _outputs.size());
 
     // Check if all input tensors are set
-    bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges), [](const EdgeID & eid)
-    {
-        return eid != EmptyEdgeID;
-    });
+    bool are_all_inputs_set = std::all_of(std::begin(_input_edges), std::end(_input_edges),
+                                          [](const EdgeID &eid) { return eid != EmptyEdgeID; });
 
     TensorDescriptor output_info = {};
 
-    if(are_all_inputs_set)
+    if (are_all_inputs_set)
     {
         std::vector<TensorDescriptor> inputs_descriptors;
-        for(unsigned int i = 0; i < _input_edges.size(); ++i)
+        for (unsigned int i = 0; i < _input_edges.size(); ++i)
         {
             const Tensor *t = _graph->tensor(input_id(i));
             ARM_COMPUTE_ERROR_ON(t == nullptr);
diff --git a/src/graph/nodes/StridedSliceLayerNode.cpp b/src/graph/nodes/StridedSliceLayerNode.cpp
index 6a1a724..fc9f722 100644
--- a/src/graph/nodes/StridedSliceLayerNode.cpp
+++ b/src/graph/nodes/StridedSliceLayerNode.cpp
@@ -79,7 +79,7 @@
 
 bool StridedSliceLayerNode::forward_descriptors()
 {
-    if((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
+    if ((input_id(0) != NullTensorID) && (output_id(0) != NullTensorID))
     {
         Tensor *dst = output(0);
         ARM_COMPUTE_ERROR_ON(dst == nullptr);
diff --git a/src/graph/printers/DotGraphPrinter.cpp b/src/graph/printers/DotGraphPrinter.cpp
index 9c7c424..5587ed2 100644
--- a/src/graph/printers/DotGraphPrinter.cpp
+++ b/src/graph/printers/DotGraphPrinter.cpp
@@ -25,9 +25,9 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/graph/Graph.h"
+#include "arm_compute/graph/nodes/Nodes.h"
 #include "arm_compute/graph/Tensor.h"
 #include "arm_compute/graph/TypePrinter.h"
-#include "arm_compute/graph/nodes/Nodes.h"
 
 namespace arm_compute
 {
@@ -152,9 +152,9 @@
 
 void DotGraphPrinter::print_nodes(const Graph &g, std::ostream &os)
 {
-    for(const auto &n : g.nodes())
+    for (const auto &n : g.nodes())
     {
-        if(n)
+        if (n)
         {
             // Output node id
             std::string node_id = std::string("n") + support::cpp11::to_string(n->id());
@@ -166,7 +166,8 @@
             std::string name             = n->name().empty() ? node_id : n->name();
             auto        node_description = _dot_node_visitor.info();
 
-            os << R"([label = ")" << name << R"( \n )" << n->assigned_target() << R"( \n )" << node_description << R"("])";
+            os << R"([label = ")" << name << R"( \n )" << n->assigned_target() << R"( \n )" << node_description
+               << R"("])";
             os << ";\n";
         }
     }
@@ -174,16 +175,17 @@
 
 void DotGraphPrinter::print_edges(const Graph &g, std::ostream &os)
 {
-    for(const auto &e : g.edges())
+    for (const auto &e : g.edges())
     {
-        if(e)
+        if (e)
         {
             std::string source_node_id = std::string("n") + support::cpp11::to_string(e->producer_id());
             std::string sink_node_id   = std::string("n") + support::cpp11::to_string(e->consumer_id());
             os << source_node_id << " -> " << sink_node_id << " ";
             const Tensor *t = e->tensor();
             ARM_COMPUTE_ERROR_ON(t == nullptr);
-            os << R"([label = ")" << t->desc().shape << R"( \n )" << t->desc().data_type << R"( \n )" << t->desc().layout << R"("])";
+            os << R"([label = ")" << t->desc().shape << R"( \n )" << t->desc().data_type << R"( \n )"
+               << t->desc().layout << R"("])";
             os << ";\n";
         }
     }
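
Note: the raw-string pieces in this printer concatenate into plain Graphviz DOT statements. A sketch of the emitted text for one node and one edge; the identifiers, target and tensor descriptor values are illustrative:

    #include <iostream>

    int main()
    {
        std::cout << R"(n1 [label = "conv1 \n Neon \n ConvolutionLayer"];)" << "\n";
        std::cout << R"(n1 -> n2 [label = "224x224x64 \n F32 \n NHWC"];)" << "\n";
        return 0;
    }
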
diff --git a/src/runtime/Allocator.cpp b/src/runtime/Allocator.cpp
index ef7c62d..eca712d 100644
--- a/src/runtime/Allocator.cpp
+++ b/src/runtime/Allocator.cpp
@@ -22,9 +22,9 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/Allocator.h"
-#include "arm_compute/runtime/MemoryRegion.h"
 
 #include "arm_compute/core/Error.h"
+#include "arm_compute/runtime/MemoryRegion.h"
 
 #include <cstddef>
 
diff --git a/src/runtime/BlobLifetimeManager.cpp b/src/runtime/BlobLifetimeManager.cpp
index bea55d8..8a0fc05 100644
--- a/src/runtime/BlobLifetimeManager.cpp
+++ b/src/runtime/BlobLifetimeManager.cpp
@@ -35,8 +35,7 @@
 
 namespace arm_compute
 {
-BlobLifetimeManager::BlobLifetimeManager()
-    : _blobs()
+BlobLifetimeManager::BlobLifetimeManager() : _blobs()
 {
 }
 
@@ -62,33 +61,32 @@
     ARM_COMPUTE_ERROR_ON(_active_group == nullptr);
 
     // Sort free blob requirements in descending order.
-    _free_blobs.sort([](const Blob & ba, const Blob & bb)
-    {
-        return ba.max_size > bb.max_size;
-    });
+    _free_blobs.sort([](const Blob &ba, const Blob &bb) { return ba.max_size > bb.max_size; });
 
     // Create group sizes vector
     std::vector<BlobInfo> group_sizes;
-    std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes), [](const Blob & b)
-    {
-        return BlobInfo{ b.max_size, b.max_alignment, b.bound_elements.size() };
-    });
+    std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(group_sizes),
+                   [](const Blob &b) {
+                       return BlobInfo{b.max_size, b.max_alignment, b.bound_elements.size()};
+                   });
 
     // Update blob sizes
     size_t max_size = std::max(_blobs.size(), group_sizes.size());
     _blobs.resize(max_size);
     group_sizes.resize(max_size);
-    std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs), [](BlobInfo lhs, BlobInfo rhs)
-    {
-        return BlobInfo{ std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment), std::max(lhs.owners, rhs.owners) };
-    });
+    std::transform(std::begin(_blobs), std::end(_blobs), std::begin(group_sizes), std::begin(_blobs),
+                   [](BlobInfo lhs, BlobInfo rhs)
+                   {
+                       return BlobInfo{std::max(lhs.size, rhs.size), std::max(lhs.alignment, rhs.alignment),
+                                       std::max(lhs.owners, rhs.owners)};
+                   });
 
     // Calculate group mappings
     auto &group_mappings = _active_group->mappings();
     int   blob_idx       = 0;
-    for(auto &free_blob : _free_blobs)
+    for (auto &free_blob : _free_blobs)
     {
-        for(auto &bound_element_id : free_blob.bound_elements)
+        for (auto &bound_element_id : free_blob.bound_elements)
         {
             ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
             Element &bound_element               = _active_elements[bound_element_id];
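
Note: the lambda rewrites above follow one rule: a lambda that fits within the column limit stays on a single line, otherwise its body drops into its own indented brace block. A minimal sketch with hypothetical data:

    #include <algorithm>
    #include <vector>

    void lambda_style_example(std::vector<int> &v)
    {
        // Short lambda: kept inline on one line.
        std::sort(v.begin(), v.end(), [](int a, int b) { return a > b; });

        // Longer lambda: the body moves to its own block under the parameter list.
        std::transform(v.begin(), v.end(), v.begin(),
                       [](int x)
                       {
                           int doubled = x * 2;
                           return doubled + 1;
                       });
    }
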
diff --git a/src/runtime/BlobMemoryPool.cpp b/src/runtime/BlobMemoryPool.cpp
index 88e2805..a2f63ef 100644
--- a/src/runtime/BlobMemoryPool.cpp
+++ b/src/runtime/BlobMemoryPool.cpp
@@ -47,7 +47,7 @@
 void BlobMemoryPool::acquire(MemoryMappings &handles)
 {
     // Set memory to handles
-    for(auto &handle : handles)
+    for (auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
         handle.first->set_region(_blobs[handle.second].get());
@@ -56,7 +56,7 @@
 
 void BlobMemoryPool::release(MemoryMappings &handles)
 {
-    for(auto &handle : handles)
+    for (auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
         handle.first->set_region(nullptr);
@@ -78,7 +78,7 @@
 {
     ARM_COMPUTE_ERROR_ON(!_allocator);
 
-    for(const auto &bi : blob_info)
+    for (const auto &bi : blob_info)
     {
         _blobs.push_back(_allocator->make_region(bi.size, bi.alignment));
     }
diff --git a/src/runtime/CL/CLBufferAllocator.cpp b/src/runtime/CL/CLBufferAllocator.cpp
index e06ef3d..b4545b9 100644
--- a/src/runtime/CL/CLBufferAllocator.cpp
+++ b/src/runtime/CL/CLBufferAllocator.cpp
@@ -35,7 +35,8 @@
 void *CLBufferAllocator::allocate(size_t size, size_t alignment)
 {
     ARM_COMPUTE_UNUSED(alignment);
-    cl_mem buf{ clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size, nullptr, nullptr) };
+    cl_mem buf{clCreateBuffer(CLScheduler::get().context().get(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size,
+                              nullptr, nullptr)};
     return static_cast<void *>(buf);
 }
 
diff --git a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
index 7168259..d680dc0 100644
--- a/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
+++ b/src/runtime/CL/CLGEMMHeuristicsHandle.cpp
@@ -27,8 +27,7 @@
 
 namespace arm_compute
 {
-CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle()
-    : _heuristics(std::make_unique<mlgo::MLGOHeuristics>())
+CLGEMMHeuristicsHandle::CLGEMMHeuristicsHandle() : _heuristics(std::make_unique<mlgo::MLGOHeuristics>())
 {
 }
 CLGEMMHeuristicsHandle::~CLGEMMHeuristicsHandle() = default;
diff --git a/src/runtime/CL/CLHelpers.cpp b/src/runtime/CL/CLHelpers.cpp
index 5b4bbbc..eb28ecb 100644
--- a/src/runtime/CL/CLHelpers.cpp
+++ b/src/runtime/CL/CLHelpers.cpp
@@ -50,34 +50,30 @@
  * @return A pointer to the context properties which can be used to create an OpenCL context
  */
 
-void initialise_context_properties(const cl::Platform &platform, const cl::Device &device, std::array<cl_context_properties, 7> &prop)
+void initialise_context_properties(const cl::Platform                   &platform,
+                                   const cl::Device                     &device,
+                                   std::array<cl_context_properties, 7> &prop)
 {
     ARM_COMPUTE_UNUSED(device);
 #if defined(ARM_COMPUTE_ASSERTS_ENABLED)
     // Query devices in the context for cl_arm_printf support
-    if(arm_compute::device_supports_extension(device, "cl_arm_printf"))
+    if (arm_compute::device_supports_extension(device, "cl_arm_printf"))
     {
         // Create a cl_context with a printf_callback and user specified buffer size.
-        std::array<cl_context_properties, 7> properties_printf =
-        {
+        std::array<cl_context_properties, 7> properties_printf = {
             CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
             // Enable a printf callback function for this context.
             CL_PRINTF_CALLBACK_ARM, reinterpret_cast<cl_context_properties>(printf_callback),
             // Request a minimum printf buffer size of 4MB for devices in the
             // context that support this extension.
-            CL_PRINTF_BUFFERSIZE_ARM, 0x1000,
-            0
-        };
+            CL_PRINTF_BUFFERSIZE_ARM, 0x1000, 0};
         prop = properties_printf;
     }
     else
 #endif // defined(ARM_COMPUTE_ASSERTS_ENABLED)
     {
-        std::array<cl_context_properties, 3> properties =
-        {
-            CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform()),
-            0
-        };
+        std::array<cl_context_properties, 3> properties = {CL_CONTEXT_PLATFORM,
+                                                           reinterpret_cast<cl_context_properties>(platform()), 0};
         std::copy(properties.begin(), properties.end(), prop.begin());
     };
 }
@@ -91,19 +87,19 @@
     cl::Platform::get(&platforms);
     ARM_COMPUTE_ERROR_ON_MSG(platforms.size() == 0, "Couldn't find any OpenCL platform");
 
-    cl::Platform selected_platform{ nullptr };
+    cl::Platform selected_platform{nullptr};
 
     // If the user has selected the Native platform, return the first available.
-    switch(cl_backend_type)
+    switch (cl_backend_type)
     {
         case CLBackendType::Native:
             selected_platform = platforms[0];
             break;
         case CLBackendType::Clvk:
-            for(auto p : platforms)
+            for (auto p : platforms)
             {
                 std::string res = p.getInfo<CL_PLATFORM_NAME>();
-                if(res.find("clvk") != std::string::npos)
+                if (res.find("clvk") != std::string::npos)
                 {
                     selected_platform = p;
                     break;
@@ -114,7 +110,7 @@
             ARM_COMPUTE_ERROR("Unsupported backend type");
     }
 
-    if(!selected_platform())
+    if (!selected_platform())
     {
         ARM_COMPUTE_ERROR("No valid platform found");
     }
@@ -122,8 +118,7 @@
     return selected_platform;
 }
 
-std::tuple<cl::Context, cl::Device, cl_int>
-create_opencl_context_and_device(CLBackendType cl_backend_type)
+std::tuple<cl::Context, cl::Device, cl_int> create_opencl_context_and_device(CLBackendType cl_backend_type)
 {
     ARM_COMPUTE_ERROR_ON(!opencl_is_available());
     cl::Platform            p = select_preferable_platform(cl_backend_type);
@@ -131,9 +126,9 @@
     std::vector<cl::Device> platform_devices;
     p.getDevices(CL_DEVICE_TYPE_DEFAULT, &platform_devices);
     ARM_COMPUTE_ERROR_ON_MSG(platform_devices.size() == 0, "Couldn't find any OpenCL device");
-    device     = platform_devices[0];
-    cl_int err = CL_SUCCESS;
-    std::array<cl_context_properties, 7> properties = { 0, 0, 0, 0, 0, 0, 0 };
+    device                                          = platform_devices[0];
+    cl_int                               err        = CL_SUCCESS;
+    std::array<cl_context_properties, 7> properties = {0, 0, 0, 0, 0, 0, 0};
     initialise_context_properties(p, device, properties);
     cl::Context cl_context = cl::Context(device, properties.data(), nullptr, nullptr, &err);
     ARM_COMPUTE_ERROR_ON_MSG(err != CL_SUCCESS, "Failed to create OpenCL context");
@@ -143,7 +138,7 @@
 void schedule_kernel_on_ctx(CLRuntimeContext *ctx, ICLKernel *kernel, bool flush)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(kernel);
-    if(ctx)
+    if (ctx)
     {
         ARM_COMPUTE_ERROR_ON(ctx->gpu_scheduler() == nullptr);
         ctx->gpu_scheduler()->enqueue(*kernel, flush);
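
Note: the properties arrays above form the null-terminated key/value list the OpenCL API expects, i.e. (property, value) pairs ending with a single 0. A minimal sketch of how such a list reaches clCreateContext, with error handling elided and an available platform and device assumed:

    #include <CL/cl.h>

    cl_context create_context_example(cl_platform_id platform, cl_device_id device)
    {
        cl_context_properties props[] = {CL_CONTEXT_PLATFORM, reinterpret_cast<cl_context_properties>(platform),
                                         0}; // terminating zero ends the list
        cl_int err = CL_SUCCESS;
        return clCreateContext(props, 1, &device, nullptr, nullptr, &err);
    }
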
diff --git a/src/runtime/CL/CLMemory.cpp b/src/runtime/CL/CLMemory.cpp
index a1743c5..c6ee6fd 100644
--- a/src/runtime/CL/CLMemory.cpp
+++ b/src/runtime/CL/CLMemory.cpp
@@ -24,24 +24,22 @@
 #include "arm_compute/runtime/CL/CLMemory.h"
 
 #include "arm_compute/core/Error.h"
+
 #include "support/Cast.h"
 
 namespace arm_compute
 {
-CLMemory::CLMemory()
-    : _region(nullptr), _region_owned(nullptr)
+CLMemory::CLMemory() : _region(nullptr), _region_owned(nullptr)
 {
 }
 
-CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory)
-    : _region(nullptr), _region_owned(memory)
+CLMemory::CLMemory(const std::shared_ptr<ICLMemoryRegion> &memory) : _region(nullptr), _region_owned(memory)
 {
     _region_owned = memory;
     _region       = _region_owned.get();
 }
 
-CLMemory::CLMemory(ICLMemoryRegion *memory)
-    : _region(memory), _region_owned(nullptr)
+CLMemory::CLMemory(ICLMemoryRegion *memory) : _region(memory), _region_owned(nullptr)
 {
     _region = memory;
 }
@@ -78,4 +76,4 @@
     _region_owned = utils::cast::polymorphic_downcast_unique_ptr<ICLMemoryRegion>(std::move(region));
     _region       = _region_owned.get();
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/CLMemoryRegion.cpp b/src/runtime/CL/CLMemoryRegion.cpp
index 00f91a0..835958b 100644
--- a/src/runtime/CL/CLMemoryRegion.cpp
+++ b/src/runtime/CL/CLMemoryRegion.cpp
@@ -29,10 +29,7 @@
 namespace arm_compute
 {
 ICLMemoryRegion::ICLMemoryRegion(size_t size)
-    : IMemoryRegion(size),
-      _ctx(CLScheduler::get().context()),
-      _mapping(nullptr),
-      _mem()
+    : IMemoryRegion(size), _ctx(CLScheduler::get().context()), _mapping(nullptr), _mem()
 {
 }
 
@@ -57,17 +54,15 @@
     return nullptr;
 }
 
-CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size)
-    : ICLMemoryRegion(size)
+CLBufferMemoryRegion::CLBufferMemoryRegion(cl_mem_flags flags, size_t size) : ICLMemoryRegion(size)
 {
-    if(_size != 0)
+    if (_size != 0)
     {
         _mem = cl::Buffer(CLScheduler::get().context(), flags, _size);
     }
 }
 
-CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer)
-    : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
+CLBufferMemoryRegion::CLBufferMemoryRegion(const cl::Buffer &buffer) : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())
 {
     _mem = buffer;
 }
@@ -102,10 +97,10 @@
 ICLSVMMemoryRegion::ICLSVMMemoryRegion(cl_mem_flags flags, size_t size, size_t alignment)
     : ICLMemoryRegion(size), _ptr(nullptr)
 {
-    if(size != 0)
+    if (size != 0)
     {
         _ptr = clSVMAlloc(CLScheduler::get().context().get(), flags, size, alignment);
-        if(_ptr != nullptr)
+        if (_ptr != nullptr)
         {
             _mem = cl::Buffer(CLScheduler::get().context(), CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, _size, _ptr);
         }
@@ -114,7 +109,7 @@
 
 ICLSVMMemoryRegion::~ICLSVMMemoryRegion()
 {
-    if(_ptr != nullptr)
+    if (_ptr != nullptr)
     {
         try
         {
@@ -125,7 +120,7 @@
             _mem = cl::Buffer();
             clSVMFree(_ctx.get(), _ptr);
         }
-        catch(...)
+        catch (...)
         {
         }
     }
@@ -144,7 +139,8 @@
 void *CLCoarseSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
 {
     ARM_COMPUTE_ERROR_ON(_ptr == nullptr);
-    clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr, nullptr);
+    clEnqueueSVMMap(q.get(), blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE, _ptr, _size, 0, nullptr,
+                    nullptr);
     _mapping = _ptr;
     return _mapping;
 }
@@ -163,7 +159,7 @@
 
 void *CLFineSVMMemoryRegion::map(cl::CommandQueue &q, bool blocking)
 {
-    if(blocking)
+    if (blocking)
     {
         clFinish(q.get());
     }
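
Note on the two SVM map() paths reformatted above: a coarse-grain SVM buffer must be explicitly mapped before host access, whereas fine-grain SVM memory is always host-visible, so a "blocking map" only has to drain the queue. A minimal sketch against the raw OpenCL 2.0 C API; the helper names are illustrative, not part of the library:

#include <CL/cl.h>

// Coarse-grain SVM: host access requires an explicit (optionally blocking) map.
void *map_coarse_svm(cl_command_queue q, void *svm_ptr, size_t size, bool blocking)
{
    clEnqueueSVMMap(q, blocking ? CL_TRUE : CL_FALSE, CL_MAP_READ | CL_MAP_WRITE,
                    svm_ptr, size, 0, nullptr, nullptr);
    return svm_ptr;
}

// Fine-grain SVM: the pointer is always valid on the host; a blocking
// "map" only needs to wait for in-flight work to finish.
void *map_fine_svm(cl_command_queue q, void *svm_ptr, bool blocking)
{
    if (blocking)
    {
        clFinish(q);
    }
    return svm_ptr;
}
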
diff --git a/src/runtime/CL/CLOperator.cpp b/src/runtime/CL/CLOperator.cpp
index 075a544..89d4520 100644
--- a/src/runtime/CL/CLOperator.cpp
+++ b/src/runtime/CL/CLOperator.cpp
@@ -30,14 +30,13 @@
 {
 namespace experimental
 {
-ICLOperator::ICLOperator(IRuntimeContext *ctx)
-    : _kernel(), _ctx(ctx), _workspace()
+ICLOperator::ICLOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace()
 {
 }
 
 void ICLOperator::run(ITensorPack &tensors)
 {
-    if(tensors.empty())
+    if (tensors.empty())
     {
         ARM_COMPUTE_ERROR("No inputs provided");
     }
diff --git a/src/runtime/CL/CLRuntimeContext.cpp b/src/runtime/CL/CLRuntimeContext.cpp
index 5083b4b..b426b8c 100644
--- a/src/runtime/CL/CLRuntimeContext.cpp
+++ b/src/runtime/CL/CLRuntimeContext.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"
+
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
@@ -29,7 +30,10 @@
 namespace arm_compute
 {
 CLRuntimeContext::CLRuntimeContext()
-    : _gpu_owned_scheduler(std::make_unique<CLScheduler>()), _gpu_scheduler(_gpu_owned_scheduler.get()), _symbols(), _backend_type()
+    : _gpu_owned_scheduler(std::make_unique<CLScheduler>()),
+      _gpu_scheduler(_gpu_owned_scheduler.get()),
+      _symbols(),
+      _backend_type()
 {
     _symbols.load_default();
     auto ctx_dev_err = create_opencl_context_and_device(_backend_type);
diff --git a/src/runtime/CL/CLScheduler.cpp b/src/runtime/CL/CLScheduler.cpp
index b7a4dff..f0a42f5 100644
--- a/src/runtime/CL/CLScheduler.cpp
+++ b/src/runtime/CL/CLScheduler.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/runtime/CL/CLTuner.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
@@ -81,7 +82,7 @@
 
 void CLScheduler::tune_kernel_static(ICLKernel &kernel)
 {
-    if(_cl_tuner != nullptr)
+    if (_cl_tuner != nullptr)
     {
         _cl_tuner->tune_kernel_static(kernel);
     }
@@ -95,8 +96,16 @@
 std::once_flag CLScheduler::_initialize_symbols;
 
 CLScheduler::CLScheduler()
-    : _context(), _queue(), _target(GPUTarget::MIDGARD), _is_initialised(false), _cl_tuner(nullptr), _gemm_heuristics(nullptr), _backend_type(CLBackendType::Native), _job_chaining_enabled(true),
-      _job_chaining_size(1), _job_chaining_count(0)
+    : _context(),
+      _queue(),
+      _target(GPUTarget::MIDGARD),
+      _is_initialised(false),
+      _cl_tuner(nullptr),
+      _gemm_heuristics(nullptr),
+      _backend_type(CLBackendType::Native),
+      _job_chaining_enabled(true),
+      _job_chaining_size(1),
+      _job_chaining_count(0)
 {
 }
 
@@ -107,9 +116,12 @@
     return scheduler;
 }
 
-void CLScheduler::default_init_with_context(cl::Device &device, cl::Context &ctx, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h)
+void CLScheduler::default_init_with_context(cl::Device             &device,
+                                            cl::Context            &ctx,
+                                            ICLTuner               *cl_tuner,
+                                            CLGEMMHeuristicsHandle *gemm_h)
 {
-    if(!_is_initialised)
+    if (!_is_initialised)
     {
         const std::string cl_kernels_folder("./cl_kernels/");
         cl::CommandQueue  queue = cl::CommandQueue(ctx, device);
@@ -121,7 +133,7 @@
 
 void CLScheduler::default_init(ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
 {
-    if(!_is_initialised)
+    if (!_is_initialised)
     {
         cl::Context ctx;
         cl::Device  dev;
@@ -151,7 +163,12 @@
     CLKernelLibrary::get().set_context(_context);
 }
 
-void CLScheduler::init(cl::Context context, cl::CommandQueue queue, const cl::Device &device, ICLTuner *cl_tuner, CLGEMMHeuristicsHandle *gemm_h, CLBackendType cl_backend_type)
+void CLScheduler::init(cl::Context             context,
+                       cl::CommandQueue        queue,
+                       const cl::Device       &device,
+                       ICLTuner               *cl_tuner,
+                       CLGEMMHeuristicsHandle *gemm_h,
+                       CLBackendType           cl_backend_type)
 {
     set_context(std::move(context));
     _queue           = std::move(queue);
@@ -164,21 +181,21 @@
 
 void CLScheduler::enqueue_common(ICLKernel &kernel, ITensorPack &tensors, bool flush)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(!_is_initialised,
-                             "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
+    ARM_COMPUTE_ERROR_ON_MSG(
+        !_is_initialised, "The CLScheduler is not initialised yet! Please call the CLScheduler::get().default_init(), \
                              or CLScheduler::get()::init() and CLKernelLibrary::get()::init() function before running functions!");
 
     const bool inject_memory = !tensors.empty();
 
     // Tune the kernel if the CLTuner has been provided
-    if(_cl_tuner != nullptr)
+    if (_cl_tuner != nullptr)
     {
         inject_memory ? _cl_tuner->tune_kernel_dynamic(kernel, tensors) : _cl_tuner->tune_kernel_dynamic(kernel);
     }
 
     // Run kernel
     inject_memory ? kernel.run_op(tensors, kernel.window(), _queue) : kernel.run(kernel.window(), _queue);
-    if(_job_chaining_enabled)
+    if (_job_chaining_enabled)
     {
         ++_job_chaining_count;
     }
@@ -188,9 +205,9 @@
 
 void CLScheduler::flush_queue(bool flush)
 {
-    if(_job_chaining_enabled)
+    if (_job_chaining_enabled)
     {
-        if(_job_chaining_count >= _job_chaining_size)
+        if (_job_chaining_count >= _job_chaining_size)
         {
             _job_chaining_count = 0;
             /*
@@ -199,14 +216,14 @@
                 the CPU activity for job-scheduling.
                 For eg. job-chain size goes from 1, 2, 4, 8 and 16
             */
-            if(_job_chaining_size < 16)
+            if (_job_chaining_size < 16)
             {
                 _job_chaining_size <<= 1;
             }
             _queue.flush();
         }
     }
-    else if(flush)
+    else if (flush)
     {
         _queue.flush();
     }
diff --git a/src/runtime/CL/CLSubTensor.cpp b/src/runtime/CL/CLSubTensor.cpp
index 14936ae..ace820b 100644
--- a/src/runtime/CL/CLSubTensor.cpp
+++ b/src/runtime/CL/CLSubTensor.cpp
@@ -29,12 +29,14 @@
 
 using namespace arm_compute;
 
-CLSubTensor::CLSubTensor()
-    : _parent(nullptr), _info()
+CLSubTensor::CLSubTensor() : _parent(nullptr), _info()
 {
 }
 
-CLSubTensor::CLSubTensor(ICLTensor *parent, const TensorShape &tensor_shape, const Coordinates &coords, bool extend_parent)
+CLSubTensor::CLSubTensor(ICLTensor         *parent,
+                         const TensorShape &tensor_shape,
+                         const Coordinates &coords,
+                         bool               extend_parent)
     : _parent(nullptr), _info()
 {
     ARM_COMPUTE_ERROR_ON(parent == nullptr);
@@ -81,7 +83,7 @@
 uint8_t *CLSubTensor::do_map(cl::CommandQueue &q, bool blocking)
 {
     ARM_COMPUTE_ERROR_ON(cl_buffer().get() == nullptr);
-    if(_parent->buffer() == nullptr)
+    if (_parent->buffer() == nullptr)
     {
         _parent->map(q, blocking);
     }
diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp
index f85b8ae..e645721 100644
--- a/src/runtime/CL/CLTensorAllocator.cpp
+++ b/src/runtime/CL/CLTensorAllocator.cpp
@@ -46,17 +46,16 @@
 std::unique_ptr<ICLMemoryRegion> allocate_region(size_t size, cl_uint alignment)
 {
     // Try fine-grain SVM
-    std::unique_ptr<ICLMemoryRegion> region = std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER,
-                                                                                      size,
-                                                                                      alignment);
+    std::unique_ptr<ICLMemoryRegion> region =
+        std::make_unique<CLFineSVMMemoryRegion>(CL_MEM_READ_WRITE | CL_MEM_SVM_FINE_GRAIN_BUFFER, size, alignment);
 
     // Try coarse-grain SVM in case of failure
-    if(region != nullptr && region->ptr() == nullptr)
+    if (region != nullptr && region->ptr() == nullptr)
     {
         region = std::make_unique<CLCoarseSVMMemoryRegion>(CL_MEM_READ_WRITE, size, alignment);
     }
     // Try legacy buffer memory in case of failure
-    if(region != nullptr && region->ptr() == nullptr)
+    if (region != nullptr && region->ptr() == nullptr)
     {
         region = std::make_unique<CLBufferMemoryRegion>(CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, size);
     }
@@ -80,7 +79,10 @@
  * @param[in]      qinfo    Quantization info
  * @param[in]      pad_size Pad size to use in case array needs to be padded for computation purposes
  */
-void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const QuantizationInfo &qinfo, size_t pad_size)
+void populate_quantization_info(CLFloatArray           &scale,
+                                CLInt32Array           &offset,
+                                const QuantizationInfo &qinfo,
+                                size_t                  pad_size)
 {
     clear_quantization_arrays(scale, offset);
 
@@ -90,16 +92,18 @@
     const size_t              element_size = sizeof(std::remove_reference<decltype(qscale)>::type::value_type);
     scale                                  = CLFloatArray(num_elements + pad_size);
     scale.resize(num_elements);
-    CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale().data());
+    CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size,
+                                                  qinfo.scale().data());
 
-    if(!qinfo.offset().empty())
+    if (!qinfo.offset().empty())
     {
         // Create offset array
-        const std::vector<int32_t> &qoffset             = qinfo.offset();
-        const size_t                offset_element_size = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type);
-        offset                                          = CLInt32Array(num_elements + pad_size);
+        const std::vector<int32_t> &qoffset = qinfo.offset();
+        const size_t offset_element_size    = sizeof(std::remove_reference<decltype(qoffset)>::type::value_type);
+        offset                              = CLInt32Array(num_elements + pad_size);
         offset.resize(num_elements);
-        CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0, num_elements * offset_element_size, qinfo.offset().data());
+        CLScheduler::get().queue().enqueueWriteBuffer(offset.cl_buffer(), CL_TRUE, 0,
+                                                      num_elements * offset_element_size, qinfo.offset().data());
     }
 }
 } // namespace
@@ -111,7 +115,7 @@
 
 CLQuantization CLTensorAllocator::quantization() const
 {
-    return { &_scale, &_offset };
+    return {&_scale, &_offset};
 }
 
 uint8_t *CLTensorAllocator::data()
@@ -127,10 +131,10 @@
 void CLTensorAllocator::allocate()
 {
     // Allocate tensor backing memory
-    if(_associated_memory_group == nullptr)
+    if (_associated_memory_group == nullptr)
     {
         // Perform memory allocation
-        if(static_global_cl_allocator != nullptr)
+        if (static_global_cl_allocator != nullptr)
         {
             _memory.set_owned_region(static_global_cl_allocator->make_region(info().total_size(), 0));
         }
@@ -146,7 +150,7 @@
     }
 
     // Allocate and fill the quantization parameter arrays
-    if(is_data_type_quantized_per_channel(info().data_type()))
+    if (is_data_type_quantized_per_channel(info().data_type()))
     {
         const size_t pad_size = 0;
         populate_quantization_info(_scale, _offset, info().quantization_info(), pad_size);
@@ -193,7 +197,7 @@
 
 uint8_t *CLTensorAllocator::lock()
 {
-    if(_ctx)
+    if (_ctx)
     {
         return map(_ctx->gpu_scheduler()->queue(), true);
     }
@@ -206,7 +210,7 @@
 void CLTensorAllocator::unlock()
 {
     ARM_COMPUTE_ERROR_ON(_memory.region() == nullptr);
-    if(_ctx)
+    if (_ctx)
     {
         unmap(_ctx->gpu_scheduler()->queue(), reinterpret_cast<uint8_t *>(_memory.region()->buffer()));
     }
diff --git a/src/runtime/CL/CLTuner.cpp b/src/runtime/CL/CLTuner.cpp
index 445638f..0d62fe3 100644
--- a/src/runtime/CL/CLTuner.cpp
+++ b/src/runtime/CL/CLTuner.cpp
@@ -22,10 +22,11 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/CLTuner.h"
-#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+#include "arm_compute/runtime/CL/tuners/CLTuningParametersList.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/CL/ICLKernel.h"
 #include "support/StringSupport.h"
@@ -37,19 +38,23 @@
 namespace arm_compute
 {
 CLTuner::CLTuner(bool tune_new_kernels, CLTuningInfo tuning_info)
-    : real_clEnqueueNDRangeKernel(nullptr), _tuning_params_table(), _lws_table(), _kernel_event(), _tune_new_kernels(tune_new_kernels), _tuning_info(tuning_info)
+    : real_clEnqueueNDRangeKernel(nullptr),
+      _tuning_params_table(),
+      _lws_table(),
+      _kernel_event(),
+      _tune_new_kernels(tune_new_kernels),
+      _tuning_info(tuning_info)
 {
 }
 
 struct CLTuner::IKernelData
 {
-    virtual ~IKernelData() = default;
+    virtual ~IKernelData()                                          = default;
     virtual void do_run(ICLKernel &kernel, cl::CommandQueue &queue) = 0;
 };
 struct DefaultKernelData : public CLTuner::IKernelData
 {
-    DefaultKernelData(ITensorPack &tensors)
-        : _tensors{ tensors }
+    DefaultKernelData(ITensorPack &tensors) : _tensors{tensors}
     {
     }
     ~DefaultKernelData() override = default;
@@ -100,16 +105,17 @@
 void CLTuner::do_tune_kernel_dynamic(ICLKernel &kernel, IKernelData *data)
 {
     // Get the configuration ID from the kernel and append GPU target name and number of available compute units
-    const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" + support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units());
+    const std::string config_id = kernel.config_id() + "_" + string_from_target(kernel.get_target()) + "_MP" +
+                                  support::cpp11::to_string(CLKernelLibrary::get().get_num_compute_units());
 
     // Check if we need to find the Optimal LWS. If the kernel's config_id is equal to default_config_id, the kernel does not require to be tuned
-    if(kernel.config_id() != arm_compute::default_config_id)
+    if (kernel.config_id() != arm_compute::default_config_id)
     {
         auto p = _tuning_params_table.find(config_id);
 
-        if(p == _tuning_params_table.end())
+        if (p == _tuning_params_table.end())
         {
-            if(_tune_new_kernels)
+            if (_tune_new_kernels)
             {
                 // Find the optimal LWS for the kernel
                 CLTuningParams opt_tuning_params = find_optimal_tuning_params(kernel, data);
@@ -119,7 +125,7 @@
 
                 // Set Local-Workgroup-Size
                 kernel.set_lws_hint(opt_tuning_params.get_lws());
-                if(_tuning_info.tune_wbsm)
+                if (_tuning_info.tune_wbsm)
                 {
                     kernel.set_wbsm_hint(opt_tuning_params.get_wbsm());
                 }
@@ -129,7 +135,7 @@
         {
             // Set Local-Workgroup-Size
             kernel.set_lws_hint(p->second.get_lws());
-            if(_tuning_info.tune_wbsm)
+            if (_tuning_info.tune_wbsm)
             {
                 kernel.set_wbsm_hint(p->second.get_wbsm());
             }
@@ -138,7 +144,7 @@
 }
 void CLTuner::tune_kernel_dynamic(ICLKernel &kernel, ITensorPack &tensors)
 {
-    DefaultKernelData data{ tensors };
+    DefaultKernelData data{tensors};
 
     do_tune_kernel_dynamic(kernel, &data);
 }
@@ -154,7 +160,7 @@
     cl::CommandQueue queue_profiler;
 
     // Extract real OpenCL function to intercept
-    if(real_clEnqueueNDRangeKernel == nullptr)
+    if (real_clEnqueueNDRangeKernel == nullptr)
     {
         real_clEnqueueNDRangeKernel = CLSymbols::get().clEnqueueNDRangeKernel_ptr;
     }
@@ -165,7 +171,7 @@
     // Check if we can use the OpenCL timer with the default queue
     cl_command_queue_properties props = default_queue.getInfo<CL_QUEUE_PROPERTIES>();
 
-    if((props & CL_QUEUE_PROFILING_ENABLE) == 0)
+    if ((props & CL_QUEUE_PROFILING_ENABLE) == 0)
     {
         // Set the queue for profiling
         queue_profiler = cl::CommandQueue(CLScheduler::get().context(), props | CL_QUEUE_PROFILING_ENABLE);
@@ -176,21 +182,23 @@
     }
 
     // Start intercepting enqueues:
-    auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo, const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
-                              const cl_event * event_wait_list, cl_event * event)
+    auto interceptor = [this](cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *gwo,
+                              const size_t *gws, const size_t *lws, cl_uint num_events_in_wait_list,
+                              const cl_event *event_wait_list, cl_event *event)
     {
-        if(this->kernel_event_is_set())
+        if (this->kernel_event_is_set())
         {
             // If the event is already set it means the kernel enqueue is sliced: given that we only time the first slice we can save time by skipping the other enqueues.
             return CL_SUCCESS;
         }
         cl_event tmp;
-        cl_int   retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws, num_events_in_wait_list, event_wait_list, &tmp);
+        cl_int   retval = this->real_clEnqueueNDRangeKernel(command_queue, kernel, work_dim, gwo, gws, lws,
+                                                            num_events_in_wait_list, event_wait_list, &tmp);
 
         // Set OpenCL event
         this->set_cl_kernel_event(tmp);
 
-        if(event != nullptr)
+        if (event != nullptr)
         {
             //return cl_event from the intercepted call
             clRetainEvent(tmp);
@@ -209,9 +217,10 @@
     /// This is only a temporary workaround. An ideal solution involves decoupling the execution window from run() / run_op()
     /// Please see COMPMID-5934
     cl::NDRange gws = kernel.get_cached_gws();
-    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO,
-                                        "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search",
-                                        kernel.config_id().c_str(), to_string(gws).c_str());
+    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(
+        arm_compute::logging::LogLevel::INFO,
+        "[CLTuner] Kernel with config_id '%s' uses %s as the upper-bound for lws search", kernel.config_id().c_str(),
+        to_string(gws).c_str());
 
     queue_profiler.finish();
 
@@ -224,7 +233,7 @@
 
     // Construct the list of tuning parameters values to be tested based on the tuner mode.
     auto tuning_list = cl_tuner::get_tuning_parameters_list(_tuning_info, gws);
-    for(size_t i = 0; i < tuning_list->size(); ++i)
+    for (size_t i = 0; i < tuning_list->size(); ++i)
     {
         CLTuningParams tuning_test = (*tuning_list)[i];
         // Setting the lws
@@ -234,19 +243,18 @@
         auto        z           = lws_test[2];
         const bool  invalid_lws = (x * y * z > kernel.get_max_workgroup_size()) || (x == 1 && y == 1 && z == 1);
 
-        if(invalid_lws)
+        if (invalid_lws)
         {
             continue;
         }
 
         kernel.set_lws_hint(lws_test);
-        if(_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported())
+        if (_tuning_info.tune_wbsm && CLKernelLibrary::get().is_wbsm_supported())
         {
             cl_int wbsm_test = tuning_test.get_wbsm();
             kernel.set_wbsm_hint(wbsm_test);
         }
-        ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO,
-                                            "[CLTuner] Trying LWS: %s, WBSM: %d",
+        ARM_COMPUTE_LOG_MSG_WITH_FORMAT_ACL(arm_compute::logging::LogLevel::INFO, "[CLTuner] Trying LWS: %s, WBSM: %d",
                                             to_string(kernel.lws_hint()).c_str(), kernel.wbsm_hint());
 
         // Run the kernel
@@ -260,11 +268,11 @@
         _kernel_event        = nullptr;
 
         // Check the execution time
-        if(diff < min_exec_time)
+        if (diff < min_exec_time)
         {
             min_exec_time = diff;
             opt_tuning_params.set_lws(tuning_test.get_lws());
-            if(_tuning_info.tune_wbsm)
+            if (_tuning_info.tune_wbsm)
             {
                 opt_tuning_params.set_wbsm(tuning_test.get_wbsm());
             }
@@ -292,30 +300,30 @@
     std::ifstream fs;
     fs.exceptions(std::ifstream::badbit);
     fs.open(filename, std::ios::in);
-    if(!fs.is_open())
+    if (!fs.is_open())
     {
         ARM_COMPUTE_ERROR_VAR("Failed to open '%s' (%s [%d])", filename.c_str(), strerror(errno), errno);
     }
     std::string line;
     bool        header_line = true;
-    while(!std::getline(fs, line).fail())
+    while (!std::getline(fs, line).fail())
     {
-        if(header_line)
+        if (header_line)
         {
             header_line            = false;
             size_t pos_lws         = line.find("lws");
             size_t pos_wbsm        = line.find("wbsm");
             _tuning_info.tune_wbsm = false;
-            if(pos_lws != std::string::npos || pos_wbsm != std::string::npos)
+            if (pos_lws != std::string::npos || pos_wbsm != std::string::npos)
             {
                 // The file has in the first line the parameters it has been tuned on
-                if(pos_wbsm != std::string::npos)
+                if (pos_wbsm != std::string::npos)
                 {
                     _tuning_info.tune_wbsm = true;
                 }
                 // Once the line with the tuning parameter is read we can
                 // read the next one to start collecting the values
-                if(std::getline(fs, line).fail())
+                if (std::getline(fs, line).fail())
                 {
                     break;
                 }
@@ -324,13 +332,13 @@
 
         CLTuningParams tuning_params;
         size_t         pos = line.find(";");
-        if(pos == std::string::npos)
+        if (pos == std::string::npos)
         {
             ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
         }
         std::string kernel_id = line.substr(0, pos);
         line.erase(0, pos + 1);
-        if(!tuning_params.from_string(_tuning_info, line))
+        if (!tuning_params.from_string(_tuning_info, line))
         {
             ARM_COMPUTE_ERROR_VAR("Malformed row '%s' in %s", line.c_str(), filename.c_str());
         }
@@ -341,7 +349,7 @@
 
 bool CLTuner::save_to_file(const std::string &filename) const
 {
-    if(!_tune_new_kernels || _tuning_params_table.empty() || filename.empty())
+    if (!_tune_new_kernels || _tuning_params_table.empty() || filename.empty())
     {
         return false;
     }
@@ -350,16 +358,16 @@
     fs.open(filename, std::ios::out);
     std::string header_string = "";
     header_string += "lws";
-    if(_tuning_info.tune_wbsm)
+    if (_tuning_info.tune_wbsm)
     {
-        if(!header_string.empty())
+        if (!header_string.empty())
         {
             header_string += " ";
         }
         header_string += "wbsm";
     }
     fs << header_string << std::endl;
-    for(auto const &kernel_data : _tuning_params_table)
+    for (auto const &kernel_data : _tuning_params_table)
     {
         CLTuningParams tun_pams(kernel_data.second);
         fs << kernel_data.first << tun_pams.to_string(_tuning_info) << std::endl;
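
The tuner above times each candidate LWS/WBSM configuration by intercepting clEnqueueNDRangeKernel on a profiling-enabled queue and keeping the event of the first slice. The measurement itself boils down to standard OpenCL event profiling; a hedged sketch with the raw C API (the function name is illustrative):

#include <CL/cl.h>

// Enqueue one kernel on a queue created with CL_QUEUE_PROFILING_ENABLE and
// return its device execution time in nanoseconds (0 on enqueue failure).
cl_ulong time_kernel_ns(cl_command_queue profiling_queue, cl_kernel kernel,
                        cl_uint work_dim, const size_t *gws, const size_t *lws)
{
    cl_event ev;
    cl_int   err = clEnqueueNDRangeKernel(profiling_queue, kernel, work_dim,
                                          nullptr, gws, lws, 0, nullptr, &ev);
    if (err != CL_SUCCESS)
    {
        return 0;
    }
    clWaitForEvents(1, &ev);

    cl_ulong start = 0, end = 0;
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_START, sizeof(start), &start, nullptr);
    clGetEventProfilingInfo(ev, CL_PROFILING_COMMAND_END, sizeof(end), &end, nullptr);
    clReleaseEvent(ev);
    return end - start;
}
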
diff --git a/src/runtime/CL/ICLSimpleFunction.cpp b/src/runtime/CL/ICLSimpleFunction.cpp
index 4530537..bc782c3 100644
--- a/src/runtime/CL/ICLSimpleFunction.cpp
+++ b/src/runtime/CL/ICLSimpleFunction.cpp
@@ -26,15 +26,14 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 using namespace arm_compute;
 
 ICLSimpleFunction::ICLSimpleFunction(CLRuntimeContext *ctx) // NOLINT
-    : _kernel(),
-      _border_handler(std::make_unique<CLFillBorderKernel>()),
-      _ctx(ctx)
+    : _kernel(), _border_handler(std::make_unique<CLFillBorderKernel>()), _ctx(ctx)
 {
 }
 
diff --git a/src/runtime/CL/Utils.cpp b/src/runtime/CL/Utils.cpp
index da3d485..294396c 100644
--- a/src/runtime/CL/Utils.cpp
+++ b/src/runtime/CL/Utils.cpp
@@ -35,20 +35,20 @@
 void restore_program_cache_from_file(const std::string &filename)
 {
     std::ifstream cache_file(filename, std::ios::binary);
-    if(cache_file.is_open())
+    if (cache_file.is_open())
     {
-        if(!CLScheduler::get().is_initialised())
+        if (!CLScheduler::get().is_initialised())
         {
             arm_compute::CLScheduler::get().default_init();
         }
 
-        while(!cache_file.eof())
+        while (!cache_file.eof())
         {
             size_t name_len   = 0;
             size_t binary_len = 0;
             cache_file.read(reinterpret_cast<char *>(&name_len), sizeof(size_t));
             cache_file.read(reinterpret_cast<char *>(&binary_len), sizeof(size_t));
-            if(name_len == 0 || binary_len == 0)
+            if (name_len == 0 || binary_len == 0)
             {
                 break;
             }
@@ -60,7 +60,7 @@
             tmp.resize(binary_len);
             cache_file.read(reinterpret_cast<char *>(binary.data()), binary_len);
             cl::Context             context = arm_compute::CLScheduler::get().context();
-            cl::Program::Binaries   binaries{ binary };
+            cl::Program::Binaries   binaries{binary};
             std::vector<cl::Device> devices = context.getInfo<CL_CONTEXT_DEVICES>();
             cl::Program             program(context, devices, binaries);
             program.build();
@@ -72,12 +72,12 @@
 
 void save_program_cache_to_file(const std::string &filename)
 {
-    if(CLScheduler::get().is_initialised())
+    if (CLScheduler::get().is_initialised())
     {
         std::ofstream cache_file(filename, std::ios::binary);
-        if(cache_file.is_open())
+        if (cache_file.is_open())
         {
-            for(const auto &it : CLKernelLibrary::get().get_built_programs())
+            for (const auto &it : CLKernelLibrary::get().get_built_programs())
             {
                 std::vector<std::vector<unsigned char>> binaries = it.second.getInfo<CL_PROGRAM_BINARIES>();
                 ARM_COMPUTE_ERROR_ON(binaries.size() != 1);
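
restore_program_cache_from_file() above reads length-prefixed records, two size_t fields followed by the program name and its binary, and save_program_cache_to_file() writes the matching layout. A sketch of the writer side; the payload order (name, then binary) is inferred from the restore loop and should be treated as an assumption:

#include <cstddef>
#include <fstream>
#include <string>
#include <vector>

// Append one [name_len][binary_len][name][binary] record to the cache file.
void write_cache_record(std::ofstream &out, const std::string &name,
                        const std::vector<unsigned char> &binary)
{
    const std::size_t name_len   = name.size();
    const std::size_t binary_len = binary.size();
    out.write(reinterpret_cast<const char *>(&name_len), sizeof(std::size_t));
    out.write(reinterpret_cast<const char *>(&binary_len), sizeof(std::size_t));
    out.write(name.data(), static_cast<std::streamsize>(name_len));
    out.write(reinterpret_cast<const char *>(binary.data()),
              static_cast<std::streamsize>(binary_len));
}
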
diff --git a/src/runtime/CL/functions/CLActivationLayer.cpp b/src/runtime/CL/functions/CLActivationLayer.cpp
index f324b1a..c035644e 100644
--- a/src/runtime/CL/functions/CLActivationLayer.cpp
+++ b/src/runtime/CL/functions/CLActivationLayer.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/CL/CLRuntimeContext.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClActivation.h"
 
@@ -35,18 +36,17 @@
 {
 struct CLActivationLayer::Impl
 {
-    const ICLTensor                      *src{ nullptr };
-    ICLTensor                            *dst{ nullptr };
-    CLRuntimeContext                     *ctx{ nullptr };
-    std::unique_ptr<opencl::ClActivation> op{ nullptr };
+    const ICLTensor                      *src{nullptr};
+    ICLTensor                            *dst{nullptr};
+    CLRuntimeContext                     *ctx{nullptr};
+    std::unique_ptr<opencl::ClActivation> op{nullptr};
 };
 
-CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx)
-    : _impl(std::make_unique<Impl>())
+CLActivationLayer::CLActivationLayer(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
 {
     _impl->ctx = ctx;
 }
-CLActivationLayer::CLActivationLayer(CLActivationLayer &&) = default;
+CLActivationLayer::CLActivationLayer(CLActivationLayer &&)            = default;
 CLActivationLayer &CLActivationLayer::operator=(CLActivationLayer &&) = default;
 CLActivationLayer::~CLActivationLayer()                               = default;
 
@@ -55,7 +55,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, act_info);
 }
 
-void CLActivationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
+void CLActivationLayer::configure(const CLCompileContext &compile_context,
+                                  ICLTensor              *input,
+                                  ICLTensor              *output,
+                                  ActivationLayerInfo     act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
@@ -66,7 +69,8 @@
     _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), act_info);
 }
 
-Status CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status
+CLActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     return opencl::ClActivation::validate(input, output, act_info);
 }
diff --git a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
index b30d739..f9bbd31 100644
--- a/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
+++ b/src/runtime/CL/functions/CLArgMinMaxLayer.cpp
@@ -27,31 +27,39 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/CL/kernels/CLArgMinMaxLayerKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/runtime/Utils.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 CLArgMinMaxLayer::CLArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _not_reshaped_output(), _arg_min_max_kernel(), _reshape(), _reduction_axis()
+    : _memory_group(std::move(memory_manager)),
+      _not_reshaped_output(),
+      _arg_min_max_kernel(),
+      _reshape(),
+      _reduction_axis()
 {
 }
 
 CLArgMinMaxLayer::~CLArgMinMaxLayer() = default;
 
-Status CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+Status
+CLArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::S32, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid reduction operation");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions), "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::S32, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+                                    "Invalid reduction operation");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= static_cast<int>(TensorShape::num_max_dimensions),
+                                    "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
 
     DataType   output_data_type = DataType::S32;
@@ -59,17 +67,18 @@
     const auto input_num_channles = input->num_channels();
     const auto input_qinfo        = input->quantization_info();
 
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
         output_data_type                       = output->data_type();
-        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false));
+        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+            arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, false));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
     }
 
     auto shape_before_reshape = input->tensor_shape();
     shape_before_reshape.set(axis, 1);
-    auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
-    {
+    auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels,
+                                    QuantizationInfo qinfo) {
         ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
     };
 
@@ -85,20 +94,36 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, axis, output, op);
 }
 
-void CLArgMinMaxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int axis, ICLTensor *output, const ReductionOperation &op)
+void CLArgMinMaxLayer::configure(const CLCompileContext   &compile_context,
+                                 const ICLTensor          *input,
+                                 int                       axis,
+                                 ICLTensor                *output,
+                                 const ReductionOperation &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, axis, output, op);
 
     _reduction_axis = axis;
 
-    const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
-    DataType          output_data_type = (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type();
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+    const TensorShape output_shape =
+        arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+    DataType output_data_type =
+        (output->info()->data_type() == DataType::UNKNOWN) ? DataType::S32 : output->info()->data_type();
+    auto_init_if_empty(*output->info(), input->info()
+                                            ->clone()
+                                            ->set_tensor_shape(output_shape)
+                                            .set_data_type(output_data_type)
+                                            .reset_padding()
+                                            .set_is_resizable(true));
 
-    TensorShape not_reshaped_output_shape{ input->info()->tensor_shape() };
+    TensorShape not_reshaped_output_shape{input->info()->tensor_shape()};
     not_reshaped_output_shape.set(axis, 1);
-    auto_init_if_empty(*_not_reshaped_output.info(), input->info()->clone()->set_tensor_shape(not_reshaped_output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+    auto_init_if_empty(*_not_reshaped_output.info(), input->info()
+                                                         ->clone()
+                                                         ->set_tensor_shape(not_reshaped_output_shape)
+                                                         .set_data_type(output_data_type)
+                                                         .reset_padding()
+                                                         .set_is_resizable(true));
 
     _arg_min_max_kernel = std::make_unique<CLArgMinMaxLayerKernel>();
     _arg_min_max_kernel->configure(compile_context, input, &_not_reshaped_output, axis, op);
diff --git a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
index e8affc0..0c371c4 100644
--- a/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchNormalizationLayer.cpp
@@ -30,9 +30,8 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBatchNormalizationLayerKernel.h"
 
 namespace arm_compute
 {
@@ -43,24 +42,40 @@
 
 CLBatchNormalizationLayer::~CLBatchNormalizationLayer() = default;
 
-void CLBatchNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta, const ICLTensor *gamma, float epsilon,
+void CLBatchNormalizationLayer::configure(ICLTensor          *input,
+                                          ICLTensor          *output,
+                                          const ICLTensor    *mean,
+                                          const ICLTensor    *var,
+                                          const ICLTensor    *beta,
+                                          const ICLTensor    *gamma,
+                                          float               epsilon,
                                           ActivationLayerInfo act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, var, beta, gamma, epsilon, act_info);
 }
 
-void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *var, const ICLTensor *beta,
-                                          const ICLTensor *gamma, float epsilon,
-                                          ActivationLayerInfo act_info)
+void CLBatchNormalizationLayer::configure(const CLCompileContext &compile_context,
+                                          ICLTensor              *input,
+                                          ICLTensor              *output,
+                                          const ICLTensor        *mean,
+                                          const ICLTensor        *var,
+                                          const ICLTensor        *beta,
+                                          const ICLTensor        *gamma,
+                                          float                   epsilon,
+                                          ActivationLayerInfo     act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info);
     _norm_kernel->configure(compile_context, input, output, mean, var, beta, gamma, epsilon, act_info);
 }
 
-Status CLBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                           const ITensorInfo *mean, const ITensorInfo *var,
-                                           const ITensorInfo *beta, const ITensorInfo *gamma,
-                                           float epsilon, ActivationLayerInfo act_info)
+Status CLBatchNormalizationLayer::validate(const ITensorInfo  *input,
+                                           const ITensorInfo  *output,
+                                           const ITensorInfo  *mean,
+                                           const ITensorInfo  *var,
+                                           const ITensorInfo  *beta,
+                                           const ITensorInfo  *gamma,
+                                           float               epsilon,
+                                           ActivationLayerInfo act_info)
 {
     return CLBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info);
 }
@@ -69,4 +84,4 @@
 {
     CLScheduler::get().enqueue(*_norm_kernel, true);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
index d7a4091..a3798da 100644
--- a/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLBatchToSpaceLayer.cpp
@@ -30,14 +30,12 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBatchToSpaceLayerKernel.h"
 
 namespace arm_compute
 {
-CLBatchToSpaceLayer::CLBatchToSpaceLayer()
-    : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>())
+CLBatchToSpaceLayer::CLBatchToSpaceLayer() : _batch_to_space_kernel(std::make_unique<CLBatchToSpaceLayerKernel>())
 {
 }
 
@@ -49,29 +47,43 @@
     _batch_to_space_kernel->configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, output);
 }
 
-void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, ICLTensor *output)
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    const ICLTensor        *block_shape,
+                                    ICLTensor              *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input, block_shape, output);
     _batch_to_space_kernel->configure(compile_context, input, block_shape, output);
 }
 
-void CLBatchToSpaceLayer::configure(const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
+void CLBatchToSpaceLayer::configure(
+    const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, output, crop_info);
 }
 
-void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, int32_t block_shape_x, int32_t block_shape_y, ICLTensor *output, const CropInfo &crop_info)
+void CLBatchToSpaceLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    int32_t                 block_shape_x,
+                                    int32_t                 block_shape_y,
+                                    ICLTensor              *output,
+                                    const CropInfo         &crop_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, output);
     _batch_to_space_kernel->configure(compile_context, input, block_shape_x, block_shape_y, output, crop_info);
 }
 
-Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+CLBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
 {
     return CLBatchToSpaceLayerKernel::validate(input, block_shape, output);
 }
 
-Status CLBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status CLBatchToSpaceLayer::validate(const ITensorInfo *input,
+                                     int32_t            block_shape_x,
+                                     int32_t            block_shape_y,
+                                     const ITensorInfo *output,
+                                     const CropInfo    &crop_info)
 {
     return CLBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info);
 }
diff --git a/src/runtime/CL/functions/CLBitwiseAnd.cpp b/src/runtime/CL/functions/CLBitwiseAnd.cpp
index a4712ed..7bfd0e3 100644
--- a/src/runtime/CL/functions/CLBitwiseAnd.cpp
+++ b/src/runtime/CL/functions/CLBitwiseAnd.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLBitwiseAnd.h"
 
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
 
 #include <utility>
 
@@ -36,11 +35,14 @@
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
 }
 
-void CLBitwiseAnd::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseAnd::configure(const CLCompileContext &compile_context,
+                             const ICLTensor        *input1,
+                             const ICLTensor        *input2,
+                             ICLTensor              *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
     auto k = std::make_unique<CLBitwiseKernel>();
     k->configure(compile_context, input1, input2, output, BitwiseOperation::AND);
     _kernel = std::move(k);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseNot.cpp b/src/runtime/CL/functions/CLBitwiseNot.cpp
index 5964b92..9763915 100644
--- a/src/runtime/CL/functions/CLBitwiseNot.cpp
+++ b/src/runtime/CL/functions/CLBitwiseNot.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLBitwiseNot.h"
 
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
 
 #include <utility>
 
@@ -43,4 +42,4 @@
     k->configure(compile_context, input, nullptr, output, BitwiseOperation::NOT);
     _kernel = std::move(k);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseOr.cpp b/src/runtime/CL/functions/CLBitwiseOr.cpp
index a07bf17..dd3171b 100644
--- a/src/runtime/CL/functions/CLBitwiseOr.cpp
+++ b/src/runtime/CL/functions/CLBitwiseOr.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLBitwiseOr.h"
 
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
 
 #include <utility>
 
@@ -36,11 +35,14 @@
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
 }
 
-void CLBitwiseOr::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseOr::configure(const CLCompileContext &compile_context,
+                            const ICLTensor        *input1,
+                            const ICLTensor        *input2,
+                            ICLTensor              *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
     auto k = std::make_unique<CLBitwiseKernel>();
     k->configure(compile_context, input1, input2, output, BitwiseOperation::OR);
     _kernel = std::move(k);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLBitwiseXor.cpp b/src/runtime/CL/functions/CLBitwiseXor.cpp
index f65e2e4..5bee4b3 100644
--- a/src/runtime/CL/functions/CLBitwiseXor.cpp
+++ b/src/runtime/CL/functions/CLBitwiseXor.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLBitwiseXor.h"
 
-#include "src/core/CL/kernels/CLBitwiseKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBitwiseKernel.h"
 
 #include <utility>
 
@@ -36,7 +35,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
 }
 
-void CLBitwiseXor::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output)
+void CLBitwiseXor::configure(const CLCompileContext &compile_context,
+                             const ICLTensor        *input1,
+                             const ICLTensor        *input2,
+                             ICLTensor              *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
     auto k = std::make_unique<CLBitwiseKernel>();
diff --git a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
index 48583bf..76e626f 100644
--- a/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
+++ b/src/runtime/CL/functions/CLBoundingBoxTransform.cpp
@@ -23,18 +23,24 @@
  */
 #include "arm_compute/runtime/CL/functions/CLBoundingBoxTransform.h"
 
-#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
 
 namespace arm_compute
 {
-void CLBoundingBoxTransform::configure(const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransform::configure(const ICLTensor                *boxes,
+                                       ICLTensor                      *pred_boxes,
+                                       const ICLTensor                *deltas,
+                                       const BoundingBoxTransformInfo &info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), boxes, pred_boxes, deltas, info);
 }
 
-void CLBoundingBoxTransform::configure(const CLCompileContext &compile_context, const ICLTensor *boxes, ICLTensor *pred_boxes, const ICLTensor *deltas, const BoundingBoxTransformInfo &info)
+void CLBoundingBoxTransform::configure(const CLCompileContext         &compile_context,
+                                       const ICLTensor                *boxes,
+                                       ICLTensor                      *pred_boxes,
+                                       const ICLTensor                *deltas,
+                                       const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info);
 
@@ -44,7 +50,10 @@
     _kernel = std::move(k);
 }
 
-Status CLBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status CLBoundingBoxTransform::validate(const ITensorInfo              *boxes,
+                                        const ITensorInfo              *pred_boxes,
+                                        const ITensorInfo              *deltas,
+                                        const BoundingBoxTransformInfo &info)
 {
     return CLBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info);
 }
diff --git a/src/runtime/CL/functions/CLCast.cpp b/src/runtime/CL/functions/CLCast.cpp
index 10f7cc2..42ec8f7 100644
--- a/src/runtime/CL/functions/CLCast.cpp
+++ b/src/runtime/CL/functions/CLCast.cpp
@@ -26,10 +26,10 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCast.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCast.h"
 
 #include <utility>
 
@@ -37,16 +37,15 @@
 {
 struct CLCast::Impl
 {
-    const ICLTensor                *src{ nullptr };
-    ICLTensor                      *dst{ nullptr };
-    std::unique_ptr<opencl::ClCast> op{ nullptr };
+    const ICLTensor                *src{nullptr};
+    ICLTensor                      *dst{nullptr};
+    std::unique_ptr<opencl::ClCast> op{nullptr};
 };
 
-CLCast::CLCast()
-    : _impl(std::make_unique<Impl>())
+CLCast::CLCast() : _impl(std::make_unique<Impl>())
 {
 }
-CLCast::CLCast(CLCast &&) = default;
+CLCast::CLCast(CLCast &&)            = default;
 CLCast &CLCast::operator=(CLCast &&) = default;
 CLCast::~CLCast()                    = default;
 
@@ -55,7 +54,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, policy);
 }
 
-void CLCast::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy)
+void CLCast::configure(const CLCompileContext &compile_context,
+                       const ICLTensor        *input,
+                       ICLTensor              *output,
+                       ConvertPolicy           policy)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, output, policy);
@@ -74,7 +76,7 @@
 
 void CLCast::run()
 {
-    ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+    ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
     _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
index 021f28f..1ee4789 100644
--- a/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
+++ b/src/runtime/CL/functions/CLChannelShuffleLayer.cpp
@@ -24,9 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLChannelShuffleLayer.h"
 
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLChannelShuffleLayerKernel.h"
 
 namespace arm_compute
 {
@@ -35,7 +35,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, num_groups);
 }
 
-void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, unsigned int num_groups)
+void CLChannelShuffleLayer::configure(const CLCompileContext &compile_context,
+                                      const ICLTensor        *input,
+                                      ICLTensor              *output,
+                                      unsigned int            num_groups)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, num_groups);
     auto k = std::make_unique<CLChannelShuffleLayerKernel>();
diff --git a/src/runtime/CL/functions/CLComparison.cpp b/src/runtime/CL/functions/CLComparison.cpp
index 192a266..2f54371 100644
--- a/src/runtime/CL/functions/CLComparison.cpp
+++ b/src/runtime/CL/functions/CLComparison.cpp
@@ -25,10 +25,10 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLComparisonKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLComparisonKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 namespace arm_compute
 {
@@ -37,25 +37,33 @@
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, operation);
 }
 
-void CLComparison::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ComparisonOperation operation)
+void CLComparison::configure(const CLCompileContext &compile_context,
+                             ICLTensor              *input1,
+                             ICLTensor              *input2,
+                             ICLTensor              *output,
+                             ComparisonOperation     operation)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output, operation);
     auto k = std::make_unique<CLComparisonKernel>();
     k->configure(compile_context, input1, input2, output, operation);
     _kernel = std::move(k);
 
-    if(output->info()->dimension(0) > 1)
+    if (output->info()->dimension(0) > 1)
     {
         ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
 
-        if(broadcasted_info->info()->dimension(0) == 1)
+        if (broadcasted_info->info()->dimension(0) == 1)
         {
-            _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+            _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(),
+                                       BorderMode::REPLICATE);
         }
     }
 }
 
-Status CLComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation operation)
+Status CLComparison::validate(const ITensorInfo  *input1,
+                              const ITensorInfo  *input2,
+                              const ITensorInfo  *output,
+                              ComparisonOperation operation)
 {
     return CLComparisonKernel::validate(input1, input2, output, operation);
 }
@@ -67,25 +75,30 @@
 }
 
 template <ComparisonOperation COP>
-void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLComparisonStatic<COP>::configure(const CLCompileContext &compile_context,
+                                        ICLTensor              *input1,
+                                        ICLTensor              *input2,
+                                        ICLTensor              *output)
 {
     auto k = std::make_unique<CLComparisonKernel>();
     k->configure(compile_context, input1, input2, output, COP);
     _kernel = std::move(k);
 
-    if(output->info()->dimension(0) > 1)
+    if (output->info()->dimension(0) > 1)
     {
         ICLTensor *broadcasted_info = (input1->info()->dimension(0) == 1) ? input1 : input2;
 
-        if(broadcasted_info->info()->dimension(0) == 1)
+        if (broadcasted_info->info()->dimension(0) == 1)
         {
-            _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(), BorderMode::REPLICATE);
+            _border_handler->configure(compile_context, broadcasted_info, _kernel->border_size(),
+                                       BorderMode::REPLICATE);
         }
     }
 }
 
 template <ComparisonOperation COP>
-Status CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status
+CLComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
 {
     return CLComparisonKernel::validate(input1, input2, output, COP);
 }
diff --git a/src/runtime/CL/functions/CLConcatenateLayer.cpp b/src/runtime/CL/functions/CLConcatenateLayer.cpp
index 0a8884f..9df1c34 100644
--- a/src/runtime/CL/functions/CLConcatenateLayer.cpp
+++ b/src/runtime/CL/functions/CLConcatenateLayer.cpp
@@ -24,24 +24,23 @@
 #include "arm_compute/runtime/CL/functions/CLConcatenateLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClConcatenate.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClConcatenate.h"
 
 namespace arm_compute
 {
 struct CLConcatenateLayer::Impl
 {
     std::vector<const ICLTensor *>         srcs{};
-    ICLTensor                             *dst{ nullptr };
-    unsigned int                           num_inputs{ 0 };
-    unsigned int                           axis{ 0 };
-    std::unique_ptr<opencl::ClConcatenate> op{ nullptr };
+    ICLTensor                             *dst{nullptr};
+    unsigned int                           num_inputs{0};
+    unsigned int                           axis{0};
+    std::unique_ptr<opencl::ClConcatenate> op{nullptr};
 };
 
-CLConcatenateLayer::CLConcatenateLayer()
-    : _impl(std::make_unique<Impl>())
+CLConcatenateLayer::CLConcatenateLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
@@ -56,7 +55,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), inputs_vector, output, axis);
 }
 
-void CLConcatenateLayer::configure(const CLCompileContext &compile_context, std::vector<const ICLTensor *> &inputs_vector, ICLTensor *output, size_t axis)
+void CLConcatenateLayer::configure(const CLCompileContext         &compile_context,
+                                   std::vector<const ICLTensor *> &inputs_vector,
+                                   ICLTensor                      *output,
+                                   size_t                          axis)
 {
     ARM_COMPUTE_ERROR_ON(output == nullptr);
     ARM_COMPUTE_LOG_PARAMS(inputs_vector, output, axis);
@@ -68,7 +70,7 @@
     _impl->op         = std::make_unique<opencl::ClConcatenate>();
 
     std::vector<ITensorInfo *> inputs_vector_info;
-    for(unsigned int i = 0; i < inputs_vector.size(); ++i)
+    for (unsigned int i = 0; i < inputs_vector.size(); ++i)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
@@ -76,7 +78,9 @@
     _impl->op->configure(compile_context, inputs_vector_info, _impl->dst->info(), axis);
 }
 
-Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+Status CLConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector,
+                                    const ITensorInfo                      *output,
+                                    size_t                                  axis)
 {
     return opencl::ClConcatenate::validate(inputs_vector, output, axis);
 }
@@ -84,7 +88,7 @@
 void CLConcatenateLayer::run()
 {
     ITensorPack pack;
-    for(unsigned i = 0; i < _impl->num_inputs; ++i)
+    for (unsigned i = 0; i < _impl->num_inputs; ++i)
     {
         pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i));
     }
diff --git a/src/runtime/CL/functions/CLConv3D.cpp b/src/runtime/CL/functions/CLConv3D.cpp
index 729b973..9d1b368 100644
--- a/src/runtime/CL/functions/CLConv3D.cpp
+++ b/src/runtime/CL/functions/CLConv3D.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/CL/functions/CLConv3D.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/gpu/cl/operators/ClDirectConv3d.h"
 
 namespace arm_compute
@@ -32,29 +33,38 @@
 
 struct CLConv3D::Impl
 {
-    const ICLTensor                        *src{ nullptr };
-    const ICLTensor                        *weights{ nullptr };
-    const ICLTensor                        *biases{ nullptr };
-    ICLTensor                              *dst{ nullptr };
-    std::unique_ptr<opencl::ClDirectConv3d> op{ nullptr };
+    const ICLTensor                        *src{nullptr};
+    const ICLTensor                        *weights{nullptr};
+    const ICLTensor                        *biases{nullptr};
+    ICLTensor                              *dst{nullptr};
+    std::unique_ptr<opencl::ClDirectConv3d> op{nullptr};
 };
 
-CLConv3D::CLConv3D()
-    : _impl(std::make_unique<Impl>())
+CLConv3D::CLConv3D() : _impl(std::make_unique<Impl>())
 {
 }
 
 CLConv3D::~CLConv3D() = default;
 
-void CLConv3D::configure(const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info)
+void CLConv3D::configure(const ICLTensor  *src,
+                         const ICLTensor  *weights,
+                         const ICLTensor  *biases,
+                         ICLTensor        *dst,
+                         const Conv3dInfo &conv3d_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), src, weights, biases, dst, conv3d_info);
 }
 
-void CLConv3D::configure(const CLCompileContext &compile_context, const ICLTensor *src, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *dst, const Conv3dInfo &conv3d_info)
+void CLConv3D::configure(const CLCompileContext &compile_context,
+                         const ICLTensor        *src,
+                         const ICLTensor        *weights,
+                         const ICLTensor        *biases,
+                         ICLTensor              *dst,
+                         const Conv3dInfo       &conv3d_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
-    ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate(src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLConv3D::validate(
+        src->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), dst->info(), conv3d_info));
 
     _impl->src     = src;
     _impl->weights = weights;
@@ -62,10 +72,15 @@
     _impl->dst     = dst;
 
     _impl->op = std::make_unique<opencl::ClDirectConv3d>();
-    _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(), _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info);
+    _impl->op->configure(compile_context, _impl->src->info(), _impl->weights->info(),
+                         _impl->biases ? _impl->biases->info() : nullptr, _impl->dst->info(), conv3d_info);
 }
 
-Status CLConv3D::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv3dInfo &conv3d_info)
+Status CLConv3D::validate(const ITensorInfo *src,
+                          const ITensorInfo *weights,
+                          const ITensorInfo *biases,
+                          const ITensorInfo *dst,
+                          const Conv3dInfo  &conv3d_info)
 {
     return opencl::ClDirectConv3d::validate(src, weights, biases, dst, conv3d_info);
 }
diff --git a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
index b3efe5c..2298f2a 100644
--- a/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
+++ b/src/runtime/CL/functions/CLConvertFullyConnectedWeights.cpp
@@ -27,33 +27,37 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h"
 
 namespace arm_compute
 {
 struct CLConvertFullyConnectedWeights::Impl
 {
-    const ICLTensor                                        *src{ nullptr };
-    ICLTensor                                              *dst{ nullptr };
-    std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{ nullptr };
+    const ICLTensor                                        *src{nullptr};
+    ICLTensor                                              *dst{nullptr};
+    std::unique_ptr<opencl::ClConvertFullyConnectedWeights> op{nullptr};
 };
-CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights()
-    : _impl(std::make_unique<Impl>())
+CLConvertFullyConnectedWeights::CLConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>())
 {
 }
 CLConvertFullyConnectedWeights::~CLConvertFullyConnectedWeights() = default;
 
-void CLConvertFullyConnectedWeights::configure(const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
-                                               DataLayout data_layout)
+void CLConvertFullyConnectedWeights::configure(const ICLTensor   *input,
+                                               ICLTensor         *output,
+                                               const TensorShape &original_input_shape,
+                                               DataLayout         data_layout)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, original_input_shape, data_layout);
 }
 
-void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const TensorShape &original_input_shape,
-                                               DataLayout data_layout)
+void CLConvertFullyConnectedWeights::configure(const CLCompileContext &compile_context,
+                                               const ICLTensor        *input,
+                                               ICLTensor              *output,
+                                               const TensorShape      &original_input_shape,
+                                               DataLayout              data_layout)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, output, original_input_shape, data_layout);
@@ -63,8 +67,10 @@
     _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), original_input_shape, data_layout);
 }
 
-Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
-                                                DataLayout data_layout)
+Status CLConvertFullyConnectedWeights::validate(const ITensorInfo *input,
+                                                const ITensorInfo *output,
+                                                const TensorShape &original_input_shape,
+                                                DataLayout         data_layout)
 {
     return opencl::ClConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout);
 }
diff --git a/src/runtime/CL/functions/CLConvolutionLayer.cpp b/src/runtime/CL/functions/CLConvolutionLayer.cpp
index f3c05ad..7767b45 100644
--- a/src/runtime/CL/functions/CLConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLConvolutionLayer.cpp
@@ -28,11 +28,11 @@
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClConv2d.h"
-
-#include "src/common/utils/Log.h"
 #include "support/Cast.h"
 
 namespace arm_compute
@@ -43,41 +43,59 @@
 {
     MemoryGroup                          memory_group{};
     std::shared_ptr<IMemoryManager>      memory_manager{};
-    std::unique_ptr<opencl::IClOperator> op{ nullptr };
+    std::unique_ptr<opencl::IClOperator> op{nullptr};
     ITensorPack                          run_pack{};
     ITensorPack                          prep_pack{};
     WorkspaceData<CLTensor>              workspace{};
     experimental::MemoryRequirements     aux_mem_req{};
-    std::unique_ptr<IFunction>           func{ nullptr };
+    std::unique_ptr<IFunction>           func{nullptr};
 };
 
-CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _impl(std::make_unique<Impl>())
+CLConvolutionLayer::CLConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
 {
     _impl->memory_manager = std::move(memory_manager);
 }
 
 CLConvolutionLayer::~CLConvolutionLayer() = default;
 
-void CLConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CLConvolutionLayer::configure(ICLTensor                 *input,
+                                   const ICLTensor           *weights,
+                                   const ICLTensor           *biases,
+                                   ICLTensor                 *output,
+                                   const PadStrideInfo       &conv_info,
+                                   const WeightsInfo         &weights_info,
+                                   const Size2D              &dilation,
+                                   const ActivationLayerInfo &act_info,
+                                   bool                       enable_fast_math,
+                                   unsigned int               num_groups)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info,
+              dilation, act_info, enable_fast_math, num_groups);
 }
 
-void CLConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                   const WeightsInfo &weights_info,
-                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void CLConvolutionLayer::configure(const CLCompileContext    &compile_context,
+                                   ICLTensor                 *input,
+                                   const ICLTensor           *weights,
+                                   const ICLTensor           *biases,
+                                   ICLTensor                 *output,
+                                   const PadStrideInfo       &conv_info,
+                                   const WeightsInfo         &weights_info,
+                                   const Size2D              &dilation,
+                                   const ActivationLayerInfo &act_info,
+                                   bool                       enable_fast_math,
+                                   unsigned int               num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
-                                                            enable_fast_math, num_groups));
-    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+    ARM_COMPUTE_ERROR_THROW_ON(CLConvolutionLayer::validate(
+        input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
+        weights_info, dilation, act_info, enable_fast_math, num_groups));
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+                           enable_fast_math, num_groups);
 
     const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
 
-    switch(opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info,
-                                                    weights_info, CLScheduler::get().target()))
+    switch (opencl::ClConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv2d_info,
+                                                     weights_info, CLScheduler::get().target()))
     {
         case ConvolutionMethod::WINOGRAD:
         case ConvolutionMethod::DIRECT:
@@ -85,7 +103,8 @@
         case ConvolutionMethod::GEMM:
         {
             auto f = std::make_unique<opencl::ClConv2d>();
-            f->configure(compile_context, input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
+            f->configure(compile_context, input->info(), weights->info(),
+                         ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
             _impl->op = std::move(f);
             break;
         }
@@ -101,40 +120,52 @@
             break;
     }
 
-    if(_impl->op)
+    if (_impl->op)
     {
         _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
         _impl->aux_mem_req  = _impl->op->workspace();
-        _impl->run_pack     = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
-        _impl->prep_pack    = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
-        _impl->workspace    = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+        _impl->run_pack     = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+        _impl->prep_pack    = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+        _impl->workspace =
+            manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
     }
 }
 
-Status CLConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status CLConvolutionLayer::validate(const ITensorInfo         *input,
+                                    const ITensorInfo         *weights,
+                                    const ITensorInfo         *biases,
+                                    const ITensorInfo         *output,
+                                    const PadStrideInfo       &conv_info,
+                                    const WeightsInfo         &weights_info,
+                                    const Size2D              &dilation,
+                                    const ActivationLayerInfo &act_info,
+                                    bool                       enable_fast_math,
+                                    unsigned int               num_groups)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((num_groups != 1) && (input->data_layout() != DataLayout::NCHW),
+                                    "Grouping (num_groups != 1) with NHWC data layout is not supported");
 
     const GPUTarget  gpu_target  = CLScheduler::get().target();
     const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, num_groups);
 
-    switch(opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
+    switch (opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target))
     {
         case ConvolutionMethod::WINOGRAD:
         case ConvolutionMethod::DIRECT:
         case ConvolutionMethod::INDIRECT:
         case ConvolutionMethod::GEMM:
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                opencl::ClConv2d::validate(input, weights, biases, output, conv2d_info, weights_info));
             break;
         }
         case ConvolutionMethod::FFT:
         {
             // Validate FFT-based convolution layer
-            ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLFFTConvolutionLayer::validate(input, weights, nullptr, output, conv_info,
+                                                                        act_info, enable_fast_math));
             break;
         }
         default:
@@ -145,8 +176,15 @@
     return Status{};
 }
 
-ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                             const WeightsInfo &weights_info, const ActivationLayerInfo &act_info, const GPUTarget gpu_target, const Size2D &dilation, bool enable_fast_math)
+ConvolutionMethod CLConvolutionLayer::get_convolution_method(const ITensorInfo         *input,
+                                                             const ITensorInfo         *weights,
+                                                             const ITensorInfo         *output,
+                                                             const PadStrideInfo       &conv_info,
+                                                             const WeightsInfo         &weights_info,
+                                                             const ActivationLayerInfo &act_info,
+                                                             const GPUTarget            gpu_target,
+                                                             const Size2D              &dilation,
+                                                             bool                       enable_fast_math)
 {
     const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, enable_fast_math, 1);
     return opencl::ClConv2d::get_convolution_method(input, weights, output, conv2d_info, weights_info, gpu_target);
@@ -158,7 +196,7 @@
 
     MemoryGroupResourceScope scope_mg(_impl->memory_group);
 
-    if(_impl->func)
+    if (_impl->func)
     {
         _impl->func->run();
     }
@@ -170,7 +208,7 @@
 
 void CLConvolutionLayer::prepare()
 {
-    if(_impl->func)
+    if (_impl->func)
     {
         _impl->func->prepare();
     }
diff --git a/src/runtime/CL/functions/CLCopy.cpp b/src/runtime/CL/functions/CLCopy.cpp
index 56400b6..a4f2b06 100644
--- a/src/runtime/CL/functions/CLCopy.cpp
+++ b/src/runtime/CL/functions/CLCopy.cpp
@@ -27,10 +27,10 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCopy.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCopy.h"
 
 #include <utility>
 
@@ -38,16 +38,15 @@
 {
 struct CLCopy::Impl
 {
-    const ICLTensor                *src{ nullptr };
-    ICLTensor                      *dst{ nullptr };
-    std::unique_ptr<opencl::ClCopy> op{ nullptr };
+    const ICLTensor                *src{nullptr};
+    ICLTensor                      *dst{nullptr};
+    std::unique_ptr<opencl::ClCopy> op{nullptr};
 };
 
-CLCopy::CLCopy()
-    : _impl(std::make_unique<Impl>())
+CLCopy::CLCopy() : _impl(std::make_unique<Impl>())
 {
 }
-CLCopy::CLCopy(CLCopy &&) = default;
+CLCopy::CLCopy(CLCopy &&)            = default;
 CLCopy &CLCopy::operator=(CLCopy &&) = default;
 CLCopy::~CLCopy()                    = default;
 
diff --git a/src/runtime/CL/functions/CLCrop.cpp b/src/runtime/CL/functions/CLCrop.cpp
index 35ea17c..fc29c43 100644
--- a/src/runtime/CL/functions/CLCrop.cpp
+++ b/src/runtime/CL/functions/CLCrop.cpp
@@ -27,10 +27,10 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCrop.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCrop.h"
 
 #include <utility>
 
@@ -38,27 +38,38 @@
 {
 struct CLCrop::Impl
 {
-    const ICLTensor                *src{ nullptr };
-    ICLTensor                      *dst{ nullptr };
-    std::unique_ptr<opencl::ClCrop> op{ nullptr };
+    const ICLTensor                *src{nullptr};
+    ICLTensor                      *dst{nullptr};
+    std::unique_ptr<opencl::ClCrop> op{nullptr};
 };
 
-CLCrop::CLCrop()
-    : _impl(std::make_unique<Impl>())
+CLCrop::CLCrop() : _impl(std::make_unique<Impl>())
 {
 }
-CLCrop::CLCrop(CLCrop &&) = default;
+CLCrop::CLCrop(CLCrop &&)            = default;
 CLCrop &CLCrop::operator=(CLCrop &&) = default;
 CLCrop::~CLCrop()                    = default;
 
-void CLCrop::configure(const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
-                       Window *dst_window)
+void CLCrop::configure(const ICLTensor *src,
+                       ICLTensor       *dst,
+                       Coordinates2D    start,
+                       Coordinates2D    end,
+                       uint32_t         batch_index,
+                       float            extrapolation_value,
+                       Window          *dst_window)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value, dst_window);
+    configure(CLKernelLibrary::get().get_compile_context(), src, dst, start, end, batch_index, extrapolation_value,
+              dst_window);
 }
 
-void CLCrop::configure(const CLCompileContext &compile_context, const ICLTensor *src, ICLTensor *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value,
-                       Window *dst_window)
+void CLCrop::configure(const CLCompileContext &compile_context,
+                       const ICLTensor        *src,
+                       ICLTensor              *dst,
+                       Coordinates2D           start,
+                       Coordinates2D           end,
+                       uint32_t                batch_index,
+                       float                   extrapolation_value,
+                       Window                 *dst_window)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
     ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window);
@@ -67,10 +78,17 @@
     _impl->dst = dst;
 
     _impl->op = std::make_unique<opencl::ClCrop>();
-    _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index, extrapolation_value, dst_window);
+    _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), start, end, batch_index,
+                         extrapolation_value, dst_window);
 }
 
-Status CLCrop::validate(const ITensorInfo *input, const ITensorInfo *output, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window)
+Status CLCrop::validate(const ITensorInfo *input,
+                        const ITensorInfo *output,
+                        Coordinates2D      start,
+                        Coordinates2D      end,
+                        uint32_t           batch_index,
+                        float              extrapolation_value,
+                        Window            *dst_window)
 {
     return opencl::ClCrop::validate(input, output, start, end, batch_index, extrapolation_value, dst_window);
 }
diff --git a/src/runtime/CL/functions/CLCropResize.cpp b/src/runtime/CL/functions/CLCropResize.cpp
index d8fc38d..821412b 100644
--- a/src/runtime/CL/functions/CLCropResize.cpp
+++ b/src/runtime/CL/functions/CLCropResize.cpp
@@ -25,19 +25,26 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 
-#include "src/common/utils/Log.h"
-
 #include <cstddef>
 
 namespace arm_compute
 {
 namespace
 {
-inline void configure_crop(const ICLTensor *input, ICLTensor *crop_boxes, ICLTensor *box_ind, ICLTensor *output, uint32_t crop_box_ind, Coordinates &start, Coordinates &end, uint32_t &batch_index)
+inline void configure_crop(const ICLTensor *input,
+                           ICLTensor       *crop_boxes,
+                           ICLTensor       *box_ind,
+                           ICLTensor       *output,
+                           uint32_t         crop_box_ind,
+                           Coordinates     &start,
+                           Coordinates     &end,
+                           uint32_t        &batch_index)
 {
     batch_index = *(reinterpret_cast<int32_t *>(box_ind->ptr_to_element(Coordinates(crop_box_ind))));
 
@@ -50,30 +57,48 @@
     // The normalized coordinates are scaled to retrieve the floating point image coordinates which are rounded to integers.
     start = Coordinates(std::floor(x0 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
                         std::floor(y0 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
-    end = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
-                      std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
-    const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1, static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
+    end   = Coordinates(std::floor(x1 * (input->info()->tensor_shape()[1] - 1) + 0.5f),
+                        std::floor(y1 * (input->info()->tensor_shape()[2] - 1) + 0.5f));
+    const TensorShape out_shape(input->info()->tensor_shape()[0], static_cast<uint32_t>(abs(end[0] - start[0])) + 1,
+                                static_cast<uint32_t>(abs(end[1] - start[1])) + 1);
     output->info()->set_tensor_shape(out_shape);
 }
 } // namespace
 
 CLCropResize::CLCropResize()
-    : _input(nullptr), _boxes(nullptr), _box_ind(nullptr), _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _scale(), _copy(), _crop_results(), _scaled_results(), _internal_functions()
+    : _input(nullptr),
+      _boxes(nullptr),
+      _box_ind(nullptr),
+      _output(nullptr),
+      _num_boxes(0),
+      _method(),
+      _extrapolation_value(0),
+      _scale(),
+      _copy(),
+      _crop_results(),
+      _scaled_results(),
+      _internal_functions()
 {
 }
 
 CLCropResize::~CLCropResize() = default;
 
-Status CLCropResize::validate(const ITensorInfo *input, ITensorInfo *boxes, ITensorInfo *box_ind, const ITensorInfo *output,
-                              Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+Status CLCropResize::validate(const ITensorInfo  *input,
+                              ITensorInfo        *boxes,
+                              ITensorInfo        *box_ind,
+                              const ITensorInfo  *output,
+                              Coordinates2D       crop_size,
+                              InterpolationPolicy method,
+                              float               extrapolation_value)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
     ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
     ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[0] != 4);
     ARM_COMPUTE_RETURN_ERROR_ON(boxes->tensor_shape()[1] != box_ind->tensor_shape()[0]);
     TensorInfo temp_info;
-    ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, { 0, 0 }, { 1, 1 }, input->dimension(3) - 1, extrapolation_value));
-    if(output->total_size() > 0)
+    ARM_COMPUTE_RETURN_ON_ERROR(CLCrop::validate(input->clone().get(), &temp_info, {0, 0}, {1, 1},
+                                                 input->dimension(3) - 1, extrapolation_value));
+    if (output->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -83,20 +108,34 @@
     return Status{};
 }
 
-void CLCropResize::configure(const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
-                             InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const ICLTensor    *input,
+                             ICLTensor          *boxes,
+                             ICLTensor          *box_ind,
+                             ICLTensor          *output,
+                             Coordinates2D       crop_size,
+                             InterpolationPolicy method,
+                             float               extrapolation_value)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method, extrapolation_value);
+    configure(CLKernelLibrary::get().get_compile_context(), input, boxes, box_ind, output, crop_size, method,
+              extrapolation_value);
 }
 
-void CLCropResize::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *boxes, ICLTensor *box_ind, ICLTensor *output, Coordinates2D crop_size,
-                             InterpolationPolicy method, float extrapolation_value)
+void CLCropResize::configure(const CLCompileContext &compile_context,
+                             const ICLTensor        *input,
+                             ICLTensor              *boxes,
+                             ICLTensor              *box_ind,
+                             ICLTensor              *output,
+                             Coordinates2D           crop_size,
+                             InterpolationPolicy     method,
+                             float                   extrapolation_value)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, boxes, box_ind);
-    ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+    ARM_COMPUTE_ERROR_THROW_ON(CLCropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(),
+                                                      crop_size, method, extrapolation_value));
     ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value);
 
-    TensorShape output_shape = TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
+    TensorShape output_shape =
+        TensorShape(input->info()->tensor_shape()[0], crop_size.x, crop_size.y, boxes->info()->tensor_shape()[1]);
     auto_init_if_empty(*output->info(), output_shape, 1, DataType::F32);
 
     _num_boxes = boxes->info()->tensor_shape()[1];
@@ -122,7 +161,7 @@
     // kernels used for cropping and scaling.
     _boxes->map(CLScheduler::get().queue());
     _box_ind->map(CLScheduler::get().queue());
-    for(unsigned int num_box = 0; num_box < _num_boxes; ++num_box)
+    for (unsigned int num_box = 0; num_box < _num_boxes; ++num_box)
     {
         auto       crop_tensor = std::make_unique<CLTensor>();
         TensorInfo crop_result_info(1, DataType::F32);
@@ -143,7 +182,9 @@
         configure_crop(_input, _boxes, _box_ind, _crop_results[num_box].get(), num_box, start, end, batch_index);
 
         auto scale_kernel = std::make_unique<CLScale>();
-        scale_kernel->configure(compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT });
+        scale_kernel->configure(
+            compile_context, _crop_results[num_box].get(), _scaled_results[num_box].get(),
+            ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT});
         _scale.emplace_back(std::move(scale_kernel));
 
         Window win = calculate_max_window(*_output->info());
@@ -159,28 +200,50 @@
         bool is_width_flipped  = end[0] < start[0];
         bool is_height_flipped = end[1] < start[1];
         /** The number of rows out of bounds at the start and end of _crop_results[num_box].get(). */
-        std::array<int32_t, 2> rows_out_of_bounds{ 0 };
+        std::array<int32_t, 2> rows_out_of_bounds{0};
         /** The number of columns out of bounds at the start and end of _crop_results[num_box].get(). */
-        std::array<int32_t, 2> cols_out_of_bounds{ 0 };
-        if(is_height_flipped)
+        std::array<int32_t, 2> cols_out_of_bounds{0};
+        if (is_height_flipped)
         {
-            rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(start[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0;
-            rows_out_of_bounds[1] = end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0;
+            rows_out_of_bounds[0] = start[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+                                        ? std::min(start[1] - _input->info()->dimension(2) + 1,
+                                                   _crop_results[num_box].get()->info()->dimension(2))
+                                        : 0;
+            rows_out_of_bounds[1] =
+                end[1] < 0 ? std::min(-end[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)))
+                           : 0;
         }
         else
         {
-            rows_out_of_bounds[0] = start[1] < 0 ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2))) : 0;
-            rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2)) ? std::min(end[1] - _input->info()->dimension(2) + 1, _crop_results[num_box].get()->info()->dimension(2)) : 0;
+            rows_out_of_bounds[0] =
+                start[1] < 0
+                    ? std::min(-start[1], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)))
+                    : 0;
+            rows_out_of_bounds[1] = end[1] >= static_cast<int32_t>(_input->info()->dimension(2))
+                                        ? std::min(end[1] - _input->info()->dimension(2) + 1,
+                                                   _crop_results[num_box].get()->info()->dimension(2))
+                                        : 0;
         }
-        if(is_width_flipped)
+        if (is_width_flipped)
         {
-            cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(start[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0;
-            cols_out_of_bounds[1] = end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0;
+            cols_out_of_bounds[0] = start[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+                                        ? std::min(start[0] - _input->info()->dimension(1) + 1,
+                                                   _crop_results[num_box].get()->info()->dimension(1))
+                                        : 0;
+            cols_out_of_bounds[1] =
+                end[0] < 0 ? std::min(-end[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)))
+                           : 0;
         }
         else
         {
-            cols_out_of_bounds[0] = start[0] < 0 ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1))) : 0;
-            cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1)) ? std::min(end[0] - _input->info()->dimension(1) + 1, _crop_results[num_box].get()->info()->dimension(1)) : 0;
+            cols_out_of_bounds[0] =
+                start[0] < 0
+                    ? std::min(-start[0], static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)))
+                    : 0;
+            cols_out_of_bounds[1] = end[0] >= static_cast<int32_t>(_input->info()->dimension(1))
+                                        ? std::min(end[0] - _input->info()->dimension(1) + 1,
+                                                   _crop_results[num_box].get()->info()->dimension(1))
+                                        : 0;
         }
 
         Window full_window = calculate_max_window(*_crop_results[num_box].get()->info());
@@ -203,67 +266,84 @@
         // Fill all _crop_results[num_box].get() rows that have no elements that are within the input bounds
         // with the extrapolation value using memset.
         // First for the rows before the in bounds rows.
-        if(rows_out_of_bounds[0] > 0)
+        if (rows_out_of_bounds[0] > 0)
         {
             Window slice_fill_rows_before(full_window);
             slice_fill_rows_before.set(2, Window::Dimension(0, rows_out_of_bounds[0], 1));
             auto kernel = std::make_unique<CLFill>();
-            kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_before);
+            kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+                              &slice_fill_rows_before);
             //_internal_functions.emplace_back(std::move(kernel));
             _internal_functions.push_back(std::move(kernel));
         }
 
         Window slice_in(full_window);
-        slice_in.set(2, Window::Dimension(rows_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1));
-        slice_in.set(1, Window::Dimension(cols_out_of_bounds[0], _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1));
+        slice_in.set(2,
+                     Window::Dimension(rows_out_of_bounds[0],
+                                       _crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], 1));
+        slice_in.set(1,
+                     Window::Dimension(cols_out_of_bounds[0],
+                                       _crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], 1));
 
-        int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) - rows_out_of_bounds[0] - rows_out_of_bounds[1];
-        if(rows_in_bounds > 0)
+        int rows_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(2)) -
+                             rows_out_of_bounds[0] - rows_out_of_bounds[1];
+        if (rows_in_bounds > 0)
         {
             // Fill all elements that share a row with an in bounds element with the extrapolation value.
-            if(cols_out_of_bounds[0] > 0)
+            if (cols_out_of_bounds[0] > 0)
             {
                 Window slice_fill_cols_before(slice_in);
                 slice_fill_cols_before.set(1, Window::Dimension(0, cols_out_of_bounds[0], 1));
                 auto kernel = std::make_unique<CLFill>();
-                kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_before);
+                kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+                                  &slice_fill_cols_before);
                 //_internal_functions.emplace_back(std::move(kernel));
                 _internal_functions.push_back(std::move(kernel));
             }
 
-            if(cols_out_of_bounds[1] > 0)
+            if (cols_out_of_bounds[1] > 0)
             {
                 Window slice_fill_cols_after(slice_in);
-                slice_fill_cols_after.set(1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(1), 1));
+                slice_fill_cols_after.set(
+                    1, Window::Dimension(_crop_results[num_box].get()->info()->dimension(1) - cols_out_of_bounds[1],
+                                         _crop_results[num_box].get()->info()->dimension(1), 1));
                 auto kernel = std::make_unique<CLFill>();
-                kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_cols_after);
+                kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+                                  &slice_fill_cols_after);
                 //_internal_functions.emplace_back(std::move(kernel));
                 _internal_functions.push_back(std::move(kernel));
             }
 
             // Copy all elements within the input bounds from the input tensor.
-            int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) - cols_out_of_bounds[0] - cols_out_of_bounds[1];
-            if(cols_in_bounds > 0)
+            int cols_in_bounds = static_cast<int32_t>(_crop_results[num_box].get()->info()->dimension(1)) -
+                                 cols_out_of_bounds[0] - cols_out_of_bounds[1];
+            if (cols_in_bounds > 0)
             {
-                Coordinates2D start_in{ is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
-                                        is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0] };
-                Coordinates2D end_in{ is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
-                                      is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1 };
+                Coordinates2D start_in{
+                    is_width_flipped ? start[0] - cols_out_of_bounds[0] : start[0] + cols_out_of_bounds[0],
+                    is_height_flipped ? start[1] - rows_out_of_bounds[0] : start[1] + rows_out_of_bounds[0]};
+                Coordinates2D end_in{
+                    is_width_flipped ? start_in.x - cols_in_bounds + 1 : start_in.x + cols_in_bounds - 1,
+                    is_height_flipped ? start_in.y - rows_in_bounds + 1 : start_in.y + rows_in_bounds - 1};
                 auto kernel = std::make_unique<CLCrop>();
 
-                kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index, extrapolation_value, &slice_in);
+                kernel->configure(compile_context, _input, _crop_results[num_box].get(), start_in, end_in, batch_index,
+                                  extrapolation_value, &slice_in);
                 //_internal_functions.emplace_back(std::move(kernel));
                 _internal_functions.push_back(std::move(kernel));
             }
         }
 
         // Fill all rows after the in bounds elements with the extrapolation value.
-        if(rows_out_of_bounds[1] > 0)
+        if (rows_out_of_bounds[1] > 0)
         {
             Window slice_fill_rows_after(full_window);
-            slice_fill_rows_after.set(2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1], _crop_results[num_box].get()->info()->dimension(2), 1));
+            slice_fill_rows_after.set(
+                2, Window::Dimension(_crop_results[num_box].get()->info()->dimension(2) - rows_out_of_bounds[1],
+                                     _crop_results[num_box].get()->info()->dimension(2), 1));
             auto kernel = std::make_unique<CLFill>();
-            kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value, &slice_fill_rows_after);
+            kernel->configure(compile_context, _crop_results[num_box].get(), extrapolation_value,
+                              &slice_fill_rows_after);
             //_internal_functions.emplace_back(std::move(kernel));
             _internal_functions.push_back(std::move(kernel));
         }
@@ -277,18 +357,18 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
 
-    for(unsigned int i = 0; i < _internal_functions.size(); ++i)
+    for (unsigned int i = 0; i < _internal_functions.size(); ++i)
     {
         _internal_functions[i]->run();
     }
 
     CLScheduler::get().sync();
-    for(auto &kernel : _scale)
+    for (auto &kernel : _scale)
     {
         kernel->run();
     }
     CLScheduler::get().sync();
-    for(auto &kernel : _copy)
+    for (auto &kernel : _copy)
     {
         kernel->run();
     }
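
The reflowed ternaries and arithmetic chains in CLCropResize.cpp above follow the same 120-column limit; plausibly (once more inferred, not authoritative):

    BreakBeforeTernaryOperators: true   # '?' and ':' open the continuation lines
    BreakBeforeBinaryOperators: None    # '&&', '||', '-' stay at end of line
    AlignOperands: Align
    ContinuationIndentWidth: 4
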
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
index 4421a18..e988ab0 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayer.cpp
@@ -25,16 +25,16 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/IClOperator.h"
 #include "src/gpu/cl/operators/ClTransposedConvolution.h"
 
-#include "src/common/utils/Log.h"
-
 #include <cmath>
 #include <memory>
 #include <tuple>
@@ -44,11 +44,11 @@
 
 struct CLDeconvolutionLayer::Impl
 {
-    const ICLTensor                     *src{ nullptr };
-    const ICLTensor                     *weights{ nullptr };
-    const ICLTensor                     *biases{ nullptr };
-    ICLTensor                           *dst{ nullptr };
-    std::unique_ptr<opencl::IClOperator> op{ nullptr };
+    const ICLTensor                     *src{nullptr};
+    const ICLTensor                     *weights{nullptr};
+    const ICLTensor                     *biases{nullptr};
+    ICLTensor                           *dst{nullptr};
+    std::unique_ptr<opencl::IClOperator> op{nullptr};
 };
 
 CLDeconvolutionLayer::~CLDeconvolutionLayer() = default;
@@ -58,24 +58,35 @@
 {
 }
 
-void CLDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
-                                     const WeightsInfo &weights_info)
+void CLDeconvolutionLayer::configure(ICLTensor           *input,
+                                     ICLTensor           *weights,
+                                     const ICLTensor     *bias,
+                                     ICLTensor           *output,
+                                     const PadStrideInfo &deconv_info,
+                                     const WeightsInfo   &weights_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info, weights_info);
 }
 
-void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info,
-                                     const WeightsInfo &weights_info)
+void CLDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+                                     ICLTensor              *input,
+                                     ICLTensor              *weights,
+                                     const ICLTensor        *bias,
+                                     ICLTensor              *output,
+                                     const PadStrideInfo    &deconv_info,
+                                     const WeightsInfo      &weights_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info, weights_info);
 
-    switch(CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(), deconv_info, weights_info))
+    switch (CLDeconvolutionLayer::get_deconvolution_method(input->info(), weights->info(), nullptr, output->info(),
+                                                           deconv_info, weights_info))
     {
         case DeconvolutionMethod::DIRECT:
         {
             auto op = std::make_unique<opencl::ClTransposedConvolution>();
-            op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info);
+            op->configure(compile_context, input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr,
+                          output->info(), deconv_info);
 
             _impl->src     = input;
             _impl->weights = weights;
@@ -105,22 +116,28 @@
     }
 }
 
-Status CLDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
-                                      const WeightsInfo &weights_info)
+Status CLDeconvolutionLayer::validate(const ITensorInfo   *input,
+                                      const ITensorInfo   *weights,
+                                      const ITensorInfo   *bias,
+                                      ITensorInfo         *output,
+                                      const PadStrideInfo &deconv_info,
+                                      const WeightsInfo   &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    switch(CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
+    switch (CLDeconvolutionLayer::get_deconvolution_method(input, weights, bias, output, deconv_info, weights_info))
     {
         case DeconvolutionMethod::DIRECT:
         {
             // Validate transposed convolution operator
-            ARM_COMPUTE_RETURN_ON_ERROR(opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                opencl::ClTransposedConvolution::validate(input, weights, bias, output, deconv_info));
             break;
         }
         case DeconvolutionMethod::UPSCALE_CONV2D:
         {
             // Validate direct convolution layer
-            ARM_COMPUTE_RETURN_ON_ERROR(CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLDirectDeconvolutionLayer::validate(input, weights, bias, output, deconv_info, weights_info));
             break;
         }
         case DeconvolutionMethod::GEMM:
@@ -137,12 +154,16 @@
     return Status{};
 }
 
-DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &deconv_info,
-                                                                   const WeightsInfo &weights_info)
+DeconvolutionMethod CLDeconvolutionLayer::get_deconvolution_method(const ITensorInfo   *input,
+                                                                   const ITensorInfo   *weights,
+                                                                   const ITensorInfo   *bias,
+                                                                   ITensorInfo         *output,
+                                                                   const PadStrideInfo &deconv_info,
+                                                                   const WeightsInfo   &weights_info)
 {
     ARM_COMPUTE_UNUSED(output, bias, weights_info);
 
-    if(is_data_type_quantized_per_channel(weights->data_type()))
+    if (is_data_type_quantized_per_channel(weights->data_type()))
     {
         return DeconvolutionMethod::UPSCALE_CONV2D;
     }
@@ -154,11 +175,12 @@
     const size_t idx_n = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
     const size_t ofm   = weights->tensor_shape()[idx_n];
 
-    if(weights->dimension(idx_w) != deconv_info.stride().first || weights->dimension(idx_h) != deconv_info.stride().second)
+    if (weights->dimension(idx_w) != deconv_info.stride().first ||
+        weights->dimension(idx_h) != deconv_info.stride().second)
     {
         // We observe better performance for FP32 types only when ofm <= 16.
         // A better heuristic is required for selecting the method for FP16 data types.
-        if(input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16)))
+        if (input->data_layout() == DataLayout::NHWC && !((input->data_type() == DataType::F32) && (ofm > 16)))
         {
             return DeconvolutionMethod::DIRECT;
         }
@@ -175,7 +197,7 @@
 {
     prepare();
 
-    if(_impl->op != nullptr)
+    if (_impl->op != nullptr)
     {
         // Optimized Operator will be used
         ITensorPack pack;
@@ -195,7 +217,7 @@
 
 void CLDeconvolutionLayer::prepare()
 {
-    if(_impl->op == nullptr)
+    if (_impl->op == nullptr)
     {
         _function->prepare();
     }
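The hunks above show the two substitutions that dominate this patch: braced initializers lose their inner padding ("{ nullptr }" becomes "{nullptr}") and control-flow keywords gain a space before the opening parenthesis ("if(", "switch(" become "if (", "switch ("). The revised configuration file is not part of this delivery, so the option names below are inferred from the output rather than quoted from the config. A minimal compilable sketch of the new style:

    // Inferred options (assumptions, not taken from the actual .clang-format):
    //   Cpp11BracedListStyle: true            -> {nullptr} instead of { nullptr }
    //   SpaceBeforeParens: ControlStatements  -> "if (" instead of "if("
    struct Impl
    {
        const void *src{nullptr}; // previously formatted as { nullptr }
    };

    int main()
    {
        Impl impl;
        if (impl.src == nullptr) // previously: if(impl.src == nullptr)
        {
            return 0;
        }
        return 1;
    }

The exploded one-parameter-per-line signatures with aligned types likewise suggest BinPackParameters: false, applied whenever a declaration exceeds the column limit.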
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index 0b428f5..b92bf90 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -27,22 +27,21 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
-#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
 
 namespace arm_compute
 {
 CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT
-    : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()),
-      _fill(),
-      _output(nullptr)
+    : _upsample(std::make_unique<CLDeconvolutionLayerUpsampleKernel>()), _fill(), _output(nullptr)
 {
 }
 
 CLDeconvolutionLayerUpsample::~CLDeconvolutionLayerUpsample() = default;
 
-Status CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info)
+Status
+CLDeconvolutionLayerUpsample::validate(const ITensorInfo *input, const ITensorInfo *output, const PadStrideInfo &info)
 {
     return CLDeconvolutionLayerUpsampleKernel::validate(input, output, info);
 }
@@ -52,13 +51,17 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
 }
 
-void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PadStrideInfo &info)
+void CLDeconvolutionLayerUpsample::configure(const CLCompileContext &compile_context,
+                                             ICLTensor              *input,
+                                             ICLTensor              *output,
+                                             const PadStrideInfo    &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, output, info);
 
     _output = output;
-    _fill.configure(compile_context, _output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
+    _fill.configure(compile_context, _output,
+                    PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
     _upsample->configure(compile_context, input, _output, info);
 }
 
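Besides the signature reflow, this file shows the include reordering applied throughout the patch: project-local headers are merged into one alphabetised block, so "src/common/utils/Log.h" now sorts ahead of "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h". This is consistent with IncludeBlocks: Regroup plus suitable IncludeCategories, though that is again an inference. Standard headers stand in for the project ones in this compilable sketch:

    // Assumed: IncludeBlocks: Regroup, includes sorted lexicographically within
    // each category; <algorithm>/<memory> stand in for the two project headers.
    #include <algorithm>
    #include <memory>

    int main()
    {
        auto p = std::make_unique<int>(0); // uses <memory>, so the include is needed
        return std::min(*p, 0);            // uses <algorithm>
    }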
diff --git a/src/runtime/CL/functions/CLDepthConvertLayer.cpp b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
index cac3f51..6d2fea9 100644
--- a/src/runtime/CL/functions/CLDepthConvertLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthConvertLayer.cpp
@@ -26,10 +26,10 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClCast.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClCast.h"
 
 #include <utility>
 
@@ -37,16 +37,15 @@
 {
 struct CLDepthConvertLayer::Impl
 {
-    const ICLTensor                *src{ nullptr };
-    ICLTensor                      *dst{ nullptr };
-    std::unique_ptr<opencl::ClCast> op{ nullptr };
+    const ICLTensor                *src{nullptr};
+    ICLTensor                      *dst{nullptr};
+    std::unique_ptr<opencl::ClCast> op{nullptr};
 };
 
-CLDepthConvertLayer::CLDepthConvertLayer()
-    : _impl(std::make_unique<Impl>())
+CLDepthConvertLayer::CLDepthConvertLayer() : _impl(std::make_unique<Impl>())
 {
 }
-CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&) = default;
+CLDepthConvertLayer::CLDepthConvertLayer(CLDepthConvertLayer &&)            = default;
 CLDepthConvertLayer &CLDepthConvertLayer::operator=(CLDepthConvertLayer &&) = default;
 CLDepthConvertLayer::~CLDepthConvertLayer()                                 = default;
 
@@ -55,7 +54,11 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, policy, shift);
 }
 
-void CLDepthConvertLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, ConvertPolicy policy, uint32_t shift)
+void CLDepthConvertLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    ICLTensor              *output,
+                                    ConvertPolicy           policy,
+                                    uint32_t                shift)
 {
     ARM_COMPUTE_UNUSED(shift);
     ARM_COMPUTE_LOG_PARAMS(input, output, policy, shift);
@@ -70,7 +73,8 @@
     _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), policy);
 }
 
-Status CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+Status
+CLDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(shift != 0);
     return opencl::ClCast::validate(input, output, policy);
@@ -78,7 +82,7 @@
 
 void CLDepthConvertLayer::run()
 {
-    ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+    ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
     _impl->op->run(pack);
 }
 } // namespace arm_compute
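Two more rules surface in this file. Consecutive "= default" special members are padded so the "=" signs line up, which matches AlignConsecutiveAssignments (an inference, as before). And when a full declaration such as CLDepthConvertLayer::validate(...) would overflow the column limit, the return type is dropped onto its own line instead of the arguments being wrapped. The identifiers below are too short to trigger either behaviour by themselves; the sketch only mirrors the resulting shapes:

    // Assumed: AlignConsecutiveAssignments enabled, ColumnLimit: 120 with the
    // break-after-return-type penalty low enough to prefer this layout.
    struct Layer
    {
        Layer()                    = default; // '=' aligned across the block
        Layer(Layer &&)            = default;
        Layer &operator=(Layer &&) = default;
        ~Layer()                   = default;

        static int validate(int input, int output);
    };

    // Shape of the "return type on its own line" break seen above:
    int
    Layer::validate(int input, int output)
    {
        return input + output;
    }

    int main()
    {
        return Layer::validate(0, 0);
    }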
diff --git a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
index 98531e7..9477c7f 100644
--- a/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthToSpaceLayer.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLDepthToSpaceLayer.h"
 
-#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDepthToSpaceLayerKernel.h"
 
 #include <utility>
 
@@ -36,7 +35,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
 }
 
-void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLDepthToSpaceLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    ICLTensor              *output,
+                                    int32_t                 block_shape)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
     auto k = std::make_unique<CLDepthToSpaceLayerKernel>();
diff --git a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
index dcb982f..873601b 100644
--- a/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDepthwiseConvolutionLayer.cpp
@@ -29,12 +29,12 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLDepthwiseConvolutionLayerNativeKernel.h"
 #include "src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h"
 #include "src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 using namespace arm_compute::misc;
@@ -63,25 +63,33 @@
 
 CLDepthwiseConvolutionLayer::~CLDepthwiseConvolutionLayer() = default;
 
-void CLDepthwiseConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                            unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(ICLTensor           *input,
+                                            const ICLTensor     *weights,
+                                            const ICLTensor     *biases,
+                                            ICLTensor           *output,
+                                            const PadStrideInfo &conv_info,
+                                            unsigned int         depth_multiplier,
+                                            ActivationLayerInfo  act_info,
+                                            const Size2D        &dilation)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, depth_multiplier,
+              act_info, dilation);
 }
 
-void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases,
-                                            ICLTensor *output, const PadStrideInfo &conv_info,
-                                            unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+void CLDepthwiseConvolutionLayer::configure(const CLCompileContext &compile_context,
+                                            ICLTensor              *input,
+                                            const ICLTensor        *weights,
+                                            const ICLTensor        *biases,
+                                            ICLTensor              *output,
+                                            const PadStrideInfo    &conv_info,
+                                            unsigned int            depth_multiplier,
+                                            ActivationLayerInfo     act_info,
+                                            const Size2D           &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
-    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(input->info(),
-                                                                     weights->info(),
-                                                                     biases != nullptr ? biases->info() : nullptr,
-                                                                     output != nullptr ? output->info() : input->info(),
-                                                                     conv_info,
-                                                                     depth_multiplier,
-                                                                     act_info,
-                                                                     dilation));
+    ARM_COMPUTE_ERROR_THROW_ON(CLDepthwiseConvolutionLayer::validate(
+        input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr,
+        output != nullptr ? output->info() : input->info(), conv_info, depth_multiplier, act_info, dilation));
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
 
     _is_quantized     = is_data_type_quantized(input->info()->data_type());
@@ -96,7 +104,7 @@
     ICLTensor       *input_to_use   = input;
     const ICLTensor *weights_to_use = weights;
     ICLTensor       *output_to_use  = output;
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _memory_group.manage(&_permuted_input);
         _memory_group.manage(&_permuted_output);
@@ -119,10 +127,12 @@
 
     CLTensor *output_multipliers_to_use = nullptr;
     CLTensor *output_shifts_to_use      = nullptr;
-    if(_is_quantized)
+    if (_is_quantized)
     {
-        const size_t idx_c       = get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
-        const size_t num_filters = (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
+        const size_t idx_c =
+            get_data_layout_dimension_index(weights->info()->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t num_filters =
+            (is_data_type_quantized_per_channel(weights->info()->data_type())) ? weights->info()->dimension(idx_c) : 1;
 
         _output_multipliers.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
         _output_shifts.allocator()->init(TensorInfo(TensorShape(num_filters), 1, DataType::S32));
@@ -132,16 +142,18 @@
     }
 
     // Get the depthwise convolution compute parameters
-    auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
-    const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
+    auto                       t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+    const DWCComputeKernelInfo dwc_native_compute_info =
+        t->configure(input_to_use->info(), weights_to_use->info(), conv_info, dilation, depth_multiplier);
 
-    const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation };
+    const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
 
     _dwc_native_kernel->set_target(gpu_target);
     _dwc_native_kernel->configure(compile_context, input_to_use, weights_to_use, biases, output_to_use,
-                                  dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use, output_shifts_to_use);
+                                  dwc_native_compute_info, conv_kernel_info, output_multipliers_to_use,
+                                  output_shifts_to_use);
 
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permuted_input.allocator()->allocate();
 
@@ -151,22 +163,27 @@
         _permuted_output.allocator()->allocate();
     }
 
-    if(_is_quantized)
+    if (_is_quantized)
     {
         _output_multipliers.allocator()->allocate();
         _output_shifts.allocator()->allocate();
     }
 }
 
-Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status CLDepthwiseConvolutionLayer::validate(const ITensorInfo   *input,
+                                             const ITensorInfo   *weights,
+                                             const ITensorInfo   *biases,
+                                             const ITensorInfo   *output,
                                              const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier, ActivationLayerInfo act_info, const Size2D &dilation)
+                                             unsigned int         depth_multiplier,
+                                             ActivationLayerInfo  act_info,
+                                             const Size2D        &dilation)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
 
     const bool in_place = input == output || output == nullptr;
-    if(in_place)
+    if (in_place)
     {
         output = input;
     }
@@ -174,21 +191,23 @@
     const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
     const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
 
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) > input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
-    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) > input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) + (weights->dimension(idx_w) - 1) * (dilation.x() - 1) >
+                                input->dimension(idx_w) + conv_info.pad_left() + conv_info.pad_right());
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) + (weights->dimension(idx_h) - 1) * (dilation.y() - 1) >
+                                input->dimension(idx_h) + conv_info.pad_top() + conv_info.pad_bottom());
 
     const GPUTarget gpu_target = CLScheduler::get().target();
 
-    const ConvolutionInfo conv_kernel_info{ conv_info, depth_multiplier, act_info, dilation };
+    const ConvolutionInfo conv_kernel_info{conv_info, depth_multiplier, act_info, dilation};
 
     const bool needs_permute = input->data_layout() == DataLayout::NCHW;
 
     const bool is_quantized = is_data_type_quantized(input->data_type());
 
     TensorInfo output_multipliers_shifts_info(TensorInfo(TensorShape(1U), 1, DataType::S32));
-    if(is_quantized)
+    if (is_quantized)
     {
-        if(is_data_type_quantized_per_channel(weights->data_type()))
+        if (is_data_type_quantized_per_channel(weights->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
 
@@ -201,40 +220,57 @@
         }
     }
 
-    if(needs_permute)
+    if (needs_permute)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(in_place, "In-place is supported only with NHWC data layout");
         TensorShape           permuted_input_shape   = input->tensor_shape();
         TensorShape           permuted_weights_shape = weights->tensor_shape();
-        const ConvolutionInfo info{ conv_info, depth_multiplier, ActivationLayerInfo(), dilation };
-        TensorShape           permuted_output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
+        const ConvolutionInfo info{conv_info, depth_multiplier, ActivationLayerInfo(), dilation};
+        TensorShape           permuted_output_shape =
+            shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
 
         permute(permuted_input_shape, PermutationVector(2U, 0U, 1U));
         permute(permuted_weights_shape, PermutationVector(2U, 0U, 1U));
         permute(permuted_output_shape, PermutationVector(2U, 0U, 1U));
 
-        const TensorInfo permuted_input   = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NHWC);
-        const TensorInfo permuted_weights = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NHWC);
-        const TensorInfo permuted_output  = output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_output_shape).set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_input = input->clone()
+                                              ->set_is_resizable(true)
+                                              .reset_padding()
+                                              .set_tensor_shape(permuted_input_shape)
+                                              .set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_weights = weights->clone()
+                                                ->set_is_resizable(true)
+                                                .reset_padding()
+                                                .set_tensor_shape(permuted_weights_shape)
+                                                .set_data_layout(DataLayout::NHWC);
+        const TensorInfo permuted_output = output->clone()
+                                               ->set_is_resizable(true)
+                                               .reset_padding()
+                                               .set_tensor_shape(permuted_output_shape)
+                                               .set_data_layout(DataLayout::NHWC);
 
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(input, &permuted_input, PermutationVector(2U, 0U, 1U)));
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(weights, &permuted_weights, PermutationVector(2U, 0U, 1U)));
 
         // Get the depthwise convolution compute parameters
-        auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
-        const DWCComputeKernelInfo dwc_native_compute_info = t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
+        auto                       t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+        const DWCComputeKernelInfo dwc_native_compute_info =
+            t->configure(&permuted_input, &permuted_weights, conv_info, dilation, depth_multiplier);
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(&permuted_input, &permuted_weights, biases, &permuted_output,
-                                                                                      dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info, &output_multipliers_shifts_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+            &permuted_input, &permuted_weights, biases, &permuted_output, dwc_native_compute_info, conv_kernel_info,
+            &output_multipliers_shifts_info, &output_multipliers_shifts_info));
         ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(&permuted_output, output, PermutationVector(1U, 2U, 0U)));
     }
     else
     {
         // Get the depthwise convolution compute parameters
-        auto t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
-        const DWCComputeKernelInfo dwc_native_compute_info = t->configure(input, weights, conv_info, dilation, depth_multiplier);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info,
-                                                                                      &output_multipliers_shifts_info));
+        auto                       t = ClDWCNativeKernelConfigurationFactory::create(gpu_target);
+        const DWCComputeKernelInfo dwc_native_compute_info =
+            t->configure(input, weights, conv_info, dilation, depth_multiplier);
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDepthwiseConvolutionLayerNativeKernel::validate(
+            input, weights, biases, output, dwc_native_compute_info, conv_kernel_info, &output_multipliers_shifts_info,
+            &output_multipliers_shifts_info));
     }
     return Status{};
 }
@@ -245,12 +281,12 @@
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_input_to_nhwc.run();
     }
     CLScheduler::get().enqueue(*_dwc_native_kernel);
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_output_to_nchw.run();
     }
@@ -258,22 +294,21 @@
 
 void CLDepthwiseConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
-        if(_is_quantized)
+        if (_is_quantized)
         {
             _output_multipliers.map();
             _output_shifts.map();
-            quantization::compute_quantized_multipliers_and_shifts(_input->info(),
-                                                                   _original_weights->info(),
-                                                                   _output != nullptr ? _output->info() : _input->info(),
-                                                                   reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
-                                                                   reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
+            quantization::compute_quantized_multipliers_and_shifts(
+                _input->info(), _original_weights->info(), _output != nullptr ? _output->info() : _input->info(),
+                reinterpret_cast<int32_t *>(_output_multipliers.ptr_to_element(Coordinates(0))),
+                reinterpret_cast<int32_t *>(_output_shifts.ptr_to_element(Coordinates(0))));
             _output_multipliers.unmap();
             _output_shifts.unmap();
         }
 
-        if(_needs_permute)
+        if (_needs_permute)
         {
             ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
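The longest hunks here show how over-long lines are now broken: binary operators stay at the end of the line (the ">" in the two ARM_COMPUTE_RETURN_ERROR_ON checks), and builder-style chains such as the clone()/set_is_resizable()/reset_padding() sequences are split one call per line, with the continuations aligned under the first member access. A compilable sketch of the chained-call shape, using a hypothetical Builder type in place of the library's TensorInfo:

    // Builder is hypothetical; it only reproduces the one-call-per-line
    // wrapping applied to the clone()/set_.../reset_... chains above.
    struct Builder
    {
        Builder &set_is_resizable(bool) { return *this; }
        Builder &reset_padding() { return *this; }
        Builder &set_data_layout(int) { return *this; }
    };

    int main()
    {
        Builder       b;
        const Builder out = b.set_is_resizable(true) // each chained call on its own line,
                                .reset_padding()     // aligned under the first access
                                .set_data_layout(0);
        (void)out;
        return 0;
    }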
diff --git a/src/runtime/CL/functions/CLDequantizationLayer.cpp b/src/runtime/CL/functions/CLDequantizationLayer.cpp
index 64c6b5d..20162a0 100644
--- a/src/runtime/CL/functions/CLDequantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLDequantizationLayer.cpp
@@ -26,22 +26,21 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClDequantize.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClDequantize.h"
 
 namespace arm_compute
 {
 struct CLDequantizationLayer::Impl
 {
-    const ICLTensor                      *src{ nullptr };
-    ICLTensor                            *dst{ nullptr };
-    std::unique_ptr<opencl::ClDequantize> op{ nullptr };
+    const ICLTensor                      *src{nullptr};
+    ICLTensor                            *dst{nullptr};
+    std::unique_ptr<opencl::ClDequantize> op{nullptr};
 };
 
-CLDequantizationLayer::CLDequantizationLayer()
-    : _impl(std::make_unique<Impl>())
+CLDequantizationLayer::CLDequantizationLayer() : _impl(std::make_unique<Impl>())
 {
 }
 CLDequantizationLayer::~CLDequantizationLayer() = default;
@@ -51,7 +50,9 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output);
 }
 
-void CLDequantizationLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output)
+void CLDequantizationLayer::configure(const CLCompileContext &compile_context,
+                                      const ICLTensor        *input,
+                                      ICLTensor              *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output);
     _impl->src = input;
diff --git a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
index 752e0e4..d6dae0d 100644
--- a/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectConvolutionLayer.cpp
@@ -28,37 +28,46 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/gpu/cl/operators/ClActivation.h"
-#include "src/gpu/cl/operators/ClDirectConv2d.h"
 
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClActivation.h"
+#include "src/gpu/cl/operators/ClDirectConv2d.h"
 
 namespace arm_compute
 {
 struct CLDirectConvolutionLayer::Impl
 {
-    const ICLTensor                        *src{ nullptr };
-    const ICLTensor                        *weights{ nullptr };
-    const ICLTensor                        *biases{ nullptr };
-    ICLTensor                              *dst{ nullptr };
-    std::unique_ptr<opencl::ClDirectConv2d> op{ nullptr };
+    const ICLTensor                        *src{nullptr};
+    const ICLTensor                        *weights{nullptr};
+    const ICLTensor                        *biases{nullptr};
+    ICLTensor                              *dst{nullptr};
+    std::unique_ptr<opencl::ClDirectConv2d> op{nullptr};
 };
 
-CLDirectConvolutionLayer::CLDirectConvolutionLayer()
-    : _impl(std::make_unique<Impl>())
+CLDirectConvolutionLayer::CLDirectConvolutionLayer() : _impl(std::make_unique<Impl>())
 {
 }
-CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&) = default;
+CLDirectConvolutionLayer::CLDirectConvolutionLayer(CLDirectConvolutionLayer &&)            = default;
 CLDirectConvolutionLayer &CLDirectConvolutionLayer::operator=(CLDirectConvolutionLayer &&) = default;
 CLDirectConvolutionLayer::~CLDirectConvolutionLayer()                                      = default;
 
-void CLDirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLDirectConvolutionLayer::configure(ICLTensor                 *input,
+                                         const ICLTensor           *weights,
+                                         const ICLTensor           *biases,
+                                         ICLTensor                 *output,
+                                         const PadStrideInfo       &conv_info,
+                                         const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
 }
 
-void CLDirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                         const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLDirectConvolutionLayer::configure(const CLCompileContext    &compile_context,
+                                         ICLTensor                 *input,
+                                         const ICLTensor           *weights,
+                                         const ICLTensor           *biases,
+                                         ICLTensor                 *output,
+                                         const PadStrideInfo       &conv_info,
+                                         const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info);
@@ -69,10 +78,15 @@
     _impl->dst     = output;
 
     _impl->op = std::make_unique<opencl::ClDirectConv2d>();
-    _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
+    _impl->op->configure(compile_context, input->info(), weights->info(),
+                         (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
 }
 
-Status CLDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status CLDirectConvolutionLayer::validate(const ITensorInfo         *input,
+                                          const ITensorInfo         *weights,
+                                          const ITensorInfo         *biases,
+                                          const ITensorInfo         *output,
+                                          const PadStrideInfo       &conv_info,
                                           const ActivationLayerInfo &act_info)
 {
     return opencl::ClDirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
@@ -87,4 +101,4 @@
     pack.add_tensor(TensorType::ACL_DST, _impl->dst);
     _impl->op->run(pack);
 }
-}
+} // namespace arm_compute
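The final hunk of this file, "}" becoming "} // namespace arm_compute", is the one purely additive rule in the patch: closing braces of namespaces now carry a comment naming the namespace, matching FixNamespaceComments: true (inferred, like the options above):

    // Assumed: FixNamespaceComments: true appends the trailing comment below.
    namespace arm_compute
    {
    int answer()
    {
        return 42;
    }
    } // namespace arm_compute

    int main()
    {
        return arm_compute::answer() == 42 ? 0 : 1;
    }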
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index 88c3c61..3717f30 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -26,15 +26,15 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
-#include "src/common/utils/Log.h"
-
 #include <memory>
 #include <tuple>
 
@@ -55,11 +55,16 @@
 {
 }
 
-Status CLDirectDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const PadStrideInfo &info,
-                                            const WeightsInfo &weights_info)
+Status CLDirectDeconvolutionLayer::validate(const ITensorInfo   *input,
+                                            const ITensorInfo   *weights,
+                                            const ITensorInfo   *bias,
+                                            ITensorInfo         *output,
+                                            const PadStrideInfo &info,
+                                            const WeightsInfo   &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
     const DataLayout data_layout = input->data_layout();
 
@@ -70,20 +75,22 @@
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_w) < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_h) < 1);
 
-    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), info);
+    auto out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h),
+                                                    weights->dimension(idx_w), weights->dimension(idx_h), info);
 
     const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
-    if(input->data_type() != weights->data_type())
+    if (input->data_type() != weights->data_type())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL || !is_data_type_quantized_asymmetric(input->data_type()));
+        ARM_COMPUTE_RETURN_ERROR_ON(weights->data_type() != DataType::QSYMM8_PER_CHANNEL ||
+                                    !is_data_type_quantized_asymmetric(input->data_type()));
     }
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
-        if(is_data_type_quantized_asymmetric(input->data_type()))
+        if (is_data_type_quantized_asymmetric(input->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         }
@@ -102,24 +109,39 @@
     unsigned int        deconv_pad_y    = 0;
     const unsigned int  stride_x        = info.stride().first;
     const unsigned int  stride_y        = info.stride().second;
-    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
-    TensorInfo          scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape).set_data_layout(data_layout));
+    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y,
+                                                                                out_dims, deconv_pad_x, deconv_pad_y);
+    TensorInfo          scale_out_info(input->clone()
+                                           ->set_is_resizable(true)
+                                           .reset_padding()
+                                           .set_tensor_shape(scale_out_shape)
+                                           .set_data_layout(data_layout));
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionLayerUpsample::validate(input, &scale_out_info, info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info));
 
     return Status{};
 }
 
-void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
-                                           const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(ICLTensor           *input,
+                                           ICLTensor           *weights,
+                                           const ICLTensor     *bias,
+                                           ICLTensor           *output,
+                                           const PadStrideInfo &info,
+                                           const WeightsInfo   &weights_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, info, weights_info);
 }
 
-void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &info,
-                                           const WeightsInfo &weights_info)
+void CLDirectDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+                                           ICLTensor              *input,
+                                           ICLTensor              *weights,
+                                           const ICLTensor        *bias,
+                                           ICLTensor              *output,
+                                           const PadStrideInfo    &info,
+                                           const WeightsInfo      &weights_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, weights_info);
@@ -141,15 +163,19 @@
     _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
     _flip_weights.configure(compile_context, weights, &_weights_flipped, &_flip_axis);
 
-    auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
+    auto out_dims =
+        deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h),
+                                        weights->info()->dimension(idx_w), weights->info()->dimension(idx_h), info);
 
     const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
+    auto_init_if_empty(*output->info(),
+                       input->info()->clone()->set_tensor_shape(output_shape).set_data_layout(data_layout));
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLDirectDeconvolutionLayer::validate(
+        input->info(), weights->info(), bias == nullptr ? nullptr : bias->info(), output->info(), info));
 
     _is_prepared = weights_info.retain_internal_weights();
 
@@ -158,7 +184,8 @@
     // Find the upsampled dimensions and the padding needed for the convolution with stride 1 in order to match output shape
     unsigned int      deconv_pad_x    = 0;
     unsigned int      deconv_pad_y    = 0;
-    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(
+        *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
 
     unsigned int deconv_pad_left  = pad_right > pad_left ? pad_right - pad_left : 0;
     unsigned int deconv_pad_right = pad_left > pad_right ? pad_left - pad_right : 0;
@@ -179,7 +206,8 @@
     _scaled_output.allocator()->init(scale_out_info);
 
     // configure scale function
-    const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
+    const PadStrideInfo upsample_info(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top,
+                                      deconv_pad_bottom, DimensionRoundingType::FLOOR);
     _scale_f.configure(compile_context, input, &_scaled_output, upsample_info);
 
     // Setup the function to convolve the upscaled output
@@ -191,7 +219,7 @@
     _flip_axis.allocator()->allocate();
     _flip_axis.map(true);
     auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
-    if(weights->info()->data_layout() == DataLayout::NHWC)
+    if (weights->info()->data_layout() == DataLayout::NHWC)
     {
         axis_data[0] = 1;
         axis_data[1] = 2;
@@ -216,7 +244,7 @@
 
 void CLDirectDeconvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
@@ -229,7 +257,7 @@
         _conv_f.prepare();
 
         // Free flipped weights
-        if(!_weights_flipped.is_used())
+        if (!_weights_flipped.is_used())
         {
             _weights_flipped.allocator()->free();
         }
diff --git a/src/runtime/CL/functions/CLElementwiseOperations.cpp b/src/runtime/CL/functions/CLElementwiseOperations.cpp
index 936b37f..d9529f0 100644
--- a/src/runtime/CL/functions/CLElementwiseOperations.cpp
+++ b/src/runtime/CL/functions/CLElementwiseOperations.cpp
@@ -26,8 +26,8 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/ICLKernel.h"
 
+#include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClAdd.h"
 #include "src/gpu/cl/operators/ClElementwiseOperations.h"
 #include "src/gpu/cl/operators/ClSub.h"
@@ -36,26 +36,30 @@
 {
 struct CLArithmeticAddition::Impl
 {
-    const ICLTensor               *src_0{ nullptr };
-    const ICLTensor               *src_1{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClAdd> op{ nullptr };
+    const ICLTensor               *src_0{nullptr};
+    const ICLTensor               *src_1{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClAdd> op{nullptr};
 };
 
-CLArithmeticAddition::CLArithmeticAddition()
-    : _impl(std::make_unique<Impl>())
+CLArithmeticAddition::CLArithmeticAddition() : _impl(std::make_unique<Impl>())
 {
 }
-CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&) = default;
+CLArithmeticAddition::CLArithmeticAddition(CLArithmeticAddition &&)            = default;
 CLArithmeticAddition &CLArithmeticAddition::operator=(CLArithmeticAddition &&) = default;
 CLArithmeticAddition::~CLArithmeticAddition()                                  = default;
 
-void CLArithmeticAddition::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticAddition::configure(
+    ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
 }
 
-void CLArithmeticAddition::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+void CLArithmeticAddition::configure(const CLCompileContext    &compile_context,
+                                     const ICLTensor           *input1,
+                                     const ICLTensor           *input2,
+                                     ICLTensor                 *output,
+                                     ConvertPolicy              policy,
                                      const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
@@ -65,7 +69,11 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
 }
 
-Status CLArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CLArithmeticAddition::validate(const ITensorInfo         *input1,
+                                      const ITensorInfo         *input2,
+                                      const ITensorInfo         *output,
+                                      ConvertPolicy              policy,
+                                      const ActivationLayerInfo &act_info)
 {
     return opencl::ClAdd::validate(input1, input2, output, policy, act_info);
 }
@@ -82,26 +90,33 @@
 
 struct CLArithmeticSubtraction::Impl
 {
-    const ICLTensor               *src_0{ nullptr };
-    const ICLTensor               *src_1{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClSub> op{ nullptr };
+    const ICLTensor               *src_0{nullptr};
+    const ICLTensor               *src_1{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClSub> op{nullptr};
 };
 
-CLArithmeticSubtraction::CLArithmeticSubtraction()
-    : _impl(std::make_unique<Impl>())
+CLArithmeticSubtraction::CLArithmeticSubtraction() : _impl(std::make_unique<Impl>())
 {
 }
-CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&) = default;
+CLArithmeticSubtraction::CLArithmeticSubtraction(CLArithmeticSubtraction &&)            = default;
 CLArithmeticSubtraction &CLArithmeticSubtraction::operator=(CLArithmeticSubtraction &&) = default;
 CLArithmeticSubtraction::~CLArithmeticSubtraction()                                     = default;
 
-void CLArithmeticSubtraction::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void CLArithmeticSubtraction::configure(const ICLTensor           *input1,
+                                        const ICLTensor           *input2,
+                                        ICLTensor                 *output,
+                                        ConvertPolicy              policy,
+                                        const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, policy, act_info);
 }
 
-void CLArithmeticSubtraction::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, ConvertPolicy policy,
+void CLArithmeticSubtraction::configure(const CLCompileContext    &compile_context,
+                                        const ICLTensor           *input1,
+                                        const ICLTensor           *input2,
+                                        ICLTensor                 *output,
+                                        ConvertPolicy              policy,
                                         const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
@@ -111,7 +126,11 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), policy, act_info);
 }
 
-Status CLArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status CLArithmeticSubtraction::validate(const ITensorInfo         *input1,
+                                         const ITensorInfo         *input2,
+                                         const ITensorInfo         *output,
+                                         ConvertPolicy              policy,
+                                         const ActivationLayerInfo &act_info)
 {
     return opencl::ClSub::validate(input1, input2, output, policy, act_info);
 }
@@ -128,26 +147,32 @@
 
 struct CLArithmeticDivision::Impl
 {
-    const ICLTensor                               *src_0{ nullptr };
-    const ICLTensor                               *src_1{ nullptr };
-    ICLTensor                                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClElementwiseDivision> op{ nullptr };
+    const ICLTensor                               *src_0{nullptr};
+    const ICLTensor                               *src_1{nullptr};
+    ICLTensor                                     *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwiseDivision> op{nullptr};
 };
 
-CLArithmeticDivision::CLArithmeticDivision()
-    : _impl(std::make_unique<Impl>())
+CLArithmeticDivision::CLArithmeticDivision() : _impl(std::make_unique<Impl>())
 {
 }
-CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&) = default;
+CLArithmeticDivision::CLArithmeticDivision(CLArithmeticDivision &&)            = default;
 CLArithmeticDivision &CLArithmeticDivision::operator=(CLArithmeticDivision &&) = default;
 CLArithmeticDivision::~CLArithmeticDivision()                                  = default;
 
-void CLArithmeticDivision::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(ICLTensor                 *input1,
+                                     ICLTensor                 *input2,
+                                     ICLTensor                 *output,
+                                     const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLArithmeticDivision::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLArithmeticDivision::configure(const CLCompileContext    &compile_context,
+                                     const ICLTensor           *input1,
+                                     const ICLTensor           *input2,
+                                     ICLTensor                 *output,
+                                     const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
@@ -156,7 +181,10 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLArithmeticDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLArithmeticDivision::validate(const ITensorInfo         *input1,
+                                      const ITensorInfo         *input2,
+                                      const ITensorInfo         *output,
+                                      const ActivationLayerInfo &act_info)
 {
     return opencl::ClElementwiseDivision::validate(input1, input2, output, act_info);
 }
@@ -173,26 +201,32 @@
 
 struct CLElementwiseMax::Impl
 {
-    const ICLTensor                          *src_0{ nullptr };
-    const ICLTensor                          *src_1{ nullptr };
-    ICLTensor                                *dst{ nullptr };
-    std::unique_ptr<opencl::ClElementwiseMax> op{ nullptr };
+    const ICLTensor                          *src_0{nullptr};
+    const ICLTensor                          *src_1{nullptr};
+    ICLTensor                                *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwiseMax> op{nullptr};
 };
 
-CLElementwiseMax::CLElementwiseMax()
-    : _impl(std::make_unique<Impl>())
+CLElementwiseMax::CLElementwiseMax() : _impl(std::make_unique<Impl>())
 {
 }
-CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&) = default;
+CLElementwiseMax::CLElementwiseMax(CLElementwiseMax &&)            = default;
 CLElementwiseMax &CLElementwiseMax::operator=(CLElementwiseMax &&) = default;
 CLElementwiseMax::~CLElementwiseMax()                              = default;
 
-void CLElementwiseMax::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::configure(ICLTensor                 *input1,
+                                 ICLTensor                 *input2,
+                                 ICLTensor                 *output,
+                                 const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLElementwiseMax::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMax::configure(const CLCompileContext    &compile_context,
+                                 ICLTensor                 *input1,
+                                 ICLTensor                 *input2,
+                                 ICLTensor                 *output,
+                                 const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
@@ -201,7 +235,10 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseMax::validate(const ITensorInfo         *input1,
+                                  const ITensorInfo         *input2,
+                                  const ITensorInfo         *output,
+                                  const ActivationLayerInfo &act_info)
 {
     return opencl::ClElementwiseMax::validate(input1, input2, output, act_info);
 }
@@ -218,26 +255,32 @@
 
 struct CLElementwiseMin::Impl
 {
-    const ICLTensor                          *src_0{ nullptr };
-    const ICLTensor                          *src_1{ nullptr };
-    ICLTensor                                *dst{ nullptr };
-    std::unique_ptr<opencl::ClElementwiseMin> op{ nullptr };
+    const ICLTensor                          *src_0{nullptr};
+    const ICLTensor                          *src_1{nullptr};
+    ICLTensor                                *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwiseMin> op{nullptr};
 };
 
-CLElementwiseMin::CLElementwiseMin()
-    : _impl(std::make_unique<Impl>())
+CLElementwiseMin::CLElementwiseMin() : _impl(std::make_unique<Impl>())
 {
 }
-CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&) = default;
+CLElementwiseMin::CLElementwiseMin(CLElementwiseMin &&)            = default;
 CLElementwiseMin &CLElementwiseMin::operator=(CLElementwiseMin &&) = default;
 CLElementwiseMin::~CLElementwiseMin()                              = default;
 
-void CLElementwiseMin::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMin::configure(ICLTensor                 *input1,
+                                 ICLTensor                 *input2,
+                                 ICLTensor                 *output,
+                                 const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLElementwiseMin::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseMin::configure(const CLCompileContext    &compile_context,
+                                 ICLTensor                 *input1,
+                                 ICLTensor                 *input2,
+                                 ICLTensor                 *output,
+                                 const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
@@ -246,7 +289,10 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseMin::validate(const ITensorInfo         *input1,
+                                  const ITensorInfo         *input2,
+                                  const ITensorInfo         *output,
+                                  const ActivationLayerInfo &act_info)
 {
     return opencl::ClElementwiseMin::validate(input1, input2, output, act_info);
 }
@@ -263,26 +309,32 @@
 
 struct CLElementwiseSquaredDiff::Impl
 {
-    const ICLTensor                                  *src_0{ nullptr };
-    const ICLTensor                                  *src_1{ nullptr };
-    ICLTensor                                        *dst{ nullptr };
-    std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{ nullptr };
+    const ICLTensor                                  *src_0{nullptr};
+    const ICLTensor                                  *src_1{nullptr};
+    ICLTensor                                        *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwiseSquaredDiff> op{nullptr};
 };
 
-CLElementwiseSquaredDiff::CLElementwiseSquaredDiff()
-    : _impl(std::make_unique<Impl>())
+CLElementwiseSquaredDiff::CLElementwiseSquaredDiff() : _impl(std::make_unique<Impl>())
 {
 }
-CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&) = default;
+CLElementwiseSquaredDiff::CLElementwiseSquaredDiff(CLElementwiseSquaredDiff &&)            = default;
 CLElementwiseSquaredDiff &CLElementwiseSquaredDiff::operator=(CLElementwiseSquaredDiff &&) = default;
 CLElementwiseSquaredDiff::~CLElementwiseSquaredDiff()                                      = default;
 
-void CLElementwiseSquaredDiff::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseSquaredDiff::configure(ICLTensor                 *input1,
+                                         ICLTensor                 *input2,
+                                         ICLTensor                 *output,
+                                         const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLElementwiseSquaredDiff::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwiseSquaredDiff::configure(const CLCompileContext    &compile_context,
+                                         ICLTensor                 *input1,
+                                         ICLTensor                 *input2,
+                                         ICLTensor                 *output,
+                                         const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
@@ -291,7 +343,10 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwiseSquaredDiff::validate(const ITensorInfo         *input1,
+                                          const ITensorInfo         *input2,
+                                          const ITensorInfo         *output,
+                                          const ActivationLayerInfo &act_info)
 {
     return opencl::ClElementwiseSquaredDiff::validate(input1, input2, output, act_info);
 }
@@ -308,26 +363,32 @@
 
 struct CLElementwisePower::Impl
 {
-    const ICLTensor                            *src_0{ nullptr };
-    const ICLTensor                            *src_1{ nullptr };
-    ICLTensor                                  *dst{ nullptr };
-    std::unique_ptr<opencl::ClElementwisePower> op{ nullptr };
+    const ICLTensor                            *src_0{nullptr};
+    const ICLTensor                            *src_1{nullptr};
+    ICLTensor                                  *dst{nullptr};
+    std::unique_ptr<opencl::ClElementwisePower> op{nullptr};
 };
 
-CLElementwisePower::CLElementwisePower()
-    : _impl(std::make_unique<Impl>())
+CLElementwisePower::CLElementwisePower() : _impl(std::make_unique<Impl>())
 {
 }
-CLElementwisePower::CLElementwisePower(CLElementwisePower &&) = default;
+CLElementwisePower::CLElementwisePower(CLElementwisePower &&)            = default;
 CLElementwisePower &CLElementwisePower::operator=(CLElementwisePower &&) = default;
 CLElementwisePower::~CLElementwisePower()                                = default;
 
-void CLElementwisePower::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwisePower::configure(ICLTensor                 *input1,
+                                   ICLTensor                 *input2,
+                                   ICLTensor                 *output,
+                                   const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLElementwisePower::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLElementwisePower::configure(const CLCompileContext    &compile_context,
+                                   ICLTensor                 *input1,
+                                   ICLTensor                 *input2,
+                                   ICLTensor                 *output,
+                                   const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
@@ -336,7 +397,10 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLElementwisePower::validate(const ITensorInfo         *input1,
+                                    const ITensorInfo         *input2,
+                                    const ITensorInfo         *output,
+                                    const ActivationLayerInfo &act_info)
 {
     return opencl::ClElementwisePower::validate(input1, input2, output, act_info);
 }
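
The pattern in the hunks above is uniform: one parameter per line with column-aligned types, right-aligned pointers, a space after control-flow keywords, and brace-init lists without inner spaces. A configuration roughly like the following would reproduce it; this is an inferred sketch only, and the option values may not match the configuration actually used:

    # Inferred .clang-format sketch -- illustrative values, not the real config.
    Language:                     Cpp
    ColumnLimit:                  120
    BreakBeforeBraces:            Allman
    SpaceBeforeParens:            ControlStatements  # "if (", "switch (", "for ("
    Cpp11BracedListStyle:         true               # "{nullptr}" rather than "{ nullptr }"
    PointerAlignment:             Right              # "ICLTensor *input1"
    AlignConsecutiveDeclarations: true               # column-aligned parameter types
    AlignConsecutiveAssignments:  true               # aligned "= default" runs
    BinPackParameters:            false              # wrapped declarations: one parameter per line
    BinPackArguments:             true               # wrapped calls still pack arguments
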
diff --git a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
index 9dcd2d1..3043c26 100644
--- a/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
+++ b/src/runtime/CL/functions/CLElementwiseUnaryLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClElementwiseUnary.h"
 
@@ -32,17 +33,16 @@
 {
 struct CLRsqrtLayer::Impl
 {
-    const ICLTensor                 *src{ nullptr };
-    ICLTensor                       *dst{ nullptr };
-    std::unique_ptr<opencl::ClRsqrt> op{ nullptr };
+    const ICLTensor                 *src{nullptr};
+    ICLTensor                       *dst{nullptr};
+    std::unique_ptr<opencl::ClRsqrt> op{nullptr};
 };
 
-CLRsqrtLayer::CLRsqrtLayer()
-    : _impl(std::make_unique<Impl>())
+CLRsqrtLayer::CLRsqrtLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&) = default;
+CLRsqrtLayer::CLRsqrtLayer(CLRsqrtLayer &&)            = default;
 CLRsqrtLayer &CLRsqrtLayer::operator=(CLRsqrtLayer &&) = default;
 CLRsqrtLayer::~CLRsqrtLayer()                          = default;
 
@@ -74,17 +74,16 @@
 
 struct CLExpLayer::Impl
 {
-    const ICLTensor               *src{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClExp> op{ nullptr };
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClExp> op{nullptr};
 };
 
-CLExpLayer::CLExpLayer()
-    : _impl(std::make_unique<Impl>())
+CLExpLayer::CLExpLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLExpLayer::CLExpLayer(CLExpLayer &&) = default;
+CLExpLayer::CLExpLayer(CLExpLayer &&)            = default;
 CLExpLayer &CLExpLayer::operator=(CLExpLayer &&) = default;
 CLExpLayer::~CLExpLayer()                        = default;
 
@@ -116,17 +115,16 @@
 
 struct CLNegLayer::Impl
 {
-    const ICLTensor               *src{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClNeg> op{ nullptr };
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClNeg> op{nullptr};
 };
 
-CLNegLayer::CLNegLayer()
-    : _impl(std::make_unique<Impl>())
+CLNegLayer::CLNegLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLNegLayer::CLNegLayer(CLNegLayer &&) = default;
+CLNegLayer::CLNegLayer(CLNegLayer &&)            = default;
 CLNegLayer &CLNegLayer::operator=(CLNegLayer &&) = default;
 CLNegLayer::~CLNegLayer()                        = default;
 
@@ -157,17 +155,16 @@
 
 struct CLSinLayer::Impl
 {
-    const ICLTensor               *src{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClSin> op{ nullptr };
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClSin> op{nullptr};
 };
 
-CLSinLayer::CLSinLayer()
-    : _impl(std::make_unique<Impl>())
+CLSinLayer::CLSinLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLSinLayer::CLSinLayer(CLSinLayer &&) = default;
+CLSinLayer::CLSinLayer(CLSinLayer &&)            = default;
 CLSinLayer &CLSinLayer::operator=(CLSinLayer &&) = default;
 CLSinLayer::~CLSinLayer()                        = default;
 
@@ -198,17 +195,16 @@
 
 struct CLAbsLayer::Impl
 {
-    const ICLTensor               *src{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClAbs> op{ nullptr };
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClAbs> op{nullptr};
 };
 
-CLAbsLayer::CLAbsLayer()
-    : _impl(std::make_unique<Impl>())
+CLAbsLayer::CLAbsLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLAbsLayer::CLAbsLayer(CLAbsLayer &&) = default;
+CLAbsLayer::CLAbsLayer(CLAbsLayer &&)            = default;
 CLAbsLayer &CLAbsLayer::operator=(CLAbsLayer &&) = default;
 CLAbsLayer::~CLAbsLayer()                        = default;
 
@@ -239,17 +235,16 @@
 
 struct CLLogLayer::Impl
 {
-    const ICLTensor               *src{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClLog> op{ nullptr };
+    const ICLTensor               *src{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClLog> op{nullptr};
 };
 
-CLLogLayer::CLLogLayer()
-    : _impl(std::make_unique<Impl>())
+CLLogLayer::CLLogLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLLogLayer::CLLogLayer(CLLogLayer &&) = default;
+CLLogLayer::CLLogLayer(CLLogLayer &&)            = default;
 CLLogLayer &CLLogLayer::operator=(CLLogLayer &&) = default;
 CLLogLayer::~CLLogLayer()                        = default;
 
@@ -280,17 +275,16 @@
 
 struct CLRoundLayer::Impl
 {
-    const ICLTensor                 *src{ nullptr };
-    ICLTensor                       *dst{ nullptr };
-    std::unique_ptr<opencl::ClRound> op{ nullptr };
+    const ICLTensor                 *src{nullptr};
+    ICLTensor                       *dst{nullptr};
+    std::unique_ptr<opencl::ClRound> op{nullptr};
 };
 
-CLRoundLayer::CLRoundLayer()
-    : _impl(std::make_unique<Impl>())
+CLRoundLayer::CLRoundLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLRoundLayer::CLRoundLayer(CLRoundLayer &&) = default;
+CLRoundLayer::CLRoundLayer(CLRoundLayer &&)            = default;
 CLRoundLayer &CLRoundLayer::operator=(CLRoundLayer &&) = default;
 CLRoundLayer::~CLRoundLayer()                          = default;
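
Every wrapper in this file repeats the same pImpl arrangement: the header holds only a std::unique_ptr to a forward-declared Impl, so the defaulted special members must be defined in the .cpp where Impl is a complete type, which is exactly why the aligned "= default" definitions above exist. A minimal sketch of the idiom, using a hypothetical MyLayer rather than ACL code:

    #include <memory>

    class MyLayer
    {
    public:
        MyLayer();
        ~MyLayer(); // must be out of line: Impl is incomplete here
        MyLayer(MyLayer &&);
        MyLayer &operator=(MyLayer &&);

    private:
        struct Impl; // defined only in the .cpp
        std::unique_ptr<Impl> _impl;
    };

    // In the .cpp, where Impl is complete:
    struct MyLayer::Impl
    {
        int state{0};
    };

    MyLayer::MyLayer() : _impl(std::make_unique<Impl>())
    {
    }
    MyLayer::~MyLayer()                     = default;
    MyLayer::MyLayer(MyLayer &&)            = default;
    MyLayer &MyLayer::operator=(MyLayer &&) = default;
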
 
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
index bd0966b..48e9ae8 100644
--- a/src/runtime/CL/functions/CLFFT1D.cpp
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -26,13 +26,13 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
 #include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
 #include "src/core/CL/kernels/CLFFTScaleKernel.h"
 #include "src/core/utils/helpers/fft.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
@@ -54,7 +54,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
 }
 
-void CLFFT1D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+void CLFFT1D::configure(const CLCompileContext &compile_context,
+                        const ICLTensor        *input,
+                        ICLTensor              *output,
+                        const FFT1DInfo        &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
@@ -77,13 +80,14 @@
     TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
     _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
     _memory_group.manage(&_digit_reversed_input);
-    _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices, digit_reverse_config);
+    _digit_reverse_kernel->configure(compile_context, input, &_digit_reversed_input, &_digit_reverse_indices,
+                                     digit_reverse_config);
 
     // Create and configure FFT kernels
     unsigned int Nx = 1;
     _num_ffts       = decomposed_vector.size();
     _fft_kernels.reserve(_num_ffts);
-    for(unsigned int i = 0; i < _num_ffts; ++i)
+    for (unsigned int i = 0; i < _num_ffts; ++i)
     {
         const unsigned int radix_for_stage = decomposed_vector.at(i);
 
@@ -93,18 +97,20 @@
         fft_kernel_info.Nx             = Nx;
         fft_kernel_info.is_first_stage = (i == 0);
         _fft_kernels.emplace_back(std::make_unique<CLFFTRadixStageKernel>());
-        _fft_kernels.back()->configure(compile_context, &_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+        _fft_kernels.back()->configure(compile_context, &_digit_reversed_input,
+                                       ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
 
         Nx *= radix_for_stage;
     }
 
     // Configure scale kernel
-    if(_run_scale)
+    if (_run_scale)
     {
         FFTScaleKernelInfo scale_config;
         scale_config.scale     = static_cast<float>(N);
         scale_config.conjugate = config.direction == FFTDirection::Inverse;
-        is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config);
+        is_c2r ? _scale_kernel->configure(compile_context, &_digit_reversed_input, output, scale_config)
+               : _scale_kernel->configure(output, nullptr, scale_config);
     }
 
     // Allocate tensors
@@ -123,7 +129,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() != 1 && input->num_channels() != 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
 
     // Check if FFT is decomposable
     const auto         supported_radix   = CLFFTRadixStageKernel::supported_radix();
@@ -132,7 +138,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() != 1 && output->num_channels() != 2);
@@ -151,13 +157,13 @@
     CLScheduler::get().enqueue(*_digit_reverse_kernel, false);
 
     // Run radix kernels
-    for(unsigned int i = 0; i < _num_ffts; ++i)
+    for (unsigned int i = 0; i < _num_ffts; ++i)
     {
         CLScheduler::get().enqueue(*_fft_kernels[i], i == (_num_ffts - 1) && !_run_scale);
     }
 
     // Run output scaling
-    if(_run_scale)
+    if (_run_scale)
     {
         CLScheduler::get().enqueue(*_scale_kernel, true);
     }
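
The configure path above splits the FFT length into a chain of radix stages and threads Nx, the product of the radices handled so far, through each kernel. The decomposition itself can be pictured as repeated division by the supported radices, largest first; this is a simplified sketch of that idea, not the actual arm_compute::helpers::fft::decompose_stages implementation:

    #include <set>
    #include <vector>

    // Simplified greedy decomposition of N into supported radix stages.
    std::vector<unsigned int> decompose(unsigned int N, const std::set<unsigned int> &supported_radix)
    {
        std::vector<unsigned int> stages;
        for (auto r = supported_radix.rbegin(); r != supported_radix.rend(); ++r)
        {
            while (N % *r == 0)
            {
                stages.push_back(*r);
                N /= *r;
            }
        }
        // An empty result signals that N is not decomposable with these radices.
        return (N == 1) ? stages : std::vector<unsigned int>{};
    }
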
diff --git a/src/runtime/CL/functions/CLFFT2D.cpp b/src/runtime/CL/functions/CLFFT2D.cpp
index 94fc411..3857046 100644
--- a/src/runtime/CL/functions/CLFFT2D.cpp
+++ b/src/runtime/CL/functions/CLFFT2D.cpp
@@ -26,16 +26,19 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
 #include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
 #include "src/core/CL/kernels/CLFFTScaleKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 CLFFT2D::CLFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+    : _memory_group(memory_manager),
+      _first_pass_func(memory_manager),
+      _second_pass_func(memory_manager),
+      _first_pass_tensor()
 {
 }
 
@@ -46,7 +49,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, config);
 }
 
-void CLFFT2D::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
+void CLFFT2D::configure(const CLCompileContext &compile_context,
+                        const ICLTensor        *input,
+                        ICLTensor              *output,
+                        const FFT2DInfo        &config)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_ERROR_THROW_ON(CLFFT2D::validate(input->info(), output->info(), config));
@@ -88,7 +94,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLFFT1D::validate(&first_pass_tensor, output, second_pass_config));
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
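
CLFFT2D runs as two chained CLFFT1D passes with _first_pass_tensor as the intermediate, which works because the 2D DFT is separable: transforming every row and then every column equals the full 2D transform. A naive standalone demonstration of that separability (plain O(N^2) DFT, no radix decomposition):

    #include <cmath>
    #include <complex>
    #include <vector>

    using cd = std::complex<double>;

    // Naive in-place 1D DFT.
    static void dft_inplace(std::vector<cd> &v)
    {
        const double    pi = std::acos(-1.0);
        const size_t    N  = v.size();
        std::vector<cd> out(N);
        for (size_t k = 0; k < N; ++k)
            for (size_t n = 0; n < N; ++n)
                out[k] += v[n] * std::polar(1.0, -2.0 * pi * double(k * n) / double(N));
        v = out;
    }

    // Rows-then-columns 1D DFTs give the full 2D DFT.
    void dft2d(std::vector<std::vector<cd>> &img)
    {
        for (auto &row : img) // first pass: 1D DFT along each row
            dft_inplace(row);
        const size_t h = img.size(), w = img[0].size();
        for (size_t x = 0; x < w; ++x) // second pass: 1D DFT along each column
        {
            std::vector<cd> col(h);
            for (size_t y = 0; y < h; ++y)
                col[y] = img[y][x];
            dft_inplace(col);
            for (size_t y = 0; y < h; ++y)
                img[y][x] = col[y];
        }
    }
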
diff --git a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
index d12e2de..3894b10 100644
--- a/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLFFTConvolutionLayer.cpp
@@ -25,10 +25,12 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CPP/CPPScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFFTDigitReverseKernel.h"
 #include "src/core/CL/kernels/CLFFTRadixStageKernel.h"
 #include "src/core/CL/kernels/CLFFTScaleKernel.h"
@@ -38,8 +40,6 @@
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/utils/helpers/fft.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace
@@ -50,11 +50,11 @@
 
     int  pad           = 0;
     bool is_decomposed = false;
-    while(!is_decomposed)
+    while (!is_decomposed)
     {
         const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
         is_decomposed                = !decomposed_vector.empty();
-        if(!is_decomposed)
+        if (!is_decomposed)
         {
             ++pad;
         }
@@ -104,17 +104,31 @@
 {
 }
 
-void CLFFTConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                      const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CLFFTConvolutionLayer::configure(ICLTensor                 *input,
+                                      const ICLTensor           *weights,
+                                      const ICLTensor           *biases,
+                                      ICLTensor                 *output,
+                                      const PadStrideInfo       &conv_info,
+                                      const ActivationLayerInfo &act_info,
+                                      bool                       enable_fast_math)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info,
+              enable_fast_math);
 }
 
-void CLFFTConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info,
-                                      const ActivationLayerInfo &act_info, bool enable_fast_math)
+void CLFFTConvolutionLayer::configure(const CLCompileContext    &compile_context,
+                                      ICLTensor                 *input,
+                                      const ICLTensor           *weights,
+                                      const ICLTensor           *biases,
+                                      ICLTensor                 *output,
+                                      const PadStrideInfo       &conv_info,
+                                      const ActivationLayerInfo &act_info,
+                                      bool                       enable_fast_math)
 {
     ARM_COMPUTE_UNUSED(enable_fast_math);
-    ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math));
+    ARM_COMPUTE_ERROR_THROW_ON(CLFFTConvolutionLayer::validate(input->info(), weights->info(),
+                                                               biases != nullptr ? biases->info() : nullptr,
+                                                               output->info(), conv_info, act_info, enable_fast_math));
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math);
 
     _original_weights = weights;
@@ -124,21 +138,24 @@
     _has_bias = biases != nullptr;
 
     // Get indices for the width and height
-    const size_t idx_width  = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height =
+        get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
 
     // Input shape, kernel size and output tile
-    const Size2D input_dims  = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
-    const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
-    const Size2D pad_valid   = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
-                                      pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+    const Size2D input_dims =
+        Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+    const Size2D kernel_size =
+        Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+    const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+                                    pad_decomposable(input_dims.y() + kernel_size.y() - 1));
     // Tensors to use
     ICLTensor       *input_to_use   = input;
     const ICLTensor *weights_to_use = weights;
     ICLTensor       *output_to_use  = _has_bias ? &_bias_output : output;
 
     // Permute bias
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         _permute_bias_func.configure(compile_context, biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
         _permuted_bias.info()->set_data_layout(DataLayout::NCHW);
@@ -146,7 +163,7 @@
 
     // Permute input if needed
     _needs_permute = input->info()->data_layout() == DataLayout::NHWC;
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _memory_group.manage(&_permuted_input);
         // Configure the function to transform the input tensor from NHWC -> NCHW
@@ -167,7 +184,7 @@
     _flip_weights_func.configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis);
 
     // Pad weights
-    const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+    const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}};
     _pad_weights_func.configure(compile_context, &_flipped_weights, &_padded_weights, padding_w);
 
     // Transform weights
@@ -175,10 +192,10 @@
     _transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo());
 
     // Pad input
-    const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+    const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}};
     _memory_group.manage(&_padded_input);
     _pad_input_func.configure(compile_context, input_to_use, &_padded_input, padding_in);
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permuted_input.allocator()->allocate();
     }
@@ -202,7 +219,8 @@
     _memory_group.manage(&_itransformed_output);
     FFT2DInfo itranform_info;
     itranform_info.direction = FFTDirection::Inverse;
-    _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+    _itransformed_output.allocator()->init(
+        _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
     _itransform_output_func.configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info);
     _output_reduced.allocator()->allocate();
 
@@ -214,25 +232,28 @@
     // Extract correct region
     const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
     const int start_top  = kernel_size.y() - conv_info.pad_top() - 1;
-    const int end_right  = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
-    const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
-    if(_has_bias)
+    const int end_right =
+        _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+    const int end_botton =
+        _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+    if (_has_bias)
     {
         _memory_group.manage(&_bias_output);
     }
-    else if(_needs_permute)
+    else if (_needs_permute)
     {
         output_to_use = &_permuted_output;
         _memory_group.manage(&_permuted_output);
     }
-    _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
+    _extract_output_func.configure(compile_context, &_reshaped_output, output_to_use,
+                                   Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
     _itransformed_output.allocator()->allocate();
 
     // Add bias
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         output_to_use = output;
-        if(_needs_permute)
+        if (_needs_permute)
         {
             output_to_use = &_permuted_output;
             _memory_group.manage(&_permuted_output);
@@ -243,7 +264,7 @@
     }
 
     // Permute output
-    if(_needs_permute)
+    if (_needs_permute)
     {
         // Configure the function to transform the convolved output to ACL's native ordering format NCHW
         _permuted_output.info()->set_data_layout(DataLayout::NCHW);
@@ -255,7 +276,7 @@
 
     // Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activation_layer_func.configure(compile_context, output, nullptr, act_info);
     }
@@ -269,8 +290,13 @@
     _flip_axis.unmap();
 }
 
-Status CLFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                       const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CLFFTConvolutionLayer::validate(const ITensorInfo         *input,
+                                       const ITensorInfo         *weights,
+                                       const ITensorInfo         *biases,
+                                       const ITensorInfo         *output,
+                                       const PadStrideInfo       &conv_info,
+                                       const ActivationLayerInfo &act_info,
+                                       bool                       enable_fast_math)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON((input->data_type() == DataType::F16) && !enable_fast_math);
@@ -287,24 +313,27 @@
     const auto strides = conv_info.stride();
     ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) ||
+                                conv_info.pad_right() != (kernel_size.x() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) ||
+                                conv_info.pad_bottom() != (kernel_size.y() / 2));
 
     // Validate biases
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
         ARM_COMPUTE_RETURN_ERROR_ON(weights->tensor_shape()[3] != biases->tensor_shape().x());
     }
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) ||
+                                    (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
 
         // Validate Activation Layer
-        if(act_info.enabled())
+        if (act_info.enabled())
         {
             ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output, nullptr, act_info));
         }
@@ -320,7 +349,7 @@
     MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Transform input
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_input_func.run();
     }
@@ -336,17 +365,17 @@
     _reshaped_output.allocator()->import_memory(_itransformed_output.cl_buffer());
     _extract_output_func.run();
     // Add bias
-    if(_has_bias)
+    if (_has_bias)
     {
         _bias_add_func.run();
     }
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_output_func.run();
     }
 
     // Run activation layer
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activation_layer_func.run();
     }
@@ -354,10 +383,10 @@
 
 void CLFFTConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         // Permute bias to NCHW
-        if(_original_bias != nullptr)
+        if (_original_bias != nullptr)
         {
             _permuted_bias.allocator()->allocate();
             _permute_bias_func.run();
@@ -366,7 +395,7 @@
 
         const ICLTensor *cur_weights = _original_weights;
         // Permute weights
-        if(_needs_permute)
+        if (_needs_permute)
         {
             ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
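
The pad_decomposable helper near the top of this file keeps growing the FFT length until it factors into supported radices, and the configure path then pads both the flipped weights and the input by that amount so the element-wise product in the frequency domain corresponds to a full spatial convolution. A self-contained sketch of the same search, with a hypothetical radix set (the real one is queried from CLFFTRadixStageKernel::supported_radix()):

    #include <set>

    // Find the smallest pad so that (N + pad) factors into the given radices.
    // The radix set {2, 3, 4, 5, 7, 8} is an assumption for illustration.
    int pad_to_decomposable(int N, const std::set<int> &radix = {2, 3, 4, 5, 7, 8})
    {
        auto decomposable = [&](int n)
        {
            for (auto r = radix.rbegin(); r != radix.rend(); ++r)
                while (n % *r == 0)
                    n /= *r;
            return n == 1;
        };
        int pad = 0;
        while (!decomposable(N + pad))
            ++pad;
        return pad;
    }
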
 
diff --git a/src/runtime/CL/functions/CLFill.cpp b/src/runtime/CL/functions/CLFill.cpp
index 6019a84..9bd96a9 100644
--- a/src/runtime/CL/functions/CLFill.cpp
+++ b/src/runtime/CL/functions/CLFill.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClFill.h"
 
@@ -36,16 +37,15 @@
 {
 struct CLFill::Impl
 {
-    const ICLTensor                *src{ nullptr };
-    ICLTensor                      *dst{ nullptr };
-    std::unique_ptr<opencl::ClFill> op{ nullptr };
+    const ICLTensor                *src{nullptr};
+    ICLTensor                      *dst{nullptr};
+    std::unique_ptr<opencl::ClFill> op{nullptr};
 };
 
-CLFill::CLFill()
-    : _impl(std::make_unique<Impl>())
+CLFill::CLFill() : _impl(std::make_unique<Impl>())
 {
 }
-CLFill::CLFill(CLFill &&) = default;
+CLFill::CLFill(CLFill &&)            = default;
 CLFill &CLFill::operator=(CLFill &&) = default;
 CLFill::~CLFill()                    = default;
 
@@ -54,7 +54,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), tensor, constant_value, dst_window);
 }
 
-void CLFill::configure(const CLCompileContext &compile_context, ICLTensor *tensor, const PixelValue &constant_value, Window *dst_window)
+void CLFill::configure(const CLCompileContext &compile_context,
+                       ICLTensor              *tensor,
+                       const PixelValue       &constant_value,
+                       Window                 *dst_window)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
 
diff --git a/src/runtime/CL/functions/CLFlattenLayer.cpp b/src/runtime/CL/functions/CLFlattenLayer.cpp
index 32fc375..ba1b537 100644
--- a/src/runtime/CL/functions/CLFlattenLayer.cpp
+++ b/src/runtime/CL/functions/CLFlattenLayer.cpp
@@ -26,8 +26,9 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/gpu/cl/operators/ClFlatten.h"
@@ -36,16 +37,15 @@
 {
 struct CLFlattenLayer::Impl
 {
-    const ICLTensor                   *src{ nullptr };
-    ICLTensor                         *dst{ nullptr };
-    std::unique_ptr<opencl::ClFlatten> op{ nullptr };
+    const ICLTensor                   *src{nullptr};
+    ICLTensor                         *dst{nullptr};
+    std::unique_ptr<opencl::ClFlatten> op{nullptr};
 };
 
-CLFlattenLayer::CLFlattenLayer()
-    : _impl(std::make_unique<Impl>())
+CLFlattenLayer::CLFlattenLayer() : _impl(std::make_unique<Impl>())
 {
 }
-CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&) = default;
+CLFlattenLayer::CLFlattenLayer(CLFlattenLayer &&)            = default;
 CLFlattenLayer &CLFlattenLayer::operator=(CLFlattenLayer &&) = default;
 CLFlattenLayer::~CLFlattenLayer()                            = default;
 
@@ -59,7 +59,8 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     _impl->src = input;
     _impl->dst = output;
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info())));
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(
+                                            misc::shape_calculator::compute_flatten_shape(input->info())));
 
     _impl->op = std::make_unique<opencl::ClFlatten>();
     _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info());
@@ -68,9 +69,10 @@
 Status CLFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
+        const TensorInfo tensor_info_output =
+            input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
     }
     return opencl::ClFlatten::validate(input, output);
@@ -83,4 +85,4 @@
     pack.add_tensor(TensorType::ACL_DST, _impl->dst);
     _impl->op->run(pack);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLFloor.cpp b/src/runtime/CL/functions/CLFloor.cpp
index 8739e18..4322219 100644
--- a/src/runtime/CL/functions/CLFloor.cpp
+++ b/src/runtime/CL/functions/CLFloor.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClFloor.h"
 
@@ -34,16 +35,15 @@
 {
 struct CLFloor::Impl
 {
-    const ICLTensor                 *src{ nullptr };
-    ICLTensor                       *dst{ nullptr };
-    std::unique_ptr<opencl::ClFloor> op{ nullptr };
+    const ICLTensor                 *src{nullptr};
+    ICLTensor                       *dst{nullptr};
+    std::unique_ptr<opencl::ClFloor> op{nullptr};
 };
 
-CLFloor::CLFloor()
-    : _impl(std::make_unique<Impl>())
+CLFloor::CLFloor() : _impl(std::make_unique<Impl>())
 {
 }
-CLFloor::CLFloor(CLFloor &&) = default;
+CLFloor::CLFloor(CLFloor &&)            = default;
 CLFloor &CLFloor::operator=(CLFloor &&) = default;
 CLFloor::~CLFloor()                     = default;
 
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index 1c162db..b30f9e7 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClFullyConnected.h"
 
@@ -35,21 +36,22 @@
 struct CLFullyConnectedLayer::Impl
 {
     MemoryGroup      memory_group{};
-    IWeightsManager *weights_manager{ nullptr };
+    IWeightsManager *weights_manager{nullptr};
 
-    std::unique_ptr<opencl::ClFullyConnected> op{ nullptr };
+    std::unique_ptr<opencl::ClFullyConnected> op{nullptr};
 
-    const ITensor *original_weights{ nullptr };
+    const ITensor *original_weights{nullptr};
 
     ITensorPack                      run_pack{};
     WorkspaceData<CLTensor>          workspace{};
     experimental::MemoryRequirements aux_mem_req{};
 
-    bool is_prepared{ false };
-    bool dynamic_weights{ false };
+    bool is_prepared{false};
+    bool dynamic_weights{false};
 };
 
-CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+CLFullyConnectedLayer::CLFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager,
+                                             IWeightsManager                *weights_manager)
     : _impl(std::make_unique<Impl>())
 {
     _impl->memory_group    = MemoryGroup(std::move(memory_manager));
@@ -58,39 +60,45 @@
 
 CLFullyConnectedLayer::~CLFullyConnectedLayer() = default;
 
-void CLFullyConnectedLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLFullyConnectedLayer::configure(const ICLTensor        *input,
+                                      const ICLTensor        *weights,
+                                      const ICLTensor        *biases,
+                                      ICLTensor              *output,
                                       FullyConnectedLayerInfo fc_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, fc_info);
 }
 
-void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLFullyConnectedLayer::configure(const CLCompileContext &compile_context,
+                                      const ICLTensor        *input,
+                                      const ICLTensor        *weights,
+                                      const ICLTensor        *biases,
+                                      ICLTensor              *output,
                                       FullyConnectedLayerInfo fc_info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(input->info(),
-                                                               weights->info(),
-                                                               biases != nullptr ? biases->info() : nullptr,
-                                                               output->info(),
-                                                               fc_info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLFullyConnectedLayer::validate(
+        input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), fc_info));
 
     _impl->op               = std::make_unique<opencl::ClFullyConnected>();
     _impl->original_weights = weights;
     _impl->is_prepared      = fc_info.retain_internal_weights;
 
-    _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info);
+    _impl->op->configure(compile_context, input->info(), weights->info(),
+                         (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info);
 
-    if(_impl->weights_manager != nullptr)
+    if (_impl->weights_manager != nullptr)
     {
         _impl->weights_manager->manage(_impl->original_weights);
     }
 
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->aux_mem_req = _impl->op->workspace();
-        _impl->run_pack    = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
-        _impl->workspace   = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
+        _impl->run_pack    = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+        _impl->workspace =
+            manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
     }
     else
     {
@@ -98,14 +106,14 @@
         _impl->run_pack.add_tensor(ACL_DST, output);
     }
 
-    _impl->dynamic_weights =
-        !weights->info()->are_values_constant() &&
-        fc_info.transpose_weights &&
-        !fc_info.are_weights_reshaped &&
-        !fc_info.retain_internal_weights;
+    _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights &&
+                             !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights;
 }
 
-Status CLFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status CLFullyConnectedLayer::validate(const ITensorInfo      *input,
+                                       const ITensorInfo      *weights,
+                                       const ITensorInfo      *biases,
+                                       const ITensorInfo      *output,
                                        FullyConnectedLayerInfo fc_info)
 {
     return opencl::ClFullyConnected::validate(input, weights, biases, output, fc_info);
@@ -113,7 +121,7 @@
 
 void CLFullyConnectedLayer::run()
 {
-    if(!_impl->dynamic_weights)
+    if (!_impl->dynamic_weights)
     {
         prepare();
     }
@@ -124,7 +132,7 @@
 
 void CLFullyConnectedLayer::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->run_pack);
 
@@ -133,13 +141,13 @@
         _impl->is_prepared = true;
 
         // Handle weights managed infrastructure
-        if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
+        if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
         {
             // Ensure that b gets marked as unused (memory released) only after the last function that uses b has finished its prepare.
             // This covers cases where multiple functions share the same b (weights).
             // When a function marks the original b as unused, we pre-mark it in the weights manager and then mark it as used again, so it is not released before its last reference.
             const ITensor *original_b = _impl->original_weights;
-            if(!original_b->is_used())
+            if (!original_b->is_used())
             {
                 _impl->weights_manager->pre_mark_as_unused(original_b);
             }
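
As reformatted above, the fully connected function is driven through configure()/run(), with an ITensorPack and managed workspace behind the scenes. A minimal caller-side sketch; the shapes, and the use of the default FullyConnectedLayerInfo, are illustrative assumptions:

    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLFullyConnectedLayer.h"

    using namespace arm_compute;

    void example()
    {
        CLScheduler::get().default_init();

        // Example shapes: 128 inputs -> 64 outputs.
        CLTensor src, weights, bias, dst;
        src.allocator()->init(TensorInfo(TensorShape(128U), 1, DataType::F32));
        weights.allocator()->init(TensorInfo(TensorShape(128U, 64U), 1, DataType::F32));
        bias.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));

        CLFullyConnectedLayer fc;
        fc.configure(&src, &weights, &bias, &dst);

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();

        fc.run();
        CLScheduler::get().sync();
    }
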
diff --git a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
index 7379e9d..e4fbf78 100644
--- a/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
+++ b/src/runtime/CL/functions/CLFuseBatchNormalization.cpp
@@ -28,9 +28,9 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFuseBatchNormalizationKernel.h"
 
 namespace arm_compute
 {
@@ -41,29 +41,52 @@
 
 CLFuseBatchNormalization::~CLFuseBatchNormalization() = default;
 
-void CLFuseBatchNormalization::configure(const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
-                                         ICLTensor *fused_weights, ICLTensor *fused_bias,
-                                         const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
-                                         float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalization::configure(const ICLTensor           *input_weights,
+                                         const ICLTensor           *bn_mean,
+                                         const ICLTensor           *bn_var,
+                                         ICLTensor                 *fused_weights,
+                                         ICLTensor                 *fused_bias,
+                                         const ICLTensor           *input_bias,
+                                         const ICLTensor           *bn_beta,
+                                         const ICLTensor           *bn_gamma,
+                                         float                      epsilon,
+                                         FuseBatchNormalizationType fbn_type)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    configure(CLKernelLibrary::get().get_compile_context(), input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+              input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
-void CLFuseBatchNormalization::configure(const CLCompileContext &compile_context, const ICLTensor *input_weights, const ICLTensor *bn_mean, const ICLTensor *bn_var,
-                                         ICLTensor *fused_weights, ICLTensor *fused_bias,
-                                         const ICLTensor *input_bias, const ICLTensor *bn_beta, const ICLTensor *bn_gamma,
-                                         float epsilon, FuseBatchNormalizationType fbn_type)
+void CLFuseBatchNormalization::configure(const CLCompileContext    &compile_context,
+                                         const ICLTensor           *input_weights,
+                                         const ICLTensor           *bn_mean,
+                                         const ICLTensor           *bn_var,
+                                         ICLTensor                 *fused_weights,
+                                         ICLTensor                 *fused_bias,
+                                         const ICLTensor           *input_bias,
+                                         const ICLTensor           *bn_beta,
+                                         const ICLTensor           *bn_gamma,
+                                         float                      epsilon,
+                                         FuseBatchNormalizationType fbn_type)
 {
-    ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
-    _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+                           epsilon, fbn_type);
+    _fuse_bn_kernel->configure(compile_context, input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias,
+                               bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
-Status CLFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                                          const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                                          const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
-                                          float epsilon, FuseBatchNormalizationType fbn_type)
+Status CLFuseBatchNormalization::validate(const ITensorInfo         *input_weights,
+                                          const ITensorInfo         *bn_mean,
+                                          const ITensorInfo         *bn_var,
+                                          const ITensorInfo         *fused_weights,
+                                          const ITensorInfo         *fused_bias,
+                                          const ITensorInfo         *input_bias,
+                                          const ITensorInfo         *bn_beta,
+                                          const ITensorInfo         *bn_gamma,
+                                          float                      epsilon,
+                                          FuseBatchNormalizationType fbn_type)
 {
-    return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    return CLFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+                                                    input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
 void CLFuseBatchNormalization::run()
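
The kernel configured here folds a batch-normalization stage into the preceding layer's weights and bias. Per output channel, the standard folding algebra is fused_w = w * gamma / sqrt(var + eps) and fused_b = (b - mean) * gamma / sqrt(var + eps) + beta; a plain scalar sketch of that math (not the kernel's actual code):

    #include <cmath>

    // Standard batch-norm folding for one output channel.
    // b is the layer bias; mean, var, beta, gamma, epsilon come from the BN stage.
    struct FusedParams
    {
        float w_scale; // multiply each weight of the channel by this
        float bias;    // replacement bias for the channel
    };

    FusedParams fuse_bn(float b, float mean, float var, float beta, float gamma, float epsilon)
    {
        const float inv_std = 1.0f / std::sqrt(var + epsilon);
        FusedParams f;
        f.w_scale = gamma * inv_std;
        f.bias    = (b - mean) * gamma * inv_std + beta;
        return f;
    }
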
diff --git a/src/runtime/CL/functions/CLGEMM.cpp b/src/runtime/CL/functions/CLGEMM.cpp
index 427ea51..871a1d6 100644
--- a/src/runtime/CL/functions/CLGEMM.cpp
+++ b/src/runtime/CL/functions/CLGEMM.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClGemm.h"
 
@@ -40,15 +41,15 @@
 
 struct CLGEMM::Impl
 {
-    const ICLTensor              *b{ nullptr };
-    std::unique_ptr<OperatorType> op{ nullptr };
+    const ICLTensor              *b{nullptr};
+    std::unique_ptr<OperatorType> op{nullptr};
     MemoryGroup                   memory_group{};
-    IWeightsManager              *weights_manager{ nullptr };
+    IWeightsManager              *weights_manager{nullptr};
     ITensorPack                   run_pack{};
     ITensorPack                   prep_pack{};
     MemoryRequirements            aux_mem_req{};
     WorkspaceData<CLTensor>       workspace_tensors{};
-    bool                          is_prepared{ false };
+    bool                          is_prepared{false};
 };
 
 CLGEMM::CLGEMM(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
@@ -60,12 +61,25 @@
 
 CLGEMM::~CLGEMM() = default;
 
-void CLGEMM::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure(const ICLTensor *a,
+                       const ICLTensor *b,
+                       const ICLTensor *c,
+                       ICLTensor       *output,
+                       float            alpha,
+                       float            beta,
+                       const GEMMInfo  &gemm_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, alpha, beta, gemm_info);
 }
 
-void CLGEMM::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info)
+void CLGEMM::configure(const CLCompileContext &compile_context,
+                       const ICLTensor        *a,
+                       const ICLTensor        *b,
+                       const ICLTensor        *c,
+                       ICLTensor              *output,
+                       float                   alpha,
+                       float                   beta,
+                       const GEMMInfo         &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
 
@@ -73,25 +87,33 @@
     _impl->op          = std::make_unique<OperatorType>();
     _impl->is_prepared = gemm_info.retain_internal_weights();
 
-    _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), alpha, beta, gemm_info);
+    _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(),
+                         alpha, beta, gemm_info);
     _impl->aux_mem_req = _impl->op->workspace();
 
     // Manage/allocate auxiliary tensors
-    if(_impl->is_prepared)
+    if (_impl->is_prepared)
     {
         _impl->run_pack.add_const_tensor(ACL_SRC_0, a);
         _impl->run_pack.add_tensor(ACL_DST, output);
     }
     else
     {
-        _impl->run_pack  = { { ACL_SRC_0, a }, { ACL_SRC_2, c }, { ACL_DST, output } };
-        _impl->prep_pack = { { ACL_SRC_1, _impl->b } };
+        _impl->run_pack  = {{ACL_SRC_0, a}, {ACL_SRC_2, c}, {ACL_DST, output}};
+        _impl->prep_pack = {{ACL_SRC_1, _impl->b}};
 
-        _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+        _impl->workspace_tensors =
+            manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
     }
 }
 
-Status CLGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status CLGEMM::validate(const ITensorInfo *a,
+                        const ITensorInfo *b,
+                        const ITensorInfo *c,
+                        const ITensorInfo *output,
+                        float              alpha,
+                        float              beta,
+                        const GEMMInfo    &gemm_info)
 {
     return OperatorType::validate(a, b, c, output, alpha, beta, gemm_info);
 }
@@ -107,15 +129,15 @@
 
 void CLGEMM::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->prep_pack);
 
-        auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
-                                        _impl->aux_mem_req.end(),
-                                        [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+        auto has_reshape =
+            std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+                         [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
 
-        if(has_reshape != std::end(_impl->aux_mem_req))
+        if (has_reshape != std::end(_impl->aux_mem_req))
         {
             _impl->b->mark_as_unused();
         }
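
Two smaller rules recur in the hunk above and throughout the patch: interior spaces are removed from braced initializer lists, and a single space is inserted between a control-flow keyword and its opening parenthesis. A hypothetical fragment showing both; Cpp11BracedListStyle: true and SpaceBeforeParens: ControlStatements are assumed option names:

    struct Impl
    {
        const void *b{nullptr};         // previously: b{ nullptr }
        bool        is_prepared{false}; // previously: is_prepared{ false }
    };

    void prepare(Impl &impl)
    {
        if (!impl.is_prepared) // previously: if(!impl.is_prepared)
        {
            impl.is_prepared = true;
        }
    }
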
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index c8c18f3..aef7cdd 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -27,10 +27,11 @@
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Size2D.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClGemmConv2d.h"
 #include "support/Cast.h"
@@ -47,18 +48,19 @@
 
 struct CLGEMMConvolutionLayer::Impl
 {
-    const ITensor                        *weights{ nullptr };
-    std::unique_ptr<opencl::ClGemmConv2d> op{ nullptr };
+    const ITensor                        *weights{nullptr};
+    std::unique_ptr<opencl::ClGemmConv2d> op{nullptr};
     ITensorPack                           run_pack{};
     ITensorPack                           prep_pack{};
     MemoryGroup                           memory_group{};
-    IWeightsManager                      *weights_manager{ nullptr };
+    IWeightsManager                      *weights_manager{nullptr};
     MemoryRequirements                    aux_mem_req{};
     WorkspaceData<CLTensor>               workspace_tensors{};
-    bool                                  is_prepared{ false };
+    bool                                  is_prepared{false};
 };
 
-CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+CLGEMMConvolutionLayer::CLGEMMConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager,
+                                               IWeightsManager                *weights_manager)
     : _impl(std::make_unique<Impl>())
 {
     _impl->memory_group    = MemoryGroup(memory_manager);
@@ -67,40 +69,60 @@
 
 CLGEMMConvolutionLayer::~CLGEMMConvolutionLayer() = default;
 
-void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                       const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+void CLGEMMConvolutionLayer::configure(const ICLTensor           *input,
+                                       const ICLTensor           *weights,
+                                       const ICLTensor           *biases,
+                                       ICLTensor                 *output,
+                                       const PadStrideInfo       &conv_info,
+                                       const WeightsInfo         &weights_info,
+                                       const Size2D              &dilation,
+                                       const ActivationLayerInfo &act_info,
+                                       unsigned int               num_groups)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, weights_info,
+              dilation, act_info, num_groups);
 }
 
-void CLGEMMConvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                       const PadStrideInfo &conv_info,
-                                       const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+void CLGEMMConvolutionLayer::configure(const CLCompileContext    &compile_context,
+                                       const ICLTensor           *input,
+                                       const ICLTensor           *weights,
+                                       const ICLTensor           *biases,
+                                       ICLTensor                 *output,
+                                       const PadStrideInfo       &conv_info,
+                                       const WeightsInfo         &weights_info,
+                                       const Size2D              &dilation,
+                                       const ActivationLayerInfo &act_info,
+                                       unsigned int               num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     _impl->weights               = weights;
     _impl->op                    = std::make_unique<opencl::ClGemmConv2d>();
     const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
-    _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
+    _impl->op->configure(compile_context, input->info(), weights->info(),
+                         (biases != nullptr ? biases->info() : nullptr), output->info(), conv2d_info, weights_info);
 
-    _impl->run_pack =
-    {
-        { TensorType::ACL_SRC_0, input },
-        { TensorType::ACL_SRC_1, weights },
-        { TensorType::ACL_SRC_2, biases },
-        { TensorType::ACL_DST, output }
+    _impl->run_pack  = {{TensorType::ACL_SRC_0, input},
+                        {TensorType::ACL_SRC_1, weights},
+                        {TensorType::ACL_SRC_2, biases},
+                        {TensorType::ACL_DST, output}};
+    _impl->prep_pack = {
+        {TensorType::ACL_SRC_1, weights},
+        {TensorType::ACL_SRC_2, biases},
     };
-    _impl->prep_pack =
-    {
-        { TensorType::ACL_SRC_1, weights },
-        { TensorType::ACL_SRC_2, biases },
-    };
-    _impl->aux_mem_req       = _impl->op->workspace();
-    _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+    _impl->aux_mem_req = _impl->op->workspace();
+    _impl->workspace_tensors =
+        manage_workspace<CLTensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
 }
 
-Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+Status CLGEMMConvolutionLayer::validate(const ITensorInfo         *input,
+                                        const ITensorInfo         *weights,
+                                        const ITensorInfo         *biases,
+                                        const ITensorInfo         *output,
+                                        const PadStrideInfo       &conv_info,
+                                        const WeightsInfo         &weights_info,
+                                        const Size2D              &dilation,
+                                        const ActivationLayerInfo &act_info,
+                                        unsigned int               num_groups)
 {
     const Conv2dInfo conv2d_info = Conv2dInfo(conv_info, dilation, act_info, false, num_groups);
     return opencl::ClGemmConv2d::validate(input, weights, biases, output, conv2d_info, weights_info);
@@ -115,14 +137,14 @@
 
 void CLGEMMConvolutionLayer::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->prep_pack);
-        auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
-                                        _impl->aux_mem_req.end(),
-                                        [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+        auto has_reshape =
+            std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+                         [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
 
-        if(has_reshape != std::end(_impl->aux_mem_req))
+        if (has_reshape != std::end(_impl->aux_mem_req))
         {
             _impl->weights->mark_as_unused();
         }
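
The statement rewraps here and in the deconvolution hunks that follow share one pattern: an overflowing line breaks after the '=' with the continuation indented four extra columns, binary operators stay at the end of the broken line, and an over-long member-call chain breaks before each call, aligned under the first. A compilable sketch with hypothetical types; BreakBeforeBinaryOperators: None and ContinuationIndentWidth: 4 are assumptions:

    struct TensorInfo
    {
        TensorInfo &set_is_resizable(bool) { return *this; }
        TensorInfo &reset_padding() { return *this; }
    };

    bool is_padded(int pad_top, int pad_bottom, int pad_left, int pad_right)
    {
        // Operators remain at the end of each wrapped line.
        return pad_bottom > 0 || pad_left > 0 || pad_right > 0 ||
               pad_top > 0;
    }

    TensorInfo shape(TensorInfo &info)
    {
        // The chain breaks before each member call, aligned under the first.
        TensorInfo out = info.set_is_resizable(true)
                             .reset_padding();
        return out;
    }
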
diff --git a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
index 9fc81c1..7d40cf1 100644
--- a/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMDeconvolutionLayer.cpp
@@ -24,15 +24,15 @@
 #include "arm_compute/runtime/CL/functions/CLGEMMDeconvolutionLayer.h"
 
 #include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/function_info/ActivationLayerInfo.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLDeconvolutionReshapeOutputKernel.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 #include <tuple>
 
@@ -40,12 +40,13 @@
 {
 namespace
 {
-std::pair<Coordinates, Coordinates> compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
+std::pair<Coordinates, Coordinates>
+compute_start_end_slice_coordinates(const ITensorInfo &output_info, const PadStrideInfo &deconv_info, bool is_nchw)
 {
     Coordinates start;
     Coordinates end;
 
-    if(is_nchw)
+    if (is_nchw)
     {
         start.set(0, deconv_info.pad_left());
         start.set(1, deconv_info.pad_top());
@@ -63,13 +64,16 @@
         end.set(2, output_info.dimension(2) - deconv_info.pad_bottom());
     }
 
-    return { start, end };
+    return {start, end};
 }
-Status construct_gemmlowp_output_stage(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, GEMMLowpOutputStageInfo &output_stage_info)
+Status construct_gemmlowp_output_stage(const ITensorInfo       *input,
+                                       const ITensorInfo       *weights,
+                                       const ITensorInfo       *output,
+                                       GEMMLowpOutputStageInfo &output_stage_info)
 {
     const auto data_type = input->data_type();
 
-    if(is_data_type_quantized_asymmetric(data_type))
+    if (is_data_type_quantized_asymmetric(data_type))
     {
         const UniformQuantizationInfo iq_info = input->quantization_info().uniform();
         const UniformQuantizationInfo wq_info = weights->quantization_info().uniform();
@@ -78,7 +82,8 @@
         float multiplier = iq_info.scale * wq_info.scale / oq_info.scale;
         int   output_multiplier(0);
         int   output_shift(0);
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
 
         output_stage_info.type                = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
         output_stage_info.gemmlowp_multiplier = output_multiplier;
@@ -122,15 +127,21 @@
 
 CLGEMMDeconvolutionLayer::~CLGEMMDeconvolutionLayer() = default;
 
-Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &deconv_info)
+Status CLGEMMDeconvolutionLayer::validate(const ITensorInfo   *input,
+                                          const ITensorInfo   *weights,
+                                          const ITensorInfo   *bias,
+                                          const ITensorInfo   *output,
+                                          const PadStrideInfo &deconv_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
 
     DataLayout data_layout  = input->data_layout();
-    const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
+    const bool padded_input = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 ||
+                              deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
     const bool is_nchw      = input->data_layout() == DataLayout::NCHW;
     const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
 
@@ -144,21 +155,31 @@
     TensorShape nhwc_weights_shape = weights->tensor_shape();
     TensorShape nhwc_input_shape   = input->tensor_shape();
 
-    if(is_nchw)
+    if (is_nchw)
     {
         permute(nhwc_weights_shape, PermutationVector(2, 0, 1));
         permute(nhwc_input_shape, PermutationVector(2, 0, 1));
 
-        TensorInfo nhwc_input_info = input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_input_shape).set_data_layout(DataLayout::NCHW);
+        TensorInfo nhwc_input_info = input->clone()
+                                         ->set_is_resizable(true)
+                                         .reset_padding()
+                                         .set_tensor_shape(nhwc_input_shape)
+                                         .set_data_layout(DataLayout::NCHW);
 
-        TensorInfo nhwc_weights_info = weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(nhwc_weights_shape).set_data_layout(DataLayout::NCHW);
+        TensorInfo nhwc_weights_info = weights->clone()
+                                           ->set_is_resizable(true)
+                                           .reset_padding()
+                                           .set_tensor_shape(nhwc_weights_shape)
+                                           .set_data_layout(DataLayout::NCHW);
 
         CLPermute::validate(weights, &nhwc_weights_info, PermutationVector(2, 0, 1));
         CLPermute::validate(input, &nhwc_input_info, PermutationVector(2, 0, 1));
     }
 
-    const TensorShape reshaped_shape = TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
-    const TensorInfo  reshaped_info  = weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
+    const TensorShape reshaped_shape =
+        TensorShape(nhwc_weights_shape[0], nhwc_weights_shape[1] * nhwc_weights_shape[2] * nhwc_weights_shape[3]);
+    const TensorInfo reshaped_info =
+        weights->clone()->set_tensor_shape(reshaped_shape).set_data_layout(DataLayout::NCHW).set_is_resizable(true);
     ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(weights, &reshaped_info));
 
     TensorShape      transposed_shape(reshaped_shape[1], reshaped_shape[0]);
@@ -166,77 +187,95 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&reshaped_info, &reshaped_t_info));
 
     TensorShape gemm_output_shape(weights->dimension(idx_w) * weights->dimension(idx_h) * weights->dimension(idx_b),
-                                  input->dimension(idx_w),
-                                  input->dimension(idx_h),
-                                  input->dimension(idx_b));
+                                  input->dimension(idx_w), input->dimension(idx_h), input->dimension(idx_b));
 
     TensorInfo gemm_output_info = reshaped_t_info.clone()->set_tensor_shape(gemm_output_shape).set_is_resizable(true);
     GEMMInfo   gemm_info(false, false, true, input->dimension(idx_h), true);
 
     GEMMLowpOutputStageInfo output_stage_info;
 
-    if(is_quantized)
+    if (is_quantized)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr, &gemm_output_info.set_data_type(DataType::S32),
-                                                                           gemm_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(
+            &input->clone()->set_tensor_shape(nhwc_input_shape), &reshaped_t_info, nullptr,
+            &gemm_output_info.set_data_type(DataType::S32), gemm_info));
         ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(input, weights, output, output_stage_info));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true), &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLGEMM::validate(&input->clone()->set_tensor_shape(nhwc_input_shape).set_is_resizable(true),
+                             &reshaped_t_info, nullptr, &gemm_output_info, 1.0f, 0.0f, gemm_info));
     }
 
     const PadStrideInfo stride_info(deconv_info.stride().first, deconv_info.stride().second);
-    auto                out_dims           = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h), weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
-    const TensorShape   deconv_shape       = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
-    TensorInfo          col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
+    auto                out_dims = deconvolution_output_dimensions(input->dimension(idx_w), input->dimension(idx_h),
+                                                                   weights->dimension(idx_w), weights->dimension(idx_h), stride_info);
+    const TensorShape   deconv_shape =
+        misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights);
+    TensorInfo col2im_output_info = gemm_output_info.clone()->set_tensor_shape(deconv_shape).set_is_resizable(true);
 
-    if(padded_input && is_quantized)
+    if (padded_input && is_quantized)
     {
         const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output, start_end.first, start_end.second));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+            &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(
+            &col2im_output_info, nullptr,
+            &col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()), output_stage_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&col2im_output_info.clone()->set_is_resizable(true).set_data_type(input->data_type()),
+                              output, start_end.first, start_end.second));
     }
-    else if(padded_input)
+    else if (padded_input)
     {
         const auto start_end = compute_start_end_slice_coordinates(col2im_output_info, deconv_info, is_nchw);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+            &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
         ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&col2im_output_info, output, start_end.first, start_end.second));
     }
-    else if(is_quantized)
+    else if (is_quantized)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(
+            &gemm_output_info, bias, &col2im_output_info, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLGEMMLowpOutputStage::validate(&col2im_output_info, nullptr, output, output_stage_info));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLDeconvolutionReshapeOutputKernel::validate(&gemm_output_info, bias, output, input, weights, deconv_info));
     }
 
     return Status{};
 }
 
-void CLGEMMDeconvolutionLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output, const PadStrideInfo &deconv_info)
+void CLGEMMDeconvolutionLayer::configure(const ICLTensor     *input,
+                                         const ICLTensor     *weights,
+                                         const ICLTensor     *bias,
+                                         ICLTensor           *output,
+                                         const PadStrideInfo &deconv_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, bias, output, deconv_info);
 }
 
-void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *bias, ICLTensor *output,
-                                         const PadStrideInfo &deconv_info)
+void CLGEMMDeconvolutionLayer::configure(const CLCompileContext &compile_context,
+                                         const ICLTensor        *input,
+                                         const ICLTensor        *weights,
+                                         const ICLTensor        *bias,
+                                         ICLTensor              *output,
+                                         const PadStrideInfo    &deconv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(input->info(),
-                                                                  weights->info(),
-                                                                  bias != nullptr ? bias->info() : nullptr,
-                                                                  output->info(),
-                                                                  deconv_info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLGEMMDeconvolutionLayer::validate(
+        input->info(), weights->info(), bias != nullptr ? bias->info() : nullptr, output->info(), deconv_info));
     ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, deconv_info);
 
     _original_weights = weights;
-    _padded_input     = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 || deconv_info.pad_top() > 0;
-    _is_nchw          = input->info()->data_layout() == DataLayout::NCHW;
-    _is_quantized     = is_data_type_quantized_asymmetric(input->info()->data_type());
+    _padded_input     = deconv_info.pad_bottom() > 0 || deconv_info.pad_left() > 0 || deconv_info.pad_right() > 0 ||
+                    deconv_info.pad_top() > 0;
+    _is_nchw      = input->info()->data_layout() == DataLayout::NCHW;
+    _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
 
     const ICLTensor *input_to_use   = input;
     const ICLTensor *weights_to_use = weights;
@@ -245,7 +284,7 @@
     // do an outer product in NCHW and then an accumulation through a reduction. This would have two
     // drawbacks: first, the outer product is less efficient than a full GEMM. Second, the reduction
     // might be slower than GEMM.
-    if(_is_nchw)
+    if (_is_nchw)
     {
         _memory_group.manage(&_permuted_input);
         _permute_input_to_nhwc.configure(compile_context, input, &_permuted_input, PermutationVector(2U, 0U, 1U));
@@ -257,10 +296,11 @@
     }
 
     // Reshape the input weights. The weights will be reshaped only once during the call to prepare()
-    _reshaped_weights.allocator()->init(TensorInfo(TensorShape(weights_to_use->info()->dimension(0),
-                                                               weights_to_use->info()->dimension(1) * weights_to_use->info()->dimension(2) * weights_to_use->info()->dimension(3)),
-                                                   1,
-                                                   input->info()->data_type(), weights->info()->quantization_info()));
+    _reshaped_weights.allocator()->init(
+        TensorInfo(TensorShape(weights_to_use->info()->dimension(0), weights_to_use->info()->dimension(1) *
+                                                                         weights_to_use->info()->dimension(2) *
+                                                                         weights_to_use->info()->dimension(3)),
+                   1, input->info()->data_type(), weights->info()->quantization_info()));
 
     _reshape_weights.configure(compile_context, weights_to_use, &_reshaped_weights);
     _transpose_weights.configure(compile_context, &_reshaped_weights, &_reshaped_weights_t);
@@ -269,15 +309,17 @@
     GEMMInfo     gemm_info(false, false, true, input->info()->dimension(idx_h), true);
 
     // Configure output stage for asymmetric quantized types
-    if(_is_quantized)
+    if (_is_quantized)
     {
         // gemmlowp adds the offsets (instead of subtracting them). Thus, we need to negate the original
         // and restore them back to make it work properly.
         QuantizationInfo iq_info = input->info()->quantization_info();
         QuantizationInfo wq_info = weights->info()->quantization_info();
 
-        input_to_use->info()->set_quantization_info(QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
-        _reshaped_weights_t.info()->set_quantization_info(QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
+        input_to_use->info()->set_quantization_info(
+            QuantizationInfo(iq_info.uniform().scale, -iq_info.uniform().offset));
+        _reshaped_weights_t.info()->set_quantization_info(
+            QuantizationInfo(wq_info.uniform().scale, -wq_info.uniform().offset));
 
         _mm_gemmlowp.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, gemm_info);
 
@@ -286,10 +328,11 @@
     }
     else
     {
-        _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f, gemm_info);
+        _mm_gemm.configure(compile_context, input_to_use, &_reshaped_weights_t, nullptr, &_gemm_output, 1.f, 0.0f,
+                           gemm_info);
     }
 
-    if(_is_nchw)
+    if (_is_nchw)
     {
         _permuted_input.allocator()->allocate();
     }
@@ -298,7 +341,7 @@
     ICLTensor *slice_output          = nullptr;
     ICLTensor *output_stage_output   = nullptr;
 
-    if(_padded_input && _is_quantized)
+    if (_padded_input && _is_quantized)
     {
         _memory_group.manage(&_slice_gemm_input);
         _memory_group.manage(&_gemmlowp_final);
@@ -306,13 +349,13 @@
         output_stage_output   = &_slice_gemm_input;
         slice_output          = output;
     }
-    else if(_padded_input)
+    else if (_padded_input)
     {
         _memory_group.manage(&_slice_gemm_input);
         deconv_reshape_output = &_slice_gemm_input;
         slice_output          = output;
     }
-    else if(_is_quantized)
+    else if (_is_quantized)
     {
         _memory_group.manage(&_gemmlowp_final);
         deconv_reshape_output = &_gemmlowp_final;
@@ -324,21 +367,24 @@
     }
 
     // Configure a Col2Im call to reshape the output of GEMM
-    _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(), weights->info(), deconv_info);
+    _deconv_reshape->configure(compile_context, &_gemm_output, bias, deconv_reshape_output, input->info(),
+                               weights->info(), deconv_info);
     _gemm_output.allocator()->allocate();
 
-    if(_is_quantized)
+    if (_is_quantized)
     {
         GEMMLowpOutputStageInfo output_stage_info;
         construct_gemmlowp_output_stage(input->info(), weights->info(), output->info(), output_stage_info);
-        _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output, output_stage_info);
+        _gemmlowp_output_stage.configure(compile_context, &_gemmlowp_final, nullptr, output_stage_output,
+                                         output_stage_info);
         _gemmlowp_final.allocator()->allocate();
     }
 
     // If the input was padded, the output needs to be sliced.
-    if(_padded_input)
+    if (_padded_input)
     {
-        const auto start_end = compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
+        const auto start_end =
+            compute_start_end_slice_coordinates(*deconv_reshape_output->info(), deconv_info, _is_nchw);
         _slice_gemm.configure(compile_context, &_slice_gemm_input, slice_output, start_end.first, start_end.second);
         _slice_gemm_input.allocator()->allocate();
     }
@@ -350,12 +396,12 @@
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    if(_is_nchw)
+    if (_is_nchw)
     {
         _permute_input_to_nhwc.run();
     }
 
-    if(_is_quantized)
+    if (_is_quantized)
     {
         _mm_gemmlowp.run();
     }
@@ -366,12 +412,12 @@
 
     CLScheduler::get().enqueue(*_deconv_reshape, false);
 
-    if(_is_quantized)
+    if (_is_quantized)
     {
         _gemmlowp_output_stage.run();
     }
 
-    if(_padded_input)
+    if (_padded_input)
     {
         _slice_gemm.run();
     }
@@ -379,11 +425,11 @@
 
 void CLGEMMDeconvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
-        if(_is_nchw)
+        if (_is_nchw)
         {
             _permuted_weights.allocator()->allocate();
             _permute_weights_to_nhwc.run();
@@ -392,7 +438,7 @@
         _reshaped_weights.allocator()->allocate();
         _reshape_weights.run();
 
-        if(_is_nchw)
+        if (_is_nchw)
         {
             _permuted_weights.allocator()->free();
         }
@@ -401,7 +447,7 @@
         _transpose_weights.run();
 
         // Prepare gemm
-        if(!_is_quantized)
+        if (!_is_quantized)
         {
             _mm_gemm.prepare();
         }
@@ -411,7 +457,7 @@
         }
 
         // Free resources
-        if(!_reshaped_weights_t.is_used())
+        if (!_reshaped_weights_t.is_used())
         {
             _reshaped_weights_t.allocator()->free();
         }
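
The include churn at the top of each file comes from one rule change: headers now sort case-insensitively within a group (so utils/ sorts before Validate.h, where plain ASCII order would put 'V' first), and a blank line separates the public arm_compute/ headers from the internal src/ ones. Before/after for header names taken from the hunks above; SortIncludes: CaseInsensitive and IncludeBlocks: Regroup are assumed option names:

    // Before: case-sensitive ASCII order, no group separation.
    #include "arm_compute/core/Validate.h"
    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
    #include "src/core/helpers/MemoryHelpers.h"

    // After: case-insensitive order, internal headers in their own block.
    #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
    #include "arm_compute/core/Validate.h"

    #include "src/core/helpers/MemoryHelpers.h"
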
diff --git a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
index d902947..8bad198 100644
--- a/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpMatrixMultiplyCore.cpp
@@ -31,12 +31,12 @@
 #include "arm_compute/core/Log.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/IMemoryManager.h"
-#include "src/core/helpers/MemoryHelpers.h"
 
+#include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
 
 namespace arm_compute
@@ -46,13 +46,13 @@
 
 struct CLGEMMLowpMatrixMultiplyCore::Impl
 {
-    const ICLTensor              *b{ nullptr };
-    std::unique_ptr<OperatorType> op{ nullptr };
+    const ICLTensor              *b{nullptr};
+    std::unique_ptr<OperatorType> op{nullptr};
     MemoryGroup                   memory_group{};
     ITensorPack                   run_pack{};
     MemoryRequirements            aux_mem_req{};
     WorkspaceData<CLTensor>       workspace_tensors{};
-    bool                          is_prepared{ false };
+    bool                          is_prepared{false};
 };
 
 CLGEMMLowpMatrixMultiplyCore::CLGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager)
@@ -63,12 +63,18 @@
 
 CLGEMMLowpMatrixMultiplyCore::~CLGEMMLowpMatrixMultiplyCore() = default;
 
-void CLGEMMLowpMatrixMultiplyCore::configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
+void CLGEMMLowpMatrixMultiplyCore::configure(
+    const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), a, b, c, output, gemm_info);
 }
 
-void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, const GEMMInfo &gemm_info)
+void CLGEMMLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context,
+                                             const ICLTensor        *a,
+                                             const ICLTensor        *b,
+                                             const ICLTensor        *c,
+                                             ICLTensor              *output,
+                                             const GEMMInfo         &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
 
@@ -76,23 +82,29 @@
     _impl->op          = std::make_unique<OperatorType>();
     _impl->is_prepared = gemm_info.retain_internal_weights();
 
-    _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(), gemm_info);
+    _impl->op->configure(compile_context, a->info(), b->info(), c != nullptr ? c->info() : nullptr, output->info(),
+                         gemm_info);
     _impl->aux_mem_req = _impl->op->workspace();
 
     // Manage/allocate auxiliary tensors
-    if(_impl->is_prepared)
+    if (_impl->is_prepared)
     {
         _impl->run_pack.add_const_tensor(ACL_SRC_0, a);
         _impl->run_pack.add_tensor(ACL_DST, output);
     }
     else
     {
-        _impl->run_pack          = { { ACL_SRC_0, a }, { ACL_SRC_1, _impl->b }, { ACL_SRC_2, c }, { ACL_DST, output } };
-        _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
+        _impl->run_pack = {{ACL_SRC_0, a}, {ACL_SRC_1, _impl->b}, {ACL_SRC_2, c}, {ACL_DST, output}};
+        _impl->workspace_tensors =
+            manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
     }
 }
 
-Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status CLGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+                                              const ITensorInfo *b,
+                                              const ITensorInfo *c,
+                                              const ITensorInfo *output,
+                                              const GEMMInfo    &gemm_info)
 {
     return OperatorType::validate(a, b, c, output, gemm_info);
 }
@@ -108,7 +120,7 @@
 
 void CLGEMMLowpMatrixMultiplyCore::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->run_pack);
 
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index 6feed0d..3dd8c5f 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -40,27 +40,33 @@
 {
 struct CLGEMMLowpOutputStage::Impl
 {
-    const ICLTensor                               *src{ nullptr };
-    const ICLTensor                               *bias{ nullptr };
-    ICLTensor                                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClGemmLowpOutputStage> op{ nullptr };
+    const ICLTensor                               *src{nullptr};
+    const ICLTensor                               *bias{nullptr};
+    ICLTensor                                     *dst{nullptr};
+    std::unique_ptr<opencl::ClGemmLowpOutputStage> op{nullptr};
     ITensorPack                                    run_pack{};
 };
 
-CLGEMMLowpOutputStage::CLGEMMLowpOutputStage()
-    : _impl(std::make_unique<Impl>())
+CLGEMMLowpOutputStage::CLGEMMLowpOutputStage() : _impl(std::make_unique<Impl>())
 {
 }
-CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&) = default;
+CLGEMMLowpOutputStage::CLGEMMLowpOutputStage(CLGEMMLowpOutputStage &&)            = default;
 CLGEMMLowpOutputStage &CLGEMMLowpOutputStage::operator=(CLGEMMLowpOutputStage &&) = default;
 CLGEMMLowpOutputStage::~CLGEMMLowpOutputStage()                                   = default;
 
-void CLGEMMLowpOutputStage::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
+void CLGEMMLowpOutputStage::configure(const ICLTensor               *input,
+                                      const ICLTensor               *bias,
+                                      ICLTensor                     *output,
+                                      const GEMMLowpOutputStageInfo &info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, bias, output, info);
 }
 
-void CLGEMMLowpOutputStage::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, const GEMMLowpOutputStageInfo &info)
+void CLGEMMLowpOutputStage::configure(const CLCompileContext        &compile_context,
+                                      const ICLTensor               *input,
+                                      const ICLTensor               *bias,
+                                      ICLTensor                     *output,
+                                      const GEMMLowpOutputStageInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -69,11 +75,15 @@
     _impl->dst  = output;
 
     _impl->op = std::make_unique<opencl::ClGemmLowpOutputStage>();
-    _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info);
-    _impl->run_pack = { { ACL_SRC, _impl->src }, { ACL_BIAS, _impl->bias }, { ACL_DST, _impl->dst } };
+    _impl->op->configure(compile_context, input->info(), bias != nullptr ? bias->info() : nullptr, output->info(),
+                         info);
+    _impl->run_pack = {{ACL_SRC, _impl->src}, {ACL_BIAS, _impl->bias}, {ACL_DST, _impl->dst}};
 }
 
-Status CLGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info)
+Status CLGEMMLowpOutputStage::validate(const ITensorInfo             *input,
+                                       const ITensorInfo             *bias,
+                                       const ITensorInfo             *output,
+                                       const GEMMLowpOutputStageInfo &info)
 {
     return opencl::ClGemmLowpOutputStage::validate(input, bias, output, info);
 }
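
The defaulted special members above also pick up trailing alignment: in a run of consecutive short definitions, the '=' signs are padded into a single column. A sketch with a hypothetical class; attributing this to AlignConsecutiveAssignments is an assumption:

    class Stage
    {
    public:
        // The '=' of each defaulted member lines up with its neighbours.
        Stage(Stage &&)            = default;
        Stage &operator=(Stage &&) = default;
        ~Stage()                   = default;
    };
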
diff --git a/src/runtime/CL/functions/CLGather.cpp b/src/runtime/CL/functions/CLGather.cpp
index 033c117..2610cb1 100644
--- a/src/runtime/CL/functions/CLGather.cpp
+++ b/src/runtime/CL/functions/CLGather.cpp
@@ -24,9 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLGather.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "src/core/CL/kernels/CLGatherKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLGatherKernel.h"
 
 namespace arm_compute
 {
@@ -35,7 +35,11 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, axis);
 }
 
-void CLGather::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *indices, ICLTensor *output, int axis)
+void CLGather::configure(const CLCompileContext &compile_context,
+                         const ICLTensor        *input,
+                         const ICLTensor        *indices,
+                         ICLTensor              *output,
+                         int                     axis)
 {
     ARM_COMPUTE_LOG_PARAMS(input, indices, output, axis);
     auto k = std::make_unique<CLGatherKernel>();
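
Call sites across these hunks wrap the same way: overflowing arguments continue on the next line aligned under the first argument, and when nothing useful fits after the callee, the line breaks straight after the opening parenthesis with a four-column indent. A sketch with hypothetical functions; AlignAfterOpenBracket: Align and ContinuationIndentWidth: 4 are assumptions:

    void log_params(long, long, long, long) {}
    long validate_with_a_deliberately_long_name(long a, long b, long c, long d)
    {
        return a + b + c + d;
    }

    void example()
    {
        // Overflow: remaining arguments align under the first one.
        log_params(100000000L, 200000000L, 300000000L,
                   400000000L);

        // Nothing fits after the callee: break after '(' and indent by four.
        long status = validate_with_a_deliberately_long_name(
            100000000L, 200000000L, 300000000L, 400000000L);
        (void)status;
    }
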
diff --git a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
index 9cb7d61..b2c1d26 100644
--- a/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
+++ b/src/runtime/CL/functions/CLGenerateProposalsLayer.cpp
@@ -27,13 +27,13 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/functions/CLDequantizationLayer.h"
 #include "arm_compute/runtime/CL/functions/CLQuantizationLayer.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLBoundingBoxTransformKernel.h"
 #include "src/core/CL/kernels/CLGenerateProposalsLayerKernel.h"
 #include "src/core/CL/kernels/CLPadLayerKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 CLGenerateProposalsLayer::CLGenerateProposalsLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -71,48 +71,67 @@
 
 CLGenerateProposalsLayer::~CLGenerateProposalsLayer() = default;
 
-void CLGenerateProposalsLayer::configure(const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals, ICLTensor *scores_out, ICLTensor *num_valid_proposals,
+void CLGenerateProposalsLayer::configure(const ICLTensor             *scores,
+                                         const ICLTensor             *deltas,
+                                         const ICLTensor             *anchors,
+                                         ICLTensor                   *proposals,
+                                         ICLTensor                   *scores_out,
+                                         ICLTensor                   *num_valid_proposals,
                                          const GenerateProposalsInfo &info)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
+    configure(CLKernelLibrary::get().get_compile_context(), scores, deltas, anchors, proposals, scores_out,
+              num_valid_proposals, info);
 }
 
-void CLGenerateProposalsLayer::configure(const CLCompileContext &compile_context, const ICLTensor *scores, const ICLTensor *deltas, const ICLTensor *anchors, ICLTensor *proposals,
-                                         ICLTensor *scores_out,
-                                         ICLTensor *num_valid_proposals, const GenerateProposalsInfo &info)
+void CLGenerateProposalsLayer::configure(const CLCompileContext      &compile_context,
+                                         const ICLTensor             *scores,
+                                         const ICLTensor             *deltas,
+                                         const ICLTensor             *anchors,
+                                         ICLTensor                   *proposals,
+                                         ICLTensor                   *scores_out,
+                                         ICLTensor                   *num_valid_proposals,
+                                         const GenerateProposalsInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
-    ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(),
+                                                                  proposals->info(), scores_out->info(),
+                                                                  num_valid_proposals->info(), info));
     ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
 
     _is_nhwc                        = scores->info()->data_layout() == DataLayout::NHWC;
     const DataType scores_data_type = scores->info()->data_type();
     _is_qasymm8                     = scores_data_type == DataType::QASYMM8;
-    const int    num_anchors        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
-    const int    feat_width         = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
-    const int    feat_height        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
-    const int    total_num_anchors  = num_anchors * feat_width * feat_height;
-    const int    pre_nms_topN       = info.pre_nms_topN();
-    const int    post_nms_topN      = info.post_nms_topN();
-    const size_t values_per_roi     = info.values_per_roi();
+    const int num_anchors           = scores->info()->dimension(
+                  get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+    const int feat_width = scores->info()->dimension(
+        get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+    const int feat_height = scores->info()->dimension(
+        get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
+    const int    total_num_anchors = num_anchors * feat_width * feat_height;
+    const int    pre_nms_topN      = info.pre_nms_topN();
+    const int    post_nms_topN     = info.post_nms_topN();
+    const size_t values_per_roi    = info.values_per_roi();
 
     const QuantizationInfo scores_qinfo   = scores->info()->quantization_info();
     const DataType         rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type;
-    const QuantizationInfo rois_qinfo     = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
+    const QuantizationInfo rois_qinfo =
+        (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
 
     // Compute all the anchors
     _memory_group.manage(&_all_anchors);
-    _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+    _compute_anchors_kernel->configure(compile_context, anchors, &_all_anchors,
+                                       ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
 
     const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
-    _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
+    _deltas_flattened.allocator()->init(
+        TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
 
     // Permute and reshape deltas
     _memory_group.manage(&_deltas_flattened);
-    if(!_is_nhwc)
+    if (!_is_nhwc)
     {
         _memory_group.manage(&_deltas_permuted);
-        _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+        _permute_deltas.configure(compile_context, deltas, &_deltas_permuted, PermutationVector{2, 0, 1});
         _flatten_deltas.configure(compile_context, &_deltas_permuted, &_deltas_flattened);
         _deltas_permuted.allocator()->allocate();
     }
@@ -126,10 +145,10 @@
 
     // Permute and reshape scores
     _memory_group.manage(&_scores_flattened);
-    if(!_is_nhwc)
+    if (!_is_nhwc)
     {
         _memory_group.manage(&_scores_permuted);
-        _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+        _permute_scores.configure(compile_context, scores, &_scores_permuted, PermutationVector{2, 0, 1});
         _flatten_scores.configure(compile_context, &_scores_permuted, &_scores_flattened);
         _scores_permuted.allocator()->allocate();
     }
@@ -140,7 +159,7 @@
 
     CLTensor *anchors_to_use = &_all_anchors;
     CLTensor *deltas_to_use  = &_deltas_flattened;
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32));
         _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32));
@@ -163,11 +182,12 @@
     anchors_to_use->allocator()->allocate();
 
     _all_proposals_to_use = &_all_proposals;
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _memory_group.manage(&_all_proposals_quantized);
         // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
-        _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
+        _all_proposals_quantized.allocator()->init(
+            TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
         _quantize_all_proposals->configure(compile_context, &_all_proposals, &_all_proposals_quantized);
         _all_proposals.allocator()->allocate();
         _all_proposals_to_use = &_all_proposals_quantized;
@@ -183,7 +203,8 @@
 
     // Note that NMS needs outputs preinitialized.
     auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo);
-    auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo);
+    auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type,
+                       rois_qinfo);
     auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
 
     // Initialize temporaries (unused) outputs
@@ -195,20 +216,27 @@
     _num_valid_proposals = num_valid_proposals;
 
     _memory_group.manage(&_proposals_4_roi_values);
-    _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values, &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
-                       BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height()));
+    _cpp_nms.configure(&_scores_flattened, _all_proposals_to_use, nullptr, scores_out, &_proposals_4_roi_values,
+                       &_classes_nms_unused, nullptr, &_keeps_nms_unused, num_valid_proposals,
+                       BoxNMSLimitInfo(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f,
+                                       true, min_size_scaled, info.im_width(), info.im_height()));
     _keeps_nms_unused.allocator()->allocate();
     _classes_nms_unused.allocator()->allocate();
     _all_proposals_to_use->allocator()->allocate();
     _scores_flattened.allocator()->allocate();
 
     // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
-    _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+    _pad_kernel->configure(compile_context, &_proposals_4_roi_values, proposals, PaddingList{{1, 0}});
     _proposals_4_roi_values.allocator()->allocate();
 }
 
-Status CLGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
-                                          const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
+Status CLGenerateProposalsLayer::validate(const ITensorInfo           *scores,
+                                          const ITensorInfo           *deltas,
+                                          const ITensorInfo           *anchors,
+                                          const ITensorInfo           *proposals,
+                                          const ITensorInfo           *scores_out,
+                                          const ITensorInfo           *num_valid_proposals,
+                                          const GenerateProposalsInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -216,9 +244,12 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas);
 
-    const int num_anchors       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
-    const int feat_width        = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
-    const int feat_height       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
+    const int num_anchors =
+        scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+    const int feat_width =
+        scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+    const int feat_height =
+        scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
     const int num_images        = scores->dimension(3);
     const int total_num_anchors = num_anchors * feat_width * feat_height;
     const int values_per_roi    = info.values_per_roi();
@@ -227,76 +258,101 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
 
-    if(is_qasymm8)
+    if (is_qasymm8)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16);
         const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform();
         ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f);
     }
 
-    TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+    TensorInfo all_anchors_info(
+        anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLComputeAllAnchorsKernel::validate(
+        anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
 
-    TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
-    TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
-    if(scores->data_layout() == DataLayout::NHWC)
+    TensorInfo deltas_permuted_info =
+        deltas->clone()
+            ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height))
+            .set_is_resizable(true);
+    TensorInfo scores_permuted_info =
+        scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+    if (scores->data_layout() == DataLayout::NHWC)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1}));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1}));
     }
 
-    TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    TensorInfo deltas_flattened_info(
+        deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
     ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info));
 
-    TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
-    TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    TensorInfo scores_flattened_info(
+        scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+    TensorInfo proposals_4_roi_values(
+        deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info));
 
     TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
-    TensorInfo  proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
-    proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0));
-    if(is_qasymm8)
+    TensorInfo  proposals_4_roi_values_quantized(
+         deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16)
+        .set_quantization_info(QuantizationInfo(0.125f, 0));
+    if (is_qasymm8)
     {
-        TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+        TensorInfo all_anchors_f32_info(anchors->clone()
+                                            ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+                                            .set_is_resizable(true)
+                                            .set_data_type(DataType::F32));
         ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info));
 
-        TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+        TensorInfo deltas_flattened_f32_info(deltas->clone()
+                                                 ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+                                                 .set_is_resizable(true)
+                                                 .set_data_type(DataType::F32));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
 
-        TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
-                                                                           BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+        TensorInfo proposals_4_roi_values_f32(deltas->clone()
+                                                  ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+                                                  .set_is_resizable(true)
+                                                  .set_data_type(DataType::F32));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(
+            &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+            BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
         proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
-                                                                           BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLBoundingBoxTransformKernel::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+                                                   BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLPadLayerKernel::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}}));
 
-    if(num_valid_proposals->total_size() > 0)
+    if (num_valid_proposals->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
     }
 
-    if(proposals->total_size() > 0)
+    if (proposals->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
-        if(is_qasymm8)
+        if (is_qasymm8)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
             const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
@@ -309,7 +365,7 @@
         }
     }
 
-    if(scores_out->total_size() > 0)
+    if (scores_out->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
@@ -356,7 +412,7 @@
     CLScheduler::get().enqueue(*_compute_anchors_kernel, false);
 
     // Transpose and reshape the inputs
-    if(!_is_nhwc)
+    if (!_is_nhwc)
     {
         _permute_deltas.run();
         _permute_scores.run();
@@ -364,7 +420,7 @@
     _flatten_deltas.run();
     _flatten_scores.run();
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _dequantize_anchors->run();
         _dequantize_deltas->run();
@@ -373,7 +429,7 @@
     // Build the boxes
     CLScheduler::get().enqueue(*_bounding_box_kernel, false);
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _quantize_all_proposals->run();
     }
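
Note: the hunks above exercise most of the revised style rules at once, and the
revised configuration file is not part of this delivery. The sketch below is
therefore only a reconstruction inferred from the visible clang-format 14.0.6
output; the option names are real clang-format 14 options, but the values are
guesses and the repository's actual .clang-format may differ.

    # Plausible .clang-format sketch, reverse-engineered from this patch.
    Language:         Cpp
    ColumnLimit:      120                      # long expressions now wrap at 120 columns
    PointerAlignment: Right                    # "ICLTensor *input"
    SpaceBeforeParens: ControlStatements       # "if (", "switch (" instead of "if("
    Cpp11BracedListStyle: true                 # "{AclCpu, AclGpuOcl}" loses inner spaces
    AlignConsecutiveAssignments: Consecutive   # "= default;" runs stay column-aligned
    AlignConsecutiveDeclarations: Consecutive  # parameter types and names line up
    BinPackParameters: false                   # one parameter per line once they no longer fit
    AllowAllParametersOfDeclarationOnNextLine: true
    BinPackArguments: true                     # call-site arguments still fill each line
    BreakBeforeBraces: Allman
    PackConstructorInitializers: CurrentLine   # one initializer per line when the list is long
    AllowShortFunctionsOnASingleLine: None
    FixNamespaceComments: true                 # appends "} // namespace arm_compute"
    SortIncludes:     CaseInsensitive
    IncludeBlocks:    Regroup                  # "src/..." headers move into their own block
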
diff --git a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
index 90af36a..1a2369c 100644
--- a/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLIndirectConvolutionLayer.cpp
@@ -26,36 +26,45 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
-#include "src/gpu/cl/operators/ClIndirectConv2d.h"
 
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/operators/ClIndirectConv2d.h"
 
 namespace arm_compute
 {
 struct CLIndirectConvolutionLayer::Impl
 {
-    const ICLTensor                          *src{ nullptr };
-    const ICLTensor                          *weights{ nullptr };
-    const ICLTensor                          *biases{ nullptr };
-    ICLTensor                                *dst{ nullptr };
-    std::unique_ptr<opencl::ClIndirectConv2d> op{ nullptr };
+    const ICLTensor                          *src{nullptr};
+    const ICLTensor                          *weights{nullptr};
+    const ICLTensor                          *biases{nullptr};
+    ICLTensor                                *dst{nullptr};
+    std::unique_ptr<opencl::ClIndirectConv2d> op{nullptr};
 };
 
-CLIndirectConvolutionLayer::CLIndirectConvolutionLayer()
-    : _impl(std::make_unique<Impl>())
+CLIndirectConvolutionLayer::CLIndirectConvolutionLayer() : _impl(std::make_unique<Impl>())
 {
 }
-CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&) = default;
+CLIndirectConvolutionLayer::CLIndirectConvolutionLayer(CLIndirectConvolutionLayer &&)            = default;
 CLIndirectConvolutionLayer &CLIndirectConvolutionLayer::operator=(CLIndirectConvolutionLayer &&) = default;
 CLIndirectConvolutionLayer::~CLIndirectConvolutionLayer()                                        = default;
 
-void CLIndirectConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLIndirectConvolutionLayer::configure(ICLTensor                 *input,
+                                           const ICLTensor           *weights,
+                                           const ICLTensor           *biases,
+                                           ICLTensor                 *output,
+                                           const PadStrideInfo       &conv_info,
+                                           const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info);
 }
 
-void CLIndirectConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
-                                           const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void CLIndirectConvolutionLayer::configure(const CLCompileContext    &compile_context,
+                                           ICLTensor                 *input,
+                                           const ICLTensor           *weights,
+                                           const ICLTensor           *biases,
+                                           ICLTensor                 *output,
+                                           const PadStrideInfo       &conv_info,
+                                           const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info);
@@ -65,10 +74,15 @@
     _impl->biases  = biases;
     _impl->dst     = output;
     _impl->op      = std::make_unique<opencl::ClIndirectConv2d>();
-    _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
+    _impl->op->configure(compile_context, input->info(), weights->info(),
+                         (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info, act_info);
 }
 
-Status CLIndirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status CLIndirectConvolutionLayer::validate(const ITensorInfo         *input,
+                                            const ITensorInfo         *weights,
+                                            const ITensorInfo         *biases,
+                                            const ITensorInfo         *output,
+                                            const PadStrideInfo       &conv_info,
                                             const ActivationLayerInfo &act_info)
 {
     return opencl::ClIndirectConv2d::validate(input, weights, biases, output, conv_info, act_info);
@@ -83,4 +97,4 @@
     pack.add_tensor(TensorType::ACL_DST, _impl->dst);
     _impl->op->run(pack);
 }
-}
+} // namespace arm_compute
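
The final hunk of this file shows the namespace-comment rule in isolation: a
bare closing brace gains a trailing comment naming the namespace it closes. A
minimal before/after sketch, assuming FixNamespaceComments is enabled (the
declaration inside is hypothetical, for illustration only):

    // Before: the closing brace carries no namespace annotation.
    namespace arm_compute
    {
    void example_function(); // hypothetical
    }

    // After clang-format with FixNamespaceComments: true
    namespace arm_compute
    {
    void example_function(); // hypothetical
    } // namespace arm_compute
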
diff --git a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
index 5feafe1..0e994e1 100644
--- a/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLInstanceNormalizationLayer.cpp
@@ -27,50 +27,62 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLHelpers.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/ICLKernel.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLInstanceNormalizationLayerKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 CLInstanceNormalizationLayer::CLInstanceNormalizationLayer(CLRuntimeContext *ctx) // NOLINT
-    : _inst_norm_kernel(),
-      _mean_var_kernel(),
-      _mean_var_tensor(),
-      _ctx(ctx)
+    : _inst_norm_kernel(), _mean_var_kernel(), _mean_var_tensor(), _ctx(ctx)
 {
 }
 CLInstanceNormalizationLayer::~CLInstanceNormalizationLayer()
 {
 }
 
-void CLInstanceNormalizationLayer::configure(ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+void CLInstanceNormalizationLayer::configure(
+    ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, gamma, beta, epsilon, use_mixed_precision);
 }
 
-void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+void CLInstanceNormalizationLayer::configure(const CLCompileContext &compile_context,
+                                             ICLTensor              *input,
+                                             ICLTensor              *output,
+                                             float                   gamma,
+                                             float                   beta,
+                                             float                   epsilon,
+                                             bool                    use_mixed_precision)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon, use_mixed_precision);
     auto w = std::make_unique<CLComputeMeanVariance>();
     w->configure(compile_context, input, &_mean_var_tensor, use_mixed_precision);
     _mean_var_kernel = std::move(w);
     auto k           = std::make_unique<CLInstanceNormalizationLayerKernel>();
-    k->configure(compile_context, input, &_mean_var_tensor, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
+    k->configure(compile_context, input, &_mean_var_tensor, output,
+                 InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
     _inst_norm_kernel = std::move(k);
     _mean_var_tensor.allocator()->allocate();
 }
 
-Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon, bool use_mixed_precision)
+Status CLInstanceNormalizationLayer::validate(const ITensorInfo *input,
+                                              const ITensorInfo *output,
+                                              float              gamma,
+                                              float              beta,
+                                              float              epsilon,
+                                              bool               use_mixed_precision)
 {
-    return CLInstanceNormalizationLayerKernel::validate(input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
+    return CLInstanceNormalizationLayerKernel::validate(
+        input, output, InstanceNormalizationLayerKernelInfo(gamma, beta, epsilon, use_mixed_precision));
 }
 
 void CLInstanceNormalizationLayer::run()
 {
-    ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel, "The child class didn't set the CL kernel or function isn't configured");
+    ARM_COMPUTE_ERROR_ON_MSG(!_inst_norm_kernel,
+                             "The child class didn't set the CL kernel or function isn't configured");
     schedule_kernel_on_ctx(_ctx, _mean_var_kernel.get());
     schedule_kernel_on_ctx(_ctx, _inst_norm_kernel.get());
 }
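
This file and CLLSTMLayer.cpp below both show the include-handling side of the
configuration: headers sort case-insensitively (so "arm_compute/core/utils/..."
now sorts before "arm_compute/core/Validate.h"), and internal "src/..." headers
are regrouped into their own blank-line-separated block. A sketch of settings
that would produce this, with the regexes being guesses:

    SortIncludes:  CaseInsensitive   # 'u' sorts before 'V' once case is folded
    IncludeBlocks: Regroup
    IncludeCategories:
      - Regex:    '^"arm_compute/'   # public API headers first
        Priority: 1
      - Regex:    '^"src/'           # internal headers in a separate block
        Priority: 2
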
diff --git a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
index 1278385..4fe1d9b 100644
--- a/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
+++ b/src/runtime/CL/functions/CLL2NormalizeLayer.cpp
@@ -29,12 +29,12 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLL2NormalizeLayerKernel.h"
 #include "src/core/CL/kernels/CLReductionOperationKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace
@@ -57,7 +57,8 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, epsilon);
 }
 
-void CLL2NormalizeLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon)
+void CLL2NormalizeLayer::configure(
+    const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, axis, epsilon);
 
@@ -86,7 +87,8 @@
     sum_sq.set_tensor_shape(shape);
 
     const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
 
     // Reduce shape on axis
     shape.set(actual_axis, 1);
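
The CLLSTMLayer.cpp diff that follows is dominated by two of these rules
applied at scale: the constructor's member-initializer wall becomes one
initializer per line, and the twenty-parameter configure()/validate()
signatures become one parameter per line with aligned types. A minimal sketch
of the declaration behaviour, assuming BinPackParameters: false together with
AllowAllParametersOfDeclarationOnNextLine: true (signatures abridged from the
hunks below):

    // All parameters fit on a single continuation line: keep them together.
    void configure(
        const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int axis, float epsilon);

    // They no longer fit on one line: break to one parameter per line,
    // with types and names column-aligned by AlignConsecutiveDeclarations.
    void configure(const CLCompileContext      &compile_context,
                   const ICLTensor             *input,
                   const LSTMParams<ICLTensor> &lstm_params,
                   float                        cell_threshold);
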
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index ea08bec..3b50234 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -24,15 +24,15 @@
 #include "arm_compute/runtime/CL/functions/CLLSTMLayer.h"
 
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/gpu/cl/kernels/ClTransposeKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/gpu/cl/kernels/ClTransposeKernel.h"
 
 namespace arm_compute
 {
@@ -40,54 +40,155 @@
 using namespace arm_compute::utils::info_helpers;
 
 CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
-      _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(),
-      _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()), _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(),
-      _pixelwise_mul_cell_state2(), _fully_connected_output(), _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(),
-      _fully_connected_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(),
-      _concat_weights_input_gate(), _concat_weights_output(), _ones_fill(), _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(),
-      _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(), _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(),
-      _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(),
-      _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(),
-      _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(), _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(),
-      _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(), _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false),
-      _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false), _is_layer_norm_lstm(false)
+    : _memory_group(std::move(memory_manager)),
+      _fully_connected_input_gate(),
+      _accum_input_gate1(),
+      _subtract_input_gate(),
+      _pixelwise_mul_input_gate(),
+      _activation_input_gate(),
+      _fully_connected_forget_gate(),
+      _accum_forget_gate1(),
+      _pixelwise_mul_forget_gate(),
+      _activation_forget_gate(),
+      _fully_connected_cell_state(),
+      _gemm_cell_state1(),
+      _transpose_cell_state(std::make_unique<opencl::kernels::ClTransposeKernel>()),
+      _accum_cell_state1(),
+      _accum_cell_state2(),
+      _pixelwise_mul_cell_state1(),
+      _activation_cell_state(),
+      _cell_clip(),
+      _pixelwise_mul_cell_state2(),
+      _fully_connected_output(),
+      _pixelwise_mul_output_state1(),
+      _accum_output1(),
+      _activation_output(),
+      _activation_output_state(),
+      _pixelwise_mul_output_state2(),
+      _fully_connected_output_state(),
+      _projection_clip(),
+      _copy_cell_state(),
+      _copy_output(),
+      _concat_scratch_buffer(),
+      _concat_inputs_forget_gate(),
+      _concat_weights_forget_gate(),
+      _concat_weights_input_gate(),
+      _concat_weights_output(),
+      _ones_fill(),
+      _mean_std_norm_input_gate(),
+      _pixelwise_mul_input_gate_coeff(),
+      _accum_input_gate_bias(),
+      _mean_std_norm_forget_gate(),
+      _pixelwise_mul_forget_gate_coeff(),
+      _accum_forget_gate_bias(),
+      _mean_std_norm_cell_gate(),
+      _pixelwise_mul_cell_gate_coeff(),
+      _accum_cell_gate_bias(),
+      _mean_std_norm_output_gate(),
+      _pixelwise_mul_output_gate_coeff(),
+      _accum_output_gate_bias(),
+      _input_gate_out1(),
+      _input_gate_out2(),
+      _input_gate_out3(),
+      _input_gate_out4(),
+      _forget_gate_out1(),
+      _forget_gate_out2(),
+      _forget_gate_out3(),
+      _forget_gate_out4(),
+      _forget_gate_out5(),
+      _forget_gate_out6(),
+      _cell_state_out1(),
+      _cell_state_out2(),
+      _cell_state_out3(),
+      _cell_state_out4(),
+      _cell_state_out5(),
+      _output1(),
+      _output2(),
+      _output3(),
+      _output4(),
+      _cell_state_activation(),
+      _output_state1(),
+      _ones(),
+      _input_layer_norm_out1(),
+      _input_layer_norm_out2(),
+      _forget_layer_norm_out1(),
+      _forget_layer_norm_out2(),
+      _cell_layer_norm_out1(),
+      _cell_layer_norm_out2(),
+      _output_layer_norm_out1(),
+      _output_layer_norm_out2(),
+      _run_peephole_opt(false),
+      _run_cifg_opt(false),
+      _perform_cell_clipping(false),
+      _has_projection_weights(false),
+      _perform_projection_clipping(false),
+      _is_prepared(false),
+      _is_layer_norm_lstm(false)
 {
 }
 
 CLLSTMLayer::~CLLSTMLayer() = default;
 
-void CLLSTMLayer::configure(const ICLTensor *input,
-                            const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
-                            const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
-                            const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                            const ICLTensor *output_state_in, ICLTensor *cell_state_in,
-                            ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
-                            const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void CLLSTMLayer::configure(const ICLTensor             *input,
+                            const ICLTensor             *input_to_forget_weights,
+                            const ICLTensor             *input_to_cell_weights,
+                            const ICLTensor             *input_to_output_weights,
+                            const ICLTensor             *recurrent_to_forget_weights,
+                            const ICLTensor             *recurrent_to_cell_weights,
+                            const ICLTensor             *recurrent_to_output_weights,
+                            const ICLTensor             *forget_gate_bias,
+                            const ICLTensor             *cell_bias,
+                            const ICLTensor             *output_gate_bias,
+                            const ICLTensor             *output_state_in,
+                            ICLTensor                   *cell_state_in,
+                            ICLTensor                   *scratch_buffer,
+                            ICLTensor                   *output_state_out,
+                            ICLTensor                   *cell_state_out,
+                            ICLTensor                   *output,
+                            const LSTMParams<ICLTensor> &lstm_params,
+                            const ActivationLayerInfo   &activation_info,
+                            float                        cell_threshold,
+                            float                        projection_threshold)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
-              recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights,
+              input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+              recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in,
+              cell_state_in, scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
               cell_threshold, projection_threshold);
 }
 
-void CLLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
-                            const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
-                            const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
-                            const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                            const ICLTensor *output_state_in, ICLTensor *cell_state_in,
-                            ICLTensor *scratch_buffer, ICLTensor *output_state_out, ICLTensor *cell_state_out, ICLTensor *output,
-                            const LSTMParams<ICLTensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void CLLSTMLayer::configure(const CLCompileContext      &compile_context,
+                            const ICLTensor             *input,
+                            const ICLTensor             *input_to_forget_weights,
+                            const ICLTensor             *input_to_cell_weights,
+                            const ICLTensor             *input_to_output_weights,
+                            const ICLTensor             *recurrent_to_forget_weights,
+                            const ICLTensor             *recurrent_to_cell_weights,
+                            const ICLTensor             *recurrent_to_output_weights,
+                            const ICLTensor             *forget_gate_bias,
+                            const ICLTensor             *cell_bias,
+                            const ICLTensor             *output_gate_bias,
+                            const ICLTensor             *output_state_in,
+                            ICLTensor                   *cell_state_in,
+                            ICLTensor                   *scratch_buffer,
+                            ICLTensor                   *output_state_out,
+                            ICLTensor                   *cell_state_out,
+                            ICLTensor                   *output,
+                            const LSTMParams<ICLTensor> &lstm_params,
+                            const ActivationLayerInfo   &activation_info,
+                            float                        cell_threshold,
+                            float                        projection_threshold)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input,
-                                 input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
                                  recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                 forget_gate_bias, cell_bias, output_gate_bias,
-                                 output_state_in, cell_state_in,
+                                 forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
                                  scratch_buffer, output_state_out, cell_state_out, output);
 
-    ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
-                           recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out,
-                           output, lstm_params, activation_info, cell_threshold, projection_threshold);
+    ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                           recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+                           forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
+                           scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+                           cell_threshold, projection_threshold);
 
     _is_layer_norm_lstm = lstm_params.use_layer_norm();
 
@@ -96,13 +197,12 @@
     build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
 
     // Validate
-    ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(input->info(), input_to_forget_weights->info(),
-                                                     input_to_cell_weights->info(), input_to_output_weights->info(),
-                                                     recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
-                                                     forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
-                                                     output_state_in->info(), cell_state_in->info(),
-                                                     scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
-                                                     lstm_params_info, activation_info, cell_threshold, projection_threshold));
+    ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayer::validate(
+        input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+        recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+        forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(),
+        cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
+        lstm_params_info, activation_info, cell_threshold, projection_threshold));
 
     const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
     // Configure block that calculates the forget gate
@@ -126,26 +226,31 @@
 
     weights_vector.emplace_back(input_to_forget_weights);
     weights_vector.emplace_back(recurrent_to_forget_weights);
-    const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
+    const TensorShape weights_concat_shape =
+        arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
     _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
 
     _concat_weights_forget_gate.configure(compile_context, weights_vector, &_forget_gate_out6, Window::DimX);
 
     _memory_group.manage(&_forget_gate_out5);
-    _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
+    _fully_connected_forget_gate.configure(compile_context, &_forget_gate_out2, &_forget_gate_out6,
+                                           (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
     _memory_group.manage(&_forget_gate_out1);
     _memory_group.manage(&_forget_gate_out3);
     _forget_gate_out6.allocator()->allocate();
 
     CLTensor *forget_gate_out = &_forget_gate_out5;
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
         _run_peephole_opt = true;
         _memory_group.manage(&_forget_gate_out4);
-        _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-        _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+        _pixelwise_mul_forget_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(),
+                                             &_forget_gate_out4, 1, ConvertPolicy::SATURATE,
+                                             RoundingPolicy::TO_NEAREST_EVEN);
+        _accum_forget_gate1.configure(compile_context, &_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3,
+                                      ConvertPolicy::SATURATE);
         _forget_gate_out4.allocator()->allocate();
         _forget_gate_out5.allocator()->allocate();
         forget_gate_out = &_forget_gate_out3;
@@ -154,22 +259,25 @@
     {
         _forget_gate_out3.allocator()->allocate();
     }
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _memory_group.manage(&_forget_layer_norm_out1);
         _memory_group.manage(&_forget_layer_norm_out2);
         _mean_std_norm_forget_gate.configure(compile_context, forget_gate_out);
-        _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE,
-                                                   RoundingPolicy::TO_NEAREST_EVEN);
+        _pixelwise_mul_forget_gate_coeff.configure(compile_context, forget_gate_out,
+                                                   lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1,
+                                                   ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
         // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
         forget_gate_out->allocator()->allocate();
-        _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_forget_gate_bias.configure(compile_context, &_forget_layer_norm_out1, forget_gate_bias,
+                                          &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
         _forget_layer_norm_out1.allocator()->allocate();
         forget_gate_out = &_forget_layer_norm_out2;
     }
-    _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _activation_forget_gate.configure(compile_context, forget_gate_out, nullptr,
+                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
 
     // Configure block that calculates the input gate
     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
@@ -178,12 +286,13 @@
     // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     CLTensor *input_gate_out = &_input_gate_out1;
-    if(lstm_params.has_cifg_opt())
+    if (lstm_params.has_cifg_opt())
     {
         _memory_group.manage(&_input_gate_out1);
         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _ones_fill.configure(compile_context, &_ones, PixelValue(1, _ones.info()->data_type()));
-        _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
+        _subtract_input_gate.configure(compile_context, &_ones, forget_gate_out, &_input_gate_out1,
+                                       ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
         _run_cifg_opt = true;
     }
@@ -195,7 +304,8 @@
         std::vector<const ICLTensor *> lstm_weights;
         lstm_weights.emplace_back(lstm_params.input_to_input_weights());
         lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
-        TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+        TensorShape lstm_weights_concat_shape =
+            arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
         _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type()));
 
         _concat_weights_input_gate.configure(compile_context, lstm_weights, &_input_gate_out2, Window::DimX);
@@ -203,15 +313,20 @@
         _memory_group.manage(&_input_gate_out1);
 
         _memory_group.manage(&_input_gate_out3);
-        _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
+        _fully_connected_input_gate.configure(compile_context, &_forget_gate_out2, &_input_gate_out2,
+                                              (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(),
+                                              &_input_gate_out3);
         _input_gate_out2.allocator()->allocate();
 
         input_gate_out = &_input_gate_out3;
-        if(_run_peephole_opt)
+        if (_run_peephole_opt)
         {
             _memory_group.manage(&_input_gate_out4);
-            _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-            _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _pixelwise_mul_input_gate.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(),
+                                                &_input_gate_out4, 1, ConvertPolicy::SATURATE,
+                                                RoundingPolicy::TO_NEAREST_EVEN);
+            _accum_input_gate1.configure(compile_context, &_input_gate_out3, &_input_gate_out4, &_input_gate_out1,
+                                         ConvertPolicy::SATURATE);
             _input_gate_out3.allocator()->allocate();
             _input_gate_out4.allocator()->allocate();
             input_gate_out = &_input_gate_out1;
@@ -221,22 +336,25 @@
             _input_gate_out1.allocator()->allocate();
         }
 
-        if(_is_layer_norm_lstm)
+        if (_is_layer_norm_lstm)
         {
             _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
             _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
             _memory_group.manage(&_input_layer_norm_out1);
             _memory_group.manage(&_input_layer_norm_out2);
             _mean_std_norm_input_gate.configure(compile_context, input_gate_out);
-            _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE,
-                                                      RoundingPolicy::TO_NEAREST_EVEN);
+            _pixelwise_mul_input_gate_coeff.configure(compile_context, input_gate_out,
+                                                      lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1,
+                                                      1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
             // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
             input_gate_out->allocator()->allocate();
-            _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+            _accum_input_gate_bias.configure(compile_context, &_input_layer_norm_out1, lstm_params.input_gate_bias(),
+                                             &_input_layer_norm_out2, ConvertPolicy::SATURATE);
             _input_layer_norm_out1.allocator()->allocate();
             input_gate_out = &_input_layer_norm_out2;
         }
-        _activation_input_gate.configure(compile_context, input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        _activation_input_gate.configure(compile_context, input_gate_out, nullptr,
+                                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     }
 
     // Configure block that calculates the cell state
@@ -249,44 +367,54 @@
     _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
     _memory_group.manage(&_cell_state_out1);
-    _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
+    _fully_connected_cell_state.configure(compile_context, input, input_to_cell_weights,
+                                          (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
     _memory_group.manage(&_cell_state_out2);
     _transpose_cell_state->configure(compile_context, recurrent_to_cell_weights->info(), _cell_state_out2.info());
     _recurrent_to_cell_weights = recurrent_to_cell_weights;
     _memory_group.manage(&_cell_state_out3);
-    _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f, 0.f);
+    _gemm_cell_state1.configure(compile_context, output_state_in, &_cell_state_out2, nullptr, &_cell_state_out3, 1.f,
+                                0.f);
     _cell_state_out2.allocator()->allocate();
     _memory_group.manage(&_cell_state_out4);
-    _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
+    _accum_cell_state1.configure(compile_context, &_cell_state_out1, &_cell_state_out3, &_cell_state_out4,
+                                 ConvertPolicy::SATURATE);
     CLTensor *cell_state_out_ptr = &_cell_state_out4;
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _memory_group.manage(&_cell_layer_norm_out1);
         _memory_group.manage(&_cell_layer_norm_out2);
         _mean_std_norm_cell_gate.configure(compile_context, cell_state_out_ptr);
-        _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE,
-                                                 RoundingPolicy::TO_NEAREST_EVEN);
+        _pixelwise_mul_cell_gate_coeff.configure(compile_context, cell_state_out_ptr,
+                                                 lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1,
+                                                 ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
         // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
         cell_state_out_ptr->allocator()->allocate();
-        _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_cell_gate_bias.configure(compile_context, &_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2,
+                                        ConvertPolicy::SATURATE);
         _cell_layer_norm_out1.allocator()->allocate();
         cell_state_out_ptr = &_cell_layer_norm_out2;
     }
     _activation_cell_state.configure(compile_context, cell_state_out_ptr, nullptr, activation_info);
     _memory_group.manage(&_cell_state_out5);
-    _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _pixelwise_mul_cell_state1.configure(compile_context, cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1,
+                                         ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
     cell_state_out_ptr->allocator()->allocate();
-    _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
-    _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
+    _pixelwise_mul_cell_state2.configure(compile_context, forget_gate_out, cell_state_in, &_cell_state_out3, 1,
+                                         ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _accum_cell_state2.configure(compile_context, &_cell_state_out5, &_cell_state_out3, &_cell_state_out1,
+                                 ConvertPolicy::SATURATE);
     _cell_state_out3.allocator()->allocate();
     _cell_state_out5.allocator()->allocate();
     // Perform clipping
-    if(cell_threshold != 0.f)
+    if (cell_threshold != 0.f)
     {
         _perform_cell_clipping = true;
-        _cell_clip.configure(compile_context, &_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold));
+        _cell_clip.configure(compile_context, &_cell_state_out1, nullptr,
+                             ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                 cell_threshold, -cell_threshold));
     }
 
     // Configure block that calculates the output
@@ -298,7 +426,8 @@
     std::vector<const ICLTensor *> in_out_weights;
     in_out_weights.emplace_back(input_to_output_weights);
     in_out_weights.emplace_back(recurrent_to_output_weights);
-    TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+    TensorShape in_out_weights_concat_shape =
+        arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
     _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type()));
 
     _concat_weights_output.configure(compile_context, in_out_weights, &_output2, Window::DimX);
@@ -306,18 +435,20 @@
     _memory_group.manage(&_output1);
     _memory_group.manage(&_output4);
 
-    _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
+    _fully_connected_output.configure(compile_context, &_forget_gate_out2, &_output2,
+                                      (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
 
     _output2.allocator()->allocate();
     _forget_gate_out2.allocator()->allocate();
 
     CLTensor *output_gate_out = &_output4;
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
 
         _memory_group.manage(&_output3);
-        _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+        _pixelwise_mul_output_state1.configure(compile_context, &_cell_state_out1, lstm_params.cell_to_output_weights(),
+                                               &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
         _accum_output1.configure(compile_context, &_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
         _output4.allocator()->allocate();
         output_gate_out = &_output1;
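 
 The peephole branch above adds a per-cell contribution from the freshly updated cell state to the output-gate pre-activation, i.e. the standard peephole term (sketch, usual naming):
 
     \[
     z_o \;=\; W_o x_t + U_o h_{t-1} + b_o + w_{co} \odot c_t,
     \]
 
 with the element-wise product provided by _pixelwise_mul_output_state1 and the addition by _accum_output1.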
@@ -329,22 +460,25 @@
     {
         _output1.allocator()->allocate();
     }
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _memory_group.manage(&_output_layer_norm_out1);
         _memory_group.manage(&_output_layer_norm_out2);
         _mean_std_norm_output_gate.configure(compile_context, output_gate_out);
-        _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE,
-                                                   RoundingPolicy::TO_NEAREST_EVEN);
+        _pixelwise_mul_output_gate_coeff.configure(compile_context, output_gate_out,
+                                                   lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1,
+                                                   ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
         // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
         output_gate_out->allocator()->allocate();
-        _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_output_gate_bias.configure(compile_context, &_output_layer_norm_out1, output_gate_bias,
+                                          &_output_layer_norm_out2, ConvertPolicy::SATURATE);
         _output_layer_norm_out1.allocator()->allocate();
         output_gate_out = &_output_layer_norm_out2;
     }
-    _activation_output.configure(compile_context, output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _activation_output.configure(compile_context, output_gate_out, nullptr,
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
 
     // Configure block that calculates the output state
     /** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -361,19 +495,24 @@
 
     _memory_group.manage(&_cell_state_activation);
     _activation_output_state.configure(compile_context, &_cell_state_out1, &_cell_state_activation, activation_info);
-    _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN);
+    _pixelwise_mul_output_state2.configure(compile_context, &_cell_state_activation, output_gate_out,
+                                           output_state_out_tmp, 1, ConvertPolicy::SATURATE,
+                                           RoundingPolicy::TO_NEAREST_EVEN);
     _cell_state_activation.allocator()->allocate();
 
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
         _has_projection_weights = true;
-        _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+        _fully_connected_output_state.configure(compile_context, output_state_out_tmp, lstm_params.projection_weights(),
+                                                lstm_params.projection_bias(), output_state_out);
         _output_state1.allocator()->allocate();
         // Perform clipping
-        if(projection_threshold != 0.f)
+        if (projection_threshold != 0.f)
         {
             _perform_projection_clipping = true;
-            _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+            _projection_clip.configure(compile_context, output_state_out, nullptr,
+                                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                           -projection_threshold, projection_threshold));
         }
     }
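 
 Taken together, the output-state path configured above is the usual one (sketch, usual naming):
 
     \[
     h_t = o_t \odot \mathrm{act}(c_t), \qquad
     \mathrm{out}_t = \mathrm{clip}_{[-\pi,\,\pi]}\bigl(W_{proj}\, h_t + b_{proj}\bigr)\ \text{when projection is enabled},
     \qquad \pi = \texttt{projection\_threshold},
     \]
 
 with the projection realised by a fully connected layer and the optional clip again an LU_BOUNDED_RELU, skipped when the threshold is zero.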
 
@@ -383,7 +522,7 @@
 
     // Vector for holding the tensors to store in scratch buffer
     std::vector<const ICLTensor *> scratch_inputs;
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
         scratch_inputs.emplace_back(input_gate_out);
     }
@@ -397,29 +536,38 @@
     output_gate_out->allocator()->allocate();
 }
 
-Status CLLSTMLayer::validate(const ITensorInfo *input,
-                             const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                             const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                             const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                             const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
-                             const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
-                             const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+Status CLLSTMLayer::validate(const ITensorInfo             *input,
+                             const ITensorInfo             *input_to_forget_weights,
+                             const ITensorInfo             *input_to_cell_weights,
+                             const ITensorInfo             *input_to_output_weights,
+                             const ITensorInfo             *recurrent_to_forget_weights,
+                             const ITensorInfo             *recurrent_to_cell_weights,
+                             const ITensorInfo             *recurrent_to_output_weights,
+                             const ITensorInfo             *forget_gate_bias,
+                             const ITensorInfo             *cell_bias,
+                             const ITensorInfo             *output_gate_bias,
+                             const ITensorInfo             *output_state_in,
+                             const ITensorInfo             *cell_state_in,
+                             const ITensorInfo             *scratch_buffer,
+                             const ITensorInfo             *output_state_out,
+                             const ITensorInfo             *cell_state_out,
+                             const ITensorInfo             *output,
+                             const LSTMParams<ITensorInfo> &lstm_params,
+                             const ActivationLayerInfo     &activation_info,
+                             float                          cell_threshold,
+                             float                          projection_threshold)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
-                                        input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                        recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                        forget_gate_bias, cell_bias, output_gate_bias,
-                                        output_state_in, cell_state_in,
-                                        scratch_buffer, output_state_out, cell_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+        input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+        recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+        output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
 
     // Check data types
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
-                                                       input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                                       recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                                       forget_gate_bias, cell_bias, output_gate_bias,
-                                                       output_state_in, cell_state_in,
-                                                       scratch_buffer, output_state_out, cell_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+        input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+        recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+        output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
 
     // Check dimensions
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
@@ -438,16 +586,16 @@
     ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
-                                && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) &&
+                                cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
 
     const unsigned int num_batches = input->dimension(1);
     const unsigned int num_cells   = input_to_output_weights->dimension(1);
 
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
         // If CIFG is used, input layer normalization weights tensor is omitted
-        if(lstm_params.has_cifg_opt())
+        if (lstm_params.has_cifg_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
         }
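 
 The scratch-buffer width check a few lines up encodes the buffer's layout: the gate results are concatenated along dimension 0, four of them normally and three when CIFG drops the input gate. A hypothetical helper (illustrative only, not in the library) states the rule compactly:
 
     // Illustrative only: expected scratch-buffer width for CLLSTMLayer.
     // With the coupled input-forget gate (CIFG) optimisation the input gate
     // is not stored, leaving three gate tensors concatenated instead of four.
     unsigned int expected_scratch_width(unsigned int num_cells, bool has_cifg)
     {
         return num_cells * (has_cifg ? 3u : 4u);
     }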
@@ -459,8 +607,12 @@
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
         }
 
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
+                                            lstm_params.cell_layer_norm_weights(),
+                                            lstm_params.output_layer_norm_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(),
+                                                           lstm_params.cell_layer_norm_weights(),
+                                                           lstm_params.output_layer_norm_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
@@ -470,7 +622,7 @@
     }
 
     // Check peephole optimization
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
@@ -488,36 +640,42 @@
     TensorInfo cell_state_tmp  = TensorInfo(TensorShape(num_cells, num_batches), 1, input->data_type());
 
     // Validate forget gate
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+        input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
 
     std::vector<const ITensorInfo *> inputs_vector;
     inputs_vector.emplace_back(input);
     inputs_vector.emplace_back(output_state_in);
-    const TensorShape concat_shape       = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+    const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
     TensorInfo        forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
 
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&forget_gate));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+        &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate input gate
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
-                                            lstm_params.recurrent_to_input_weights(),
-                                            lstm_params.input_gate_bias());
+                                            lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
@@ -525,88 +683,121 @@
         std::vector<const ITensorInfo *> lstm_weights;
         lstm_weights.emplace_back(lstm_params.input_to_input_weights());
         lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
-        TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
-        TensorInfo  lstm_gate_concat          = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+        TensorShape lstm_weights_concat_shape =
+            arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+        TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
         ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+            input, lstm_params.input_to_input_weights(),
+            (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
 
-        if(lstm_params.has_peephole_opt())
+        if (lstm_params.has_peephole_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
-            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1,
+                                                    ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
         }
 
-        if(lstm_params.use_layer_norm())
+        if (lstm_params.use_layer_norm())
         {
             ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&input_gate));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1,
+                                                    ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(),
+                                                                       &input_gate, ConvertPolicy::SATURATE));
         }
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+            &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
 
     // Validate cell state
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
-    if(lstm_params.use_layer_norm())
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+        input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+    if (lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp,
+                                                1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
     }
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
-    if(cell_threshold != 0.f)
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+    if (cell_threshold != 0.f)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold,
-                                                                                                              -cell_threshold)));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLActivationLayer::validate(&cell_state_tmp, nullptr,
+                                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                            cell_threshold, -cell_threshold)));
     }
 
     std::vector<const ITensorInfo *> in_out_weights;
     in_out_weights.emplace_back(input_to_output_weights);
     in_out_weights.emplace_back(recurrent_to_output_weights);
-    TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
-    TensorInfo  in_out_gate_concat          = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+    TensorShape in_out_weights_concat_shape =
+        arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+    TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
     ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX));
     // Validate output gate tmp
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(
+        input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
 
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp,
+                                                1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp,
+                                                                   ConvertPolicy::SATURATE));
     }
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_NEAREST_EVEN));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+            &output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
+            RoundingPolicy::TO_NEAREST_EVEN));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp,
+                                                                   ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+        &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate output state
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN));
-    if(lstm_params.has_projection())
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp,
+                                                                    1, ConvertPolicy::SATURATE,
+                                                                    RoundingPolicy::TO_NEAREST_EVEN));
+    if (lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
-        if(projection_threshold != 0.f)
+        ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(),
+                                                                    lstm_params.projection_bias(), output_state_out));
+        if (projection_threshold != 0.f)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, output_state_out,
-                                                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+                output_state_out, output_state_out,
+                ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
+                                    projection_threshold)));
         }
     }
 
@@ -616,7 +807,7 @@
 
     // Validate scratch concatenation
     std::vector<const ITensorInfo *> inputs_vector_info_raw;
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
         inputs_vector_info_raw.push_back(&input_gate);
     }
@@ -638,12 +829,12 @@
 
     _fully_connected_forget_gate.run();
 
-    if(_run_peephole_opt)
+    if (_run_peephole_opt)
     {
         _pixelwise_mul_forget_gate.run();
         _accum_forget_gate1.run();
     }
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _mean_std_norm_forget_gate.run();
         _pixelwise_mul_forget_gate_coeff.run();
@@ -651,7 +842,7 @@
     }
     _activation_forget_gate.run();
 
-    if(_run_cifg_opt)
+    if (_run_cifg_opt)
     {
         _ones_fill.run();
         _subtract_input_gate.run();
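 
 The CIFG path above never computes the input gate from weights: a tensor is filled with ones and the forget gate is subtracted from it, which is the coupled input-forget gate
 
     \[
     i_t = 1 - f_t,
     \]
 
 the usual reading of the _ones_fill / _subtract_input_gate pair.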
@@ -660,13 +851,13 @@
     {
         _fully_connected_input_gate.run();
 
-        if(_run_peephole_opt)
+        if (_run_peephole_opt)
         {
             _pixelwise_mul_input_gate.run();
             _accum_input_gate1.run();
         }
 
-        if(_is_layer_norm_lstm)
+        if (_is_layer_norm_lstm)
         {
             _mean_std_norm_input_gate.run();
             _pixelwise_mul_input_gate_coeff.run();
@@ -679,12 +870,10 @@
     ITensorPack pack;
     pack.add_tensor(TensorType::ACL_SRC, _recurrent_to_cell_weights);
     pack.add_tensor(TensorType::ACL_DST, &_cell_state_out2);
-    CLScheduler::get().enqueue_op(*_transpose_cell_state,
-                                  pack,
-                                  false);
+    CLScheduler::get().enqueue_op(*_transpose_cell_state, pack, false);
     _gemm_cell_state1.run();
     _accum_cell_state1.run();
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _mean_std_norm_cell_gate.run();
         _pixelwise_mul_cell_gate_coeff.run();
@@ -695,19 +884,19 @@
     _pixelwise_mul_cell_state2.run();
     _accum_cell_state2.run();
 
-    if(_perform_cell_clipping)
+    if (_perform_cell_clipping)
     {
         _cell_clip.run();
     }
 
     _fully_connected_output.run();
 
-    if(_run_peephole_opt)
+    if (_run_peephole_opt)
     {
         _pixelwise_mul_output_state1.run();
         _accum_output1.run();
     }
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _mean_std_norm_output_gate.run();
         _pixelwise_mul_output_gate_coeff.run();
@@ -718,10 +907,10 @@
     _activation_output_state.run();
     _pixelwise_mul_output_state2.run();
 
-    if(_has_projection_weights)
+    if (_has_projection_weights)
     {
         _fully_connected_output_state.run();
-        if(_perform_projection_clipping)
+        if (_perform_projection_clipping)
         {
             _projection_clip.run();
         }
@@ -735,10 +924,10 @@
 
 void CLLSTMLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         _concat_weights_forget_gate.run();
-        if(!_run_cifg_opt)
+        if (!_run_cifg_opt)
         {
             _concat_weights_input_gate.run();
         }
diff --git a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
index d14c610..ea64eda 100644
--- a/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayerQuantized.cpp
@@ -25,12 +25,12 @@
 #include "arm_compute/runtime/CL/functions/CLLSTMLayerQuantized.h"
 
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
+#include "arm_compute/core/Validate.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <memory>
 
@@ -46,48 +46,129 @@
 } // namespace
 
 CLLSTMLayerQuantized::CLLSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(),
-      _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add_cell_state_tmps(), _add2(), _mul_forget_gate_cell_state(),
-      _mul_input_gate_input_mod_gate(), _mul_output_state_tmp_output_gate(), _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(),
-      _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr), _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr),
-      _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr), _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr),
-      _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(), _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(),
-      _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(), _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state_tmp1(), _cell_state_tmp2(),
-      _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(), _is_prepared(false)
+    : _memory_group(std::move(memory_manager)),
+      _gemmlowp(),
+      _output_stage(),
+      _transpose_weights(),
+      _concat_input_weights(),
+      _concat_recurrent_weights(),
+      _concat_weights(),
+      _concat_inputs(),
+      _concat_bias(),
+      _sigmoid_forget_gate(),
+      _sigmoid_input_gate(),
+      _sigmoid_output_gate(),
+      _tanh_modulation_gate(),
+      _tanh_output_state(),
+      _add_cell_state_tmps(),
+      _add2(),
+      _mul_forget_gate_cell_state(),
+      _mul_input_gate_input_mod_gate(),
+      _mul_output_state_tmp_output_gate(),
+      _slice_input_tensor(),
+      _slice_forget_tensor(),
+      _slice_cell_tensor(),
+      _slice_output_tensor(),
+      _dequantize(),
+      _quantize(),
+      _input_to_input_weights(nullptr),
+      _input_to_forget_weights(nullptr),
+      _input_to_cell_weights(nullptr),
+      _input_to_output_weights(nullptr),
+      _recurrent_to_input_weights(nullptr),
+      _recurrent_to_forget_weights(nullptr),
+      _recurrent_to_cell_weights(nullptr),
+      _recurrent_to_output_weights(nullptr),
+      _input_gate_bias(nullptr),
+      _forget_gate_bias(nullptr),
+      _cell_bias(nullptr),
+      _output_gate_bias(nullptr),
+      _recurrent_weights(),
+      _input_weights(),
+      _weights(),
+      _input(),
+      _weights_transposed(),
+      _output_highp(),
+      _output_lowp(),
+      _bias(),
+      _forget_gate_input(),
+      _input_gate_input(),
+      _output_gate_input(),
+      _input_modulation_gate_input(),
+      _forget_gate_output(),
+      _input_gate_output(),
+      _output_gate_output(),
+      _input_modulation_gate_output(),
+      _cell_state_tmp1(),
+      _cell_state_tmp2(),
+      _output_state_tmp(),
+      _output_state_out_symm(),
+      _output_state_out_f32(),
+      _is_prepared(false)
 {
 }
 
 void CLLSTMLayerQuantized::configure(const ICLTensor *input,
-                                     const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
-                                     const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
-                                     const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                                     ICLTensor *cell_state_in, const ICLTensor *output_state_in,
-                                     ICLTensor *cell_state_out, ICLTensor *output_state_out)
+                                     const ICLTensor *input_to_input_weights,
+                                     const ICLTensor *input_to_forget_weights,
+                                     const ICLTensor *input_to_cell_weights,
+                                     const ICLTensor *input_to_output_weights,
+                                     const ICLTensor *recurrent_to_input_weights,
+                                     const ICLTensor *recurrent_to_forget_weights,
+                                     const ICLTensor *recurrent_to_cell_weights,
+                                     const ICLTensor *recurrent_to_output_weights,
+                                     const ICLTensor *input_gate_bias,
+                                     const ICLTensor *forget_gate_bias,
+                                     const ICLTensor *cell_bias,
+                                     const ICLTensor *output_gate_bias,
+                                     ICLTensor       *cell_state_in,
+                                     const ICLTensor *output_state_in,
+                                     ICLTensor       *cell_state_out,
+                                     ICLTensor       *output_state_out)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
-              recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
-              output_state_out);
+    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_input_weights, input_to_forget_weights,
+              input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+              recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias,
+              output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
 }
 
-void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context, const ICLTensor *input,
-                                     const ICLTensor *input_to_input_weights, const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
-                                     const ICLTensor *recurrent_to_input_weights, const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
-                                     const ICLTensor *input_gate_bias, const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                                     ICLTensor *cell_state_in, const ICLTensor *output_state_in,
-                                     ICLTensor *cell_state_out, ICLTensor *output_state_out)
+void CLLSTMLayerQuantized::configure(const CLCompileContext &compile_context,
+                                     const ICLTensor        *input,
+                                     const ICLTensor        *input_to_input_weights,
+                                     const ICLTensor        *input_to_forget_weights,
+                                     const ICLTensor        *input_to_cell_weights,
+                                     const ICLTensor        *input_to_output_weights,
+                                     const ICLTensor        *recurrent_to_input_weights,
+                                     const ICLTensor        *recurrent_to_forget_weights,
+                                     const ICLTensor        *recurrent_to_cell_weights,
+                                     const ICLTensor        *recurrent_to_output_weights,
+                                     const ICLTensor        *input_gate_bias,
+                                     const ICLTensor        *forget_gate_bias,
+                                     const ICLTensor        *cell_bias,
+                                     const ICLTensor        *output_gate_bias,
+                                     ICLTensor              *cell_state_in,
+                                     const ICLTensor        *output_state_in,
+                                     ICLTensor              *cell_state_out,
+                                     ICLTensor              *output_state_out)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                 recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                 input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+                                 input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+                                 recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias,
+                                 forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+                                 cell_state_out, output_state_out);
 
-    ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
-                           recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+    ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+                           input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+                           recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias,
+                           cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
                            output_state_out);
 
-    ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
-                                                              input_to_output_weights->info(),
-                                                              recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
-                                                              input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(CLLSTMLayerQuantized::validate(
+        input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+        input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(),
+        recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(),
+        forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+        output_state_in->info(), cell_state_out->info(), output_state_out->info()));
 
     const int input_size  = input->info()->dimension(0);
     const int batch_size  = input->info()->dimension(1);
@@ -95,8 +176,10 @@
 
     const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization
 
-    auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
-    auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
+    auto_init_if_empty(*cell_state_out->info(),
+                       TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
+    auto_init_if_empty(*output_state_out->info(),
+                       TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
 
     _input_to_input_weights      = input_to_input_weights;
     _input_to_forget_weights     = input_to_forget_weights;
@@ -124,17 +207,20 @@
     recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
     recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
 
-    _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _input_weights.allocator()->init(
+        TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
     _concat_input_weights.configure(compile_context, inputs_weights_vector, &_input_weights, Window::DimY);
 
-    _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _recurrent_weights.allocator()->init(
+        TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
     _concat_recurrent_weights.configure(compile_context, recurrent_weights_vector, &_recurrent_weights, Window::DimY);
 
     std::vector<const ICLTensor *> weights_vector;
     weights_vector.emplace_back(&_recurrent_weights);
     weights_vector.emplace_back(&_input_weights);
 
-    _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _weights.allocator()->init(
+        TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
     _concat_weights.configure(compile_context, weights_vector, &_weights, Window::DimX);
     _transpose_weights.configure(compile_context, &_weights, &_weights_transposed);
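 
 The concatenations above fuse the per-gate matrices into a single weight tensor of shape (output_size + input_size, 4 * output_size); together with the fused [x_t ; h_{t-1}] input built just below, one gemmlowp multiply then yields all four gate pre-activations at once, to be sliced back apart later (sketch of the intent, usual naming):
 
     \[
     \bigl[z_i \;\big|\; z_f \;\big|\; z_g \;\big|\; z_o\bigr]
     \;=\; \bigl[x_t \;;\; h_{t-1}\bigr]
     \begin{bmatrix} W \\ U \end{bmatrix} + b.
     \]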
 
@@ -144,7 +230,8 @@
     input_vector.emplace_back(output_state_in);
 
     _memory_group.manage(&_input);
-    _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
+    _input.allocator()->init(
+        TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
     _concat_inputs.configure(compile_context, input_vector, &_input, Window::DimX);
 
     // Bias concatenation
@@ -159,7 +246,8 @@
 
     // Invert the offset for gemmlowp
     _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
-    _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+    _weights_transposed.info()->set_quantization_info(
+        QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
 
     // Run gemmlowp
     _memory_group.manage(&_output_highp);
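 
 The offset inversion above ties back to the asymmetric-quantization identity the GEMM core evaluates (standard background, not from this patch): with real values r = s (q - z),
 
     \[
     \sum_k r^{(a)}_k r^{(b)}_k \;=\; s_a s_b \sum_k \bigl(q^{(a)}_k - z_a\bigr)\bigl(q^{(b)}_k - z_b\bigr),
     \]
 
 so the zero-points enter with a minus sign; presumably QuantizationInfo and the gemmlowp core use opposite sign conventions for the zero-point, hence the temporary -offset, restored right after the multiply.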
@@ -169,7 +257,8 @@
 
     // Set the offset back
     _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
-    _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+    _weights_transposed.info()->set_quantization_info(
+        QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
 
     // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12))
     _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3));
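 
 The multiplier comment above is the usual requantization step: the real-valued factor input_scale * weights_scale / output_scale is folded into an integer multiplier/shift pair for the gemmlowp output stage. A minimal sketch using the AsymmHelpers utility this file already includes (the helper name and parameters here are assumptions of the sketch, not library API beyond calculate_quantized_multiplier):
 
     #include <cstdint>
 
     #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 
     // Illustrative only: input_scale, weights_scale and output_scale stand for
     // the uniform scales of the fused input, the weights and the 16-bit output.
     void requantization_params(
         float input_scale, float weights_scale, float output_scale,
         int32_t *output_multiplier, int32_t *output_shift)
     {
         const float multiplier = input_scale * weights_scale / output_scale;
         arm_compute::quantization::calculate_quantized_multiplier(multiplier, output_multiplier, output_shift);
     }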
@@ -191,85 +280,111 @@
     _bias.allocator()->allocate();
 
     // Get the gate tensors
-    if(batch_size > 1)
+    if (batch_size > 1)
     {
         _memory_group.manage(&_input_gate_input);
-        _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
+        _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0, 0},
+                                      {output_size, batch_size});
         _memory_group.manage(&_forget_gate_input);
-        _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
+        _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size, 0},
+                                       {2 * output_size, batch_size});
         _memory_group.manage(&_input_modulation_gate_input);
-        _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
+        _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input,
+                                     {2 * output_size, 0}, {3 * output_size, batch_size});
         _memory_group.manage(&_output_gate_input);
-        _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
+        _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size, 0},
+                                       {4 * output_size, batch_size});
         _output_lowp.allocator()->allocate();
     }
     else
     {
         _memory_group.manage(&_input_gate_input);
-        _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, { 0 }, { output_size });
+        _slice_input_tensor.configure(compile_context, &_output_lowp, &_input_gate_input, {0}, {output_size});
         _memory_group.manage(&_forget_gate_input);
-        _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size });
+        _slice_forget_tensor.configure(compile_context, &_output_lowp, &_forget_gate_input, {output_size},
+                                       {2 * output_size});
         _memory_group.manage(&_input_modulation_gate_input);
-        _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size });
+        _slice_cell_tensor.configure(compile_context, &_output_lowp, &_input_modulation_gate_input, {2 * output_size},
+                                     {3 * output_size});
         _memory_group.manage(&_output_gate_input);
-        _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size });
+        _slice_output_tensor.configure(compile_context, &_output_lowp, &_output_gate_input, {3 * output_size},
+                                       {4 * output_size});
         _output_lowp.allocator()->allocate();
     }
 
     // Forget gate
     _memory_group.manage(&_forget_gate_output);
-    _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _forget_gate_output.allocator()->init(
+        TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_forget_gate.configure(compile_context, &_forget_gate_input, &_forget_gate_output,
+                                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     _forget_gate_input.allocator()->allocate();
 
     // Input gate
     _memory_group.manage(&_input_gate_output);
-    _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _input_gate_output.allocator()->init(
+        TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_input_gate.configure(compile_context, &_input_gate_input, &_input_gate_output,
+                                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     _input_gate_input.allocator()->allocate();
 
     // Input modulation gate equation
     _memory_group.manage(&_input_modulation_gate_output);
-    _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+    _input_modulation_gate_output.allocator()->init(
+        TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _tanh_modulation_gate.configure(compile_context, &_input_modulation_gate_input, &_input_modulation_gate_output,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
     _input_modulation_gate_input.allocator()->allocate();
 
     // Output gate
     _memory_group.manage(&_output_gate_output);
-    _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _output_gate_output.allocator()->init(
+        TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_output_gate.configure(compile_context, &_output_gate_input, &_output_gate_output,
+                                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     _output_gate_input.allocator()->allocate();
 
     // Long term memory
     _memory_group.manage(&_cell_state_tmp1);
-    _cell_state_tmp1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
-    _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _cell_state_tmp1.allocator()->init(
+        TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+    _mul_forget_gate_cell_state.configure(compile_context, &_forget_gate_output, cell_state_in, &_cell_state_tmp1, 1,
+                                          ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _forget_gate_output.allocator()->allocate();
 
     _memory_group.manage(&_cell_state_tmp2);
-    _cell_state_tmp2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
-    _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output, &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _cell_state_tmp2.allocator()->init(
+        TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+    _mul_input_gate_input_mod_gate.configure(compile_context, &_input_gate_output, &_input_modulation_gate_output,
+                                             &_cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _input_modulation_gate_output.allocator()->allocate();
     _input_gate_output.allocator()->allocate();
 
-    _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE);
+    _add_cell_state_tmps.configure(compile_context, &_cell_state_tmp1, &_cell_state_tmp2, cell_state_out,
+                                   ConvertPolicy::SATURATE);
     _cell_state_tmp1.allocator()->allocate();
     _cell_state_tmp2.allocator()->allocate();
 
     // Short term memory
     _memory_group.manage(&_output_state_tmp);
-    _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+    _output_state_tmp.allocator()->init(
+        TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _tanh_output_state.configure(compile_context, cell_state_out, &_output_state_tmp,
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
 
     _memory_group.manage(&_output_state_out_symm);
-    _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _output_state_out_symm.allocator()->init(
+        TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _mul_output_state_tmp_output_gate.configure(compile_context, &_output_state_tmp, &_output_gate_output,
+                                                &_output_state_out_symm, 1, ConvertPolicy::SATURATE,
+                                                RoundingPolicy::TO_ZERO);
     _output_gate_output.allocator()->allocate();
     _output_state_tmp.allocator()->allocate();
 
     // Requantize the output state from QSYMM16 to QASYMM8
     _memory_group.manage(&_output_state_out_f32);
-    _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
+    _output_state_out_f32.allocator()->init(
+        TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
     _dequantize.configure(compile_context, &_output_state_out_symm, &_output_state_out_f32);
     _output_state_out_symm.allocator()->allocate();
 
@@ -278,15 +393,28 @@
 }
 
 Status CLLSTMLayerQuantized::validate(const ITensorInfo *input,
-                                      const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                                      const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                                      const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                                      const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
-                                      const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out)
+                                      const ITensorInfo *input_to_input_weights,
+                                      const ITensorInfo *input_to_forget_weights,
+                                      const ITensorInfo *input_to_cell_weights,
+                                      const ITensorInfo *input_to_output_weights,
+                                      const ITensorInfo *recurrent_to_input_weights,
+                                      const ITensorInfo *recurrent_to_forget_weights,
+                                      const ITensorInfo *recurrent_to_cell_weights,
+                                      const ITensorInfo *recurrent_to_output_weights,
+                                      const ITensorInfo *input_gate_bias,
+                                      const ITensorInfo *forget_gate_bias,
+                                      const ITensorInfo *cell_bias,
+                                      const ITensorInfo *output_gate_bias,
+                                      const ITensorInfo *cell_state_in,
+                                      const ITensorInfo *output_state_in,
+                                      const ITensorInfo *cell_state_out,
+                                      const ITensorInfo *output_state_out)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
-                                        recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
-                                        output_state_in, cell_state_out, output_state_out);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+        input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+        recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+        input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+        output_state_out);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::QASYMM8);
 
     const int input_size  = input->dimension(0);
@@ -299,29 +427,51 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1);
     ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
 
-    TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8));
-    TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8));
-    TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
-    TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm));
-    TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4));
+    TensorInfo input_weights_info(input_to_input_weights->clone()
+                                      ->set_tensor_shape(TensorShape(input_size, output_size))
+                                      .set_data_type(DataType::QASYMM8));
+    TensorInfo recurrent_weights_info(input_to_input_weights->clone()
+                                          ->set_tensor_shape(TensorShape(output_size, output_size))
+                                          .set_data_type(DataType::QASYMM8));
+    TensorInfo bias_info(
+        input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
+    TensorInfo output_state_info(cell_state_in->clone()
+                                     ->set_tensor_shape(TensorShape(output_size, batch_size))
+                                     .set_data_type(DataType::QASYMM8)
+                                     .set_quantization_info(qasymm));
+    TensorInfo cell_state_info(cell_state_in->clone()
+                                   ->set_tensor_shape(TensorShape(output_size, batch_size))
+                                   .set_data_type(DataType::QSYMM16)
+                                   .set_quantization_info(qsymm_4));
 
     // Shape checks
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights,
+                                                   input_to_cell_weights, input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights,
+                                                   recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                                   recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+                                                   output_gate_bias);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in);
 
     // Data type checks
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights,
+                                                       input_to_forget_weights, input_to_cell_weights,
+                                                       input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&recurrent_weights_info, recurrent_to_input_weights,
+                                                       recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                                       recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+                                                       output_gate_bias);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in);
 
     // Quantization checks
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input_to_input_weights, input_to_forget_weights,
+                                                              input_to_cell_weights, input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights,
+                                                              recurrent_to_cell_weights, recurrent_to_output_weights);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in);
 
@@ -343,7 +493,8 @@
     recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
     recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
     const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
 
     // _concat_weights
     std::vector<const ITensorInfo *> weights_vector;
@@ -353,7 +504,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLConcatenateLayer::validate(weights_vector, &weights, Window::DimX));
     // _transpose_weights
     const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]);
-    TensorInfo        weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
+    TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(&weights, &weights_transposed));
 
     // _concat_inputs
@@ -379,7 +530,8 @@
 
     // _gemmlowp
     const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
 
     // Set the offset back
     input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
@@ -390,7 +542,8 @@
     const float multiplier        = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
     int         output_multiplier = 0;
     int         output_shift      = 0;
-    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
 
     // _output_stage
     GEMMLowpOutputStageInfo info{};
@@ -405,68 +558,91 @@
     TensorInfo input_modulation_gate_input;
     TensorInfo output_gate_input;
 
-    if(batch_size > 1)
+    if (batch_size > 1)
     {
         // _slice_input_tensor
         input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size}));
         // _slice_forget_tensor
         forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size}));
         // _slice_cell_tensor
         input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0},
+                                                      {3 * output_size, batch_size}));
         // _slice_output_tensor
         output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size}));
     }
     else
     {
         // _slice_input_tensor
         input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_gate_input, {0}, {output_size}));
         // _slice_forget_tensor
         forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size}));
         // _slice_cell_tensor
         input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size}));
         // _slice_output_tensor
         output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLSlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLSlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size}));
     }
 
     // _sigmoid_forget_gate
     const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&forget_gate_input, &forget_gate_output,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     // _sigmoid_input_gate
     const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+        &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     // _tanh_modulation_gate
-    const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+    const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16,
+                                                  qsymm_0);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
     // _sigmoid_output_gate
     const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&output_gate_input, &output_gate_output,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // _mul_forget_gate_cell_state
     const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
 
     // _mul_input_gate_input_mod_gate
     const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output,
+                                                                    &cell_state_tmp2, 1, ConvertPolicy::SATURATE,
+                                                                    RoundingPolicy::TO_ZERO));
 
     // _add_cell_state_tmps
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
 
     // _tanh_output_state
     const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(cell_state_out, &output_state_tmp,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
 
     // _mul_output_state_tmp_output_gate
     const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output,
+                                                                    &output_state_out_symm, 1, ConvertPolicy::SATURATE,
+                                                                    RoundingPolicy::TO_ZERO));
 
     // _dequantize
     const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32);
@@ -475,14 +651,14 @@
     // _quantize
     ARM_COMPUTE_RETURN_ON_ERROR(CLQuantizationLayer::validate(&output_state_out_f32, output_state_out));
 
-    if(cell_state_out->total_size() != 0)
+    if (cell_state_out->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out);
     }
 
-    if(output_state_out->total_size() != 0)
+    if (output_state_out->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out);
@@ -541,7 +717,7 @@
 
 void CLLSTMLayerQuantized::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         _input_weights.allocator()->allocate();
         _concat_input_weights.run();
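Note on the reflow above: long configure()/validate() prototypes are broken one parameter per line with the parameter names column-aligned, while call sites pack as many arguments per line as fit under the opening parenthesis. The configuration file itself is not included, so the following .clang-format sketch is only an inference from the changes visible in this diff — every value here is an assumption, not the repository's actual settings:

    Language: Cpp
    ColumnLimit: 120
    BreakBeforeBraces: Allman                 # assumed: braces on their own line, as seen throughout
    AlignAfterOpenBracket: Align              # continuations align under the open parenthesis
    BinPackParameters: false                  # declarations: one parameter per line when they do not fit
    BinPackArguments: true                    # call sites: pack arguments, wrapping only at the column limit
    AlignConsecutiveAssignments: Consecutive  # produces the aligned '= default;' runs
    AlignConsecutiveDeclarations: Consecutive # produces the aligned parameter/member columns
    PointerAlignment: Right
    Cpp11BracedListStyle: true                # yields {0, 0} rather than { 0, 0 }
    SpaceBeforeParens: ControlStatements      # 'if (' and 'switch (' gain a space; calls do not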
diff --git a/src/runtime/CL/functions/CLLogicalAnd.cpp b/src/runtime/CL/functions/CLLogicalAnd.cpp
index 696191c..ea21c54 100644
--- a/src/runtime/CL/functions/CLLogicalAnd.cpp
+++ b/src/runtime/CL/functions/CLLogicalAnd.cpp
@@ -22,10 +22,11 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLLogicalAnd.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
 #include <utility>
 
@@ -33,7 +34,10 @@
 {
 namespace experimental
 {
-void CLLogicalAnd::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+void CLLogicalAnd::configure(const CLCompileContext &compile_context,
+                             ITensorInfo            *input1,
+                             ITensorInfo            *input2,
+                             ITensorInfo            *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
     auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>();
@@ -54,17 +58,16 @@
 
 struct CLLogicalAnd::Impl
 {
-    const ICLTensor                            *src0{ nullptr };
-    const ICLTensor                            *src1{ nullptr };
-    ICLTensor                                  *dst{ nullptr };
-    std::unique_ptr<experimental::CLLogicalAnd> op{ nullptr };
+    const ICLTensor                            *src0{nullptr};
+    const ICLTensor                            *src1{nullptr};
+    ICLTensor                                  *dst{nullptr};
+    std::unique_ptr<experimental::CLLogicalAnd> op{nullptr};
 };
 
-CLLogicalAnd::CLLogicalAnd()
-    : _impl(std::make_unique<Impl>())
+CLLogicalAnd::CLLogicalAnd() : _impl(std::make_unique<Impl>())
 {
 }
-CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&) = default;
+CLLogicalAnd::CLLogicalAnd(CLLogicalAnd &&)            = default;
 CLLogicalAnd &CLLogicalAnd::operator=(CLLogicalAnd &&) = default;
 CLLogicalAnd::~CLLogicalAnd()                          = default;
 
@@ -73,7 +76,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
 }
 
-void CLLogicalAnd::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLLogicalAnd::configure(const CLCompileContext &compile_context,
+                             ICLTensor              *input1,
+                             ICLTensor              *input2,
+                             ICLTensor              *output)
 {
     _impl->src0 = input1;
     _impl->src1 = input2;
diff --git a/src/runtime/CL/functions/CLLogicalNot.cpp b/src/runtime/CL/functions/CLLogicalNot.cpp
index a0504d7..71f9cce 100644
--- a/src/runtime/CL/functions/CLLogicalNot.cpp
+++ b/src/runtime/CL/functions/CLLogicalNot.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClLogicalNot.h"
 
@@ -32,16 +33,15 @@
 {
 struct CLLogicalNot::Impl
 {
-    const ICLTensor                      *src{ nullptr };
-    ICLTensor                            *dst{ nullptr };
-    std::unique_ptr<opencl::ClLogicalNot> op{ nullptr };
+    const ICLTensor                      *src{nullptr};
+    ICLTensor                            *dst{nullptr};
+    std::unique_ptr<opencl::ClLogicalNot> op{nullptr};
 };
 
-CLLogicalNot::CLLogicalNot()
-    : _impl(std::make_unique<Impl>())
+CLLogicalNot::CLLogicalNot() : _impl(std::make_unique<Impl>())
 {
 }
-CLLogicalNot::CLLogicalNot(CLLogicalNot &&) = default;
+CLLogicalNot::CLLogicalNot(CLLogicalNot &&)            = default;
 CLLogicalNot &CLLogicalNot::operator=(CLLogicalNot &&) = default;
 CLLogicalNot::~CLLogicalNot()                          = default;
 
@@ -72,4 +72,4 @@
     _impl->op->run(pack);
 }
 
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
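The include reshuffling in these files (the "src/..." headers collected into one sorted block after the "arm_compute/..." headers, system headers last) is consistent with clang-format's include regrouping. A hedged guess at the relevant options — the category regexes below are illustrative assumptions inferred from the resulting order, not the project's actual rules:

    SortIncludes: CaseSensitive
    IncludeBlocks: Regroup
    IncludeCategories:
      - Regex: '^"arm_compute/'   # public API headers first (after the file's own header)
        Priority: 1
      - Regex: '^"src/'           # internal headers next, sorted as a single block
        Priority: 2
      - Regex: '^<'               # system/STL headers last
        Priority: 3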
diff --git a/src/runtime/CL/functions/CLLogicalOr.cpp b/src/runtime/CL/functions/CLLogicalOr.cpp
index f9a606e..3db4fda 100644
--- a/src/runtime/CL/functions/CLLogicalOr.cpp
+++ b/src/runtime/CL/functions/CLLogicalOr.cpp
@@ -22,10 +22,11 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLLogicalOr.h"
+
 #include "arm_compute/core/CL/ICLTensor.h"
-#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/gpu/cl/kernels/ClElementwiseKernel.h"
 
 #include <utility>
 
@@ -33,7 +34,10 @@
 {
 namespace experimental
 {
-void CLLogicalOr::configure(const CLCompileContext &compile_context, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output)
+void CLLogicalOr::configure(const CLCompileContext &compile_context,
+                            ITensorInfo            *input1,
+                            ITensorInfo            *input2,
+                            ITensorInfo            *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output);
     auto k = std::make_unique<arm_compute::opencl::kernels::ClLogicalBinaryKernel>();
@@ -54,17 +58,16 @@
 
 struct CLLogicalOr::Impl
 {
-    const ICLTensor                           *src0{ nullptr };
-    const ICLTensor                           *src1{ nullptr };
-    ICLTensor                                 *dst{ nullptr };
-    std::unique_ptr<experimental::CLLogicalOr> op{ nullptr };
+    const ICLTensor                           *src0{nullptr};
+    const ICLTensor                           *src1{nullptr};
+    ICLTensor                                 *dst{nullptr};
+    std::unique_ptr<experimental::CLLogicalOr> op{nullptr};
 };
 
-CLLogicalOr::CLLogicalOr()
-    : _impl(std::make_unique<Impl>())
+CLLogicalOr::CLLogicalOr() : _impl(std::make_unique<Impl>())
 {
 }
-CLLogicalOr::CLLogicalOr(CLLogicalOr &&) = default;
+CLLogicalOr::CLLogicalOr(CLLogicalOr &&)            = default;
 CLLogicalOr &CLLogicalOr::operator=(CLLogicalOr &&) = default;
 CLLogicalOr::~CLLogicalOr()                         = default;
 
@@ -73,7 +76,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output);
 }
 
-void CLLogicalOr::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output)
+void CLLogicalOr::configure(const CLCompileContext &compile_context,
+                            ICLTensor              *input1,
+                            ICLTensor              *input2,
+                            ICLTensor              *output)
 {
     _impl->src0 = input1;
     _impl->src1 = input2;
diff --git a/src/runtime/CL/functions/CLMatMul.cpp b/src/runtime/CL/functions/CLMatMul.cpp
index bef422f..e8bdad7 100644
--- a/src/runtime/CL/functions/CLMatMul.cpp
+++ b/src/runtime/CL/functions/CLMatMul.cpp
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLMatMul.h"
+
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/CL/CLTypes.h"
+
 #include "src/gpu/cl/operators/ClMatMul.h"
 
 namespace arm_compute
@@ -32,23 +34,32 @@
 
 struct CLMatMul::Impl
 {
-    std::unique_ptr<OperatorType> op{ nullptr };
+    std::unique_ptr<OperatorType> op{nullptr};
     ITensorPack                   run_pack{};
 };
-CLMatMul::CLMatMul()
-    : _impl(std::make_unique<Impl>())
+CLMatMul::CLMatMul() : _impl(std::make_unique<Impl>())
 {
 }
 
 CLMatMul::~CLMatMul() = default;
 
-void CLMatMul::configure(ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+void CLMatMul::configure(ICLTensor                 *lhs,
+                         ICLTensor                 *rhs,
+                         ICLTensor                 *output,
+                         const MatMulInfo          &matmul_info,
+                         const GpuMatMulSettings   &settings,
+                         const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(settings);
     configure(CLKernelLibrary::get().get_compile_context(), lhs, rhs, output, matmul_info, settings, act_info);
 }
 
-void CLMatMul::configure(const CLCompileContext &compile_context, ICLTensor *lhs, ICLTensor *rhs, ICLTensor *output, const MatMulInfo &matmul_info, const GpuMatMulSettings &settings,
+void CLMatMul::configure(const CLCompileContext    &compile_context,
+                         ICLTensor                 *lhs,
+                         ICLTensor                 *rhs,
+                         ICLTensor                 *output,
+                         const MatMulInfo          &matmul_info,
+                         const GpuMatMulSettings   &settings,
                          const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, output);
@@ -56,10 +67,14 @@
 
     _impl->op = std::make_unique<OperatorType>();
     _impl->op->configure(compile_context, lhs->info(), rhs->info(), output->info(), matmul_info, act_info);
-    _impl->run_pack = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } };
+    _impl->run_pack = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}};
 }
 
-Status CLMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info)
+Status CLMatMul::validate(const ITensorInfo         *lhs,
+                          const ITensorInfo         *rhs,
+                          const ITensorInfo         *output,
+                          const MatMulInfo          &matmul_info,
+                          const ActivationLayerInfo &act_info)
 {
     return OperatorType::validate(lhs, rhs, output, matmul_info, act_info);
 }
diff --git a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
index 2786d32..7494f37 100644
--- a/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLMaxUnpoolingLayer.cpp
@@ -27,26 +27,32 @@
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLMaxUnpoolingLayerKernel.h"
 
 namespace arm_compute
 {
 CLMaxUnpoolingLayer::CLMaxUnpoolingLayer()
-    : _fill(),
-      _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>())
+    : _fill(), _unpooling_layer_kernel(std::make_unique<CLMaxUnpoolingLayerKernel>())
 {
 }
 
 CLMaxUnpoolingLayer::~CLMaxUnpoolingLayer() = default;
 
-void CLMaxUnpoolingLayer::configure(ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayer::configure(ICLTensor              *input,
+                                    ICLTensor              *indices,
+                                    ICLTensor              *output,
+                                    const PoolingLayerInfo &pool_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, indices, output, pool_info);
 }
 
-void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *indices, ICLTensor *output, const PoolingLayerInfo &pool_info)
+void CLMaxUnpoolingLayer::configure(const CLCompileContext &compile_context,
+                                    ICLTensor              *input,
+                                    ICLTensor              *indices,
+                                    ICLTensor              *output,
+                                    const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info);
     const PixelValue zero_value(0.f);
@@ -55,7 +61,10 @@
     _unpooling_layer_kernel->configure(compile_context, input, indices, output, pool_info);
 }
 
-Status CLMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status CLMaxUnpoolingLayer::validate(const ITensorInfo      *input,
+                                     const ITensorInfo      *indices,
+                                     const ITensorInfo      *output,
+                                     const PoolingLayerInfo &pool_info)
 {
     return CLMaxUnpoolingLayerKernel::validate(input, indices, output, pool_info);
 }
diff --git a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
index a81cbca..5892c0e 100644
--- a/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLMeanStdDevNormalizationLayer.cpp
@@ -24,9 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLMeanStdDevNormalizationLayer.h"
 
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLMeanStdDevNormalizationKernel.h"
 
 namespace arm_compute
 {
@@ -35,7 +35,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, epsilon);
 }
 
-void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, float epsilon)
+void CLMeanStdDevNormalizationLayer::configure(const CLCompileContext &compile_context,
+                                               ICLTensor              *input,
+                                               ICLTensor              *output,
+                                               float                   epsilon)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, epsilon);
     auto k = std::make_unique<CLMeanStdDevNormalizationKernel>();
diff --git a/src/runtime/CL/functions/CLNormalizationLayer.cpp b/src/runtime/CL/functions/CLNormalizationLayer.cpp
index c0cc518..f93f82f 100644
--- a/src/runtime/CL/functions/CLNormalizationLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizationLayer.cpp
@@ -30,10 +30,10 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
-#include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/CL/kernels/CLNormalizationLayerKernel.h"
 
 namespace arm_compute
 {
@@ -50,7 +50,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, norm_info);
 }
 
-void CLNormalizationLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const NormalizationLayerInfo &norm_info)
+void CLNormalizationLayer::configure(const CLCompileContext       &compile_context,
+                                     ICLTensor                    *input,
+                                     ICLTensor                    *output,
+                                     const NormalizationLayerInfo &norm_info)
 {
     ARM_COMPUTE_ERROR_ON(input == nullptr);
     ARM_COMPUTE_LOG_PARAMS(input, output, norm_info);
@@ -58,21 +61,24 @@
     // Configure normalization kernel
     _norm_kernel->configure(compile_context, input, output, norm_info);
 
-    if(!_norm_kernel->border_size().empty())
+    if (!_norm_kernel->border_size().empty())
     {
         // Fill the border by 3 elements since we need vload4 in the IN_MAP normalization kernel
-        _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT, PixelValue());
+        _border_handler->configure(compile_context, input, _norm_kernel->border_size(), BorderMode::CONSTANT,
+                                   PixelValue());
     }
 }
 
-Status CLNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status CLNormalizationLayer::validate(const ITensorInfo            *input,
+                                      const ITensorInfo            *output,
+                                      const NormalizationLayerInfo &norm_info)
 {
     return CLNormalizationLayerKernel::validate(input, output, norm_info);
 }
 
 void CLNormalizationLayer::run()
 {
-    if(!_norm_kernel->border_size().empty())
+    if (!_norm_kernel->border_size().empty())
     {
         // Run border handler
         CLScheduler::get().enqueue(*_border_handler, false);
diff --git a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
index 63c9164..939c95b 100644
--- a/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
+++ b/src/runtime/CL/functions/CLNormalizePlanarYUVLayer.cpp
@@ -24,20 +24,26 @@
 
 #include "arm_compute/runtime/CL/functions/CLNormalizePlanarYUVLayer.h"
 
-#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLNormalizePlanarYUVLayerKernel.h"
 
 #include <utility>
 
 namespace arm_compute
 {
-void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayer::configure(const ICLTensor *input,
+                                          ICLTensor       *output,
+                                          const ICLTensor *mean,
+                                          const ICLTensor *std)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, mean, std);
 }
 
-void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *mean, const ICLTensor *std)
+void CLNormalizePlanarYUVLayer::configure(const CLCompileContext &compile_context,
+                                          const ICLTensor        *input,
+                                          ICLTensor              *output,
+                                          const ICLTensor        *mean,
+                                          const ICLTensor        *std)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, mean, std);
     auto k = std::make_unique<CLNormalizePlanarYUVLayerKernel>();
@@ -45,8 +51,10 @@
     _kernel = std::move(k);
 }
 
-Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                           const ITensorInfo *mean, const ITensorInfo *std)
+Status CLNormalizePlanarYUVLayer::validate(const ITensorInfo *input,
+                                           const ITensorInfo *output,
+                                           const ITensorInfo *mean,
+                                           const ITensorInfo *std)
 {
     return CLNormalizePlanarYUVLayerKernel::validate(input, output, mean, std);
 }
diff --git a/src/runtime/CL/functions/CLPReluLayer.cpp b/src/runtime/CL/functions/CLPReluLayer.cpp
index 186e7b4..ce6d285 100644
--- a/src/runtime/CL/functions/CLPReluLayer.cpp
+++ b/src/runtime/CL/functions/CLPReluLayer.cpp
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLPReluLayer.h"
+
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/gpu/cl/IClKernel.h"
 #include "src/gpu/cl/operators/ClPRelu.h"
 
@@ -33,17 +35,16 @@
 
 struct CLPReluLayer::Impl
 {
-    const ICLTensor              *src_0{ nullptr };
-    const ICLTensor              *src_1{ nullptr };
-    ICLTensor                    *dst{ nullptr };
-    std::unique_ptr<OperatorType> op{ nullptr };
+    const ICLTensor              *src_0{nullptr};
+    const ICLTensor              *src_1{nullptr};
+    ICLTensor                    *dst{nullptr};
+    std::unique_ptr<OperatorType> op{nullptr};
 };
 
-CLPReluLayer::CLPReluLayer()
-    : _impl(std::make_unique<Impl>())
+CLPReluLayer::CLPReluLayer() : _impl(std::make_unique<Impl>())
 {
 }
-CLPReluLayer::CLPReluLayer(CLPReluLayer &&) = default;
+CLPReluLayer::CLPReluLayer(CLPReluLayer &&)            = default;
 CLPReluLayer &CLPReluLayer::operator=(CLPReluLayer &&) = default;
 CLPReluLayer::~CLPReluLayer()                          = default;
 
@@ -52,13 +53,17 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, alpha, output);
 }
 
-void CLPReluLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *alpha, ICLTensor *output)
+void CLPReluLayer::configure(const CLCompileContext &compile_context,
+                             ICLTensor              *input,
+                             ICLTensor              *alpha,
+                             ICLTensor              *output)
 {
     _impl->src_0 = input;
     _impl->src_1 = alpha;
     _impl->dst   = output;
     _impl->op    = std::make_unique<OperatorType>();
-    _impl->op->configure(compile_context, input->info(), alpha->info(), (output == nullptr ? input->info() : output->info()));
+    _impl->op->configure(compile_context, input->info(), alpha->info(),
+                         (output == nullptr ? input->info() : output->info()));
 }
 
 Status CLPReluLayer::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output)
diff --git a/src/runtime/CL/functions/CLPadLayer.cpp b/src/runtime/CL/functions/CLPadLayer.cpp
index 0ed8f03..e788ded 100644
--- a/src/runtime/CL/functions/CLPadLayer.cpp
+++ b/src/runtime/CL/functions/CLPadLayer.cpp
@@ -22,37 +22,38 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLPadLayer.h"
-#include "src/core/CL/kernels/CLPadLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLPadLayerKernel.h"
 
 namespace arm_compute
 {
-CLPadLayer::CLPadLayer()
-    : _pad_kernel(std::make_unique<CLPadLayerKernel>()),
-      _copy(),
-      _perform_pad(false)
+CLPadLayer::CLPadLayer() : _pad_kernel(std::make_unique<CLPadLayerKernel>()), _copy(), _perform_pad(false)
 {
 }
 
 CLPadLayer::~CLPadLayer() = default;
 
-void CLPadLayer::configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayer::configure(
+    ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, padding, constant_value, mode);
 }
 
-void CLPadLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+void CLPadLayer::configure(const CLCompileContext &compile_context,
+                           ICLTensor              *input,
+                           ICLTensor              *output,
+                           const PaddingList      &padding,
+                           PixelValue              constant_value,
+                           PaddingMode             mode)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
     ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode);
 
-    _perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
-    {
-        return info.first > 0 || info.second > 0;
-    });
+    _perform_pad =
+        std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
 
-    if(_perform_pad)
+    if (_perform_pad)
     {
         _pad_kernel->configure(compile_context, input, output, padding, constant_value, mode);
     }
@@ -62,14 +63,16 @@
         _copy.configure(compile_context, input, output);
     }
 }
-Status CLPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, PixelValue constant_value, PaddingMode mode)
+Status CLPadLayer::validate(const ITensorInfo *input,
+                            const ITensorInfo *output,
+                            const PaddingList &padding,
+                            PixelValue         constant_value,
+                            PaddingMode        mode)
 {
-    bool perform_pad = std::any_of(padding.begin(), padding.end(), [](PaddingInfo info)
-    {
-        return info.first > 0 || info.second > 0;
-    });
+    bool perform_pad =
+        std::any_of(padding.begin(), padding.end(), [](PaddingInfo info) { return info.first > 0 || info.second > 0; });
 
-    if(perform_pad)
+    if (perform_pad)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLPadLayerKernel::validate(input, output, padding, constant_value, mode));
     }
@@ -81,7 +84,7 @@
 }
 void CLPadLayer::run()
 {
-    if(_perform_pad)
+    if (_perform_pad)
     {
         CLScheduler::get().enqueue(*_pad_kernel);
     }
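In CLPadLayer the multi-line std::any_of predicate lambdas were collapsed onto a single line, while empty function bodies still expand across two lines. That pattern would follow from something like the options below — again inferred values, not confirmed against the real configuration:

    AllowShortLambdasOnASingleLine: All       # one-statement lambdas stay inline
    AllowShortFunctionsOnASingleLine: None    # even empty bodies keep '{' and '}' on separate lines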
diff --git a/src/runtime/CL/functions/CLPermute.cpp b/src/runtime/CL/functions/CLPermute.cpp
index a56afff..7f97eed 100644
--- a/src/runtime/CL/functions/CLPermute.cpp
+++ b/src/runtime/CL/functions/CLPermute.cpp
@@ -27,22 +27,21 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/ICLKernel.h"
-#include "src/gpu/cl/operators/ClPermute.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/ICLKernel.h"
+#include "src/gpu/cl/operators/ClPermute.h"
 
 namespace arm_compute
 {
 struct CLPermute::Impl
 {
-    const ICLTensor                   *src{ nullptr };
-    ICLTensor                         *dst{ nullptr };
-    std::unique_ptr<opencl::ClPermute> op{ nullptr };
+    const ICLTensor                   *src{nullptr};
+    ICLTensor                         *dst{nullptr};
+    std::unique_ptr<opencl::ClPermute> op{nullptr};
 };
 
-CLPermute::CLPermute()
-    : _impl(std::make_unique<Impl>())
+CLPermute::CLPermute() : _impl(std::make_unique<Impl>())
 {
 }
 
@@ -53,7 +52,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, perm);
 }
 
-void CLPermute::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
+void CLPermute::configure(const CLCompileContext  &compile_context,
+                          const ICLTensor         *input,
+                          ICLTensor               *output,
+                          const PermutationVector &perm)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, output, perm);
diff --git a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
index 9d91e58..6aa9d9c 100644
--- a/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
+++ b/src/runtime/CL/functions/CLPixelWiseMultiplication.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClMul.h"
 
@@ -34,38 +35,55 @@
 {
 struct CLPixelWiseMultiplication::Impl
 {
-    const ICLTensor               *src_0{ nullptr };
-    const ICLTensor               *src_1{ nullptr };
-    ICLTensor                     *dst{ nullptr };
-    std::unique_ptr<opencl::ClMul> op{ nullptr };
+    const ICLTensor               *src_0{nullptr};
+    const ICLTensor               *src_1{nullptr};
+    ICLTensor                     *dst{nullptr};
+    std::unique_ptr<opencl::ClMul> op{nullptr};
 };
 
-CLPixelWiseMultiplication::CLPixelWiseMultiplication()
-    : _impl(std::make_unique<Impl>())
+CLPixelWiseMultiplication::CLPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
 {
 }
-CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&) = default;
+CLPixelWiseMultiplication::CLPixelWiseMultiplication(CLPixelWiseMultiplication &&)            = default;
 CLPixelWiseMultiplication &CLPixelWiseMultiplication::operator=(CLPixelWiseMultiplication &&) = default;
 CLPixelWiseMultiplication::~CLPixelWiseMultiplication()                                       = default;
 
-void CLPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
-                                          ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void CLPixelWiseMultiplication::configure(ICLTensor                 *input1,
+                                          ICLTensor                 *input2,
+                                          ICLTensor                 *output,
+                                          float                      scale,
+                                          ConvertPolicy              overflow_policy,
+                                          RoundingPolicy             rounding_policy,
+                                          const ActivationLayerInfo &act_info)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
+    configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, scale, overflow_policy,
+              rounding_policy, act_info);
 }
 
-void CLPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, float scale,
-                                          ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+void CLPixelWiseMultiplication::configure(const CLCompileContext    &compile_context,
+                                          ICLTensor                 *input1,
+                                          ICLTensor                 *input2,
+                                          ICLTensor                 *output,
+                                          float                      scale,
+                                          ConvertPolicy              overflow_policy,
+                                          RoundingPolicy             rounding_policy,
+                                          const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
     _impl->dst   = output;
     _impl->op    = std::make_unique<opencl::ClMul>();
-    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info);
+    _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), scale, overflow_policy,
+                         rounding_policy, act_info);
 }
 
-Status CLPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale,
-                                           ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+Status CLPixelWiseMultiplication::validate(const ITensorInfo         *input1,
+                                           const ITensorInfo         *input2,
+                                           const ITensorInfo         *output,
+                                           float                      scale,
+                                           ConvertPolicy              overflow_policy,
+                                           RoundingPolicy             rounding_policy,
+                                           const ActivationLayerInfo &act_info)
 {
     return opencl::ClMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
 }
@@ -82,26 +100,33 @@
 
 struct CLComplexPixelWiseMultiplication::Impl
 {
-    const ICLTensor                      *src_0{ nullptr };
-    const ICLTensor                      *src_1{ nullptr };
-    ICLTensor                            *dst{ nullptr };
-    std::unique_ptr<opencl::ClComplexMul> op{ nullptr };
+    const ICLTensor                      *src_0{nullptr};
+    const ICLTensor                      *src_1{nullptr};
+    ICLTensor                            *dst{nullptr};
+    std::unique_ptr<opencl::ClComplexMul> op{nullptr};
 };
 
-CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication()
-    : _impl(std::make_unique<Impl>())
+CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
 {
 }
 CLComplexPixelWiseMultiplication::CLComplexPixelWiseMultiplication(CLComplexPixelWiseMultiplication &&) = default;
-CLComplexPixelWiseMultiplication &CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default;
-CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication()                                              = default;
+CLComplexPixelWiseMultiplication &
+CLComplexPixelWiseMultiplication::operator=(CLComplexPixelWiseMultiplication &&) = default;
+CLComplexPixelWiseMultiplication::~CLComplexPixelWiseMultiplication()            = default;
 
-void CLComplexPixelWiseMultiplication::configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLComplexPixelWiseMultiplication::configure(ICLTensor                 *input1,
+                                                 ICLTensor                 *input2,
+                                                 ICLTensor                 *output,
+                                                 const ActivationLayerInfo &act_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, act_info);
 }
 
-void CLComplexPixelWiseMultiplication::configure(const CLCompileContext &compile_context, ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info)
+void CLComplexPixelWiseMultiplication::configure(const CLCompileContext    &compile_context,
+                                                 ICLTensor                 *input1,
+                                                 ICLTensor                 *input2,
+                                                 ICLTensor                 *output,
+                                                 const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
@@ -110,7 +135,10 @@
     _impl->op->configure(compile_context, input1->info(), input2->info(), output->info(), act_info);
 }
 
-Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status CLComplexPixelWiseMultiplication::validate(const ITensorInfo         *input1,
+                                                  const ITensorInfo         *input2,
+                                                  const ITensorInfo         *output,
+                                                  const ActivationLayerInfo &act_info)
 {
     return opencl::ClComplexMul::validate(input1, input2, output, act_info);
 }
diff --git a/src/runtime/CL/functions/CLPooling3dLayer.cpp b/src/runtime/CL/functions/CLPooling3dLayer.cpp
index 11ae1d0..ce1092a 100644
--- a/src/runtime/CL/functions/CLPooling3dLayer.cpp
+++ b/src/runtime/CL/functions/CLPooling3dLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClPool3d.h"
 
@@ -32,14 +33,13 @@
 {
 struct CLPooling3dLayer::Impl
 {
-    const ICLTensor                  *src{ nullptr };
-    ICLTensor                        *dst{ nullptr };
-    ICLTensor                        *indices{ nullptr };
-    std::unique_ptr<opencl::ClPool3d> op{ nullptr };
+    const ICLTensor                  *src{nullptr};
+    ICLTensor                        *dst{nullptr};
+    ICLTensor                        *indices{nullptr};
+    std::unique_ptr<opencl::ClPool3d> op{nullptr};
 };
 
-CLPooling3dLayer::CLPooling3dLayer()
-    : _impl(std::make_unique<Impl>())
+CLPooling3dLayer::CLPooling3dLayer() : _impl(std::make_unique<Impl>())
 {
 }
 CLPooling3dLayer::~CLPooling3dLayer() = default;
@@ -49,7 +49,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info);
 }
 
-void CLPooling3dLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Pooling3dLayerInfo &pool_info)
+void CLPooling3dLayer::configure(const CLCompileContext   &compile_context,
+                                 const ICLTensor          *input,
+                                 ICLTensor                *output,
+                                 const Pooling3dLayerInfo &pool_info)
 {
     _impl->src = input;
     _impl->dst = output;
@@ -58,7 +61,8 @@
     _impl->op->configure(compile_context, input->info(), output->info(), pool_info);
 }
 
-Status CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
+Status
+CLPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
 {
     return opencl::ClPool3d::validate(input, output, pool_info);
 }
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 0ebce31..65e53b9 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClPool2d.h"
 
@@ -32,34 +33,44 @@
 {
 struct CLPoolingLayer::Impl
 {
-    const ICLTensor                  *src{ nullptr };
-    ICLTensor                        *dst{ nullptr };
-    ICLTensor                        *indices{ nullptr };
-    std::unique_ptr<opencl::ClPool2d> op{ nullptr };
+    const ICLTensor                  *src{nullptr};
+    ICLTensor                        *dst{nullptr};
+    ICLTensor                        *indices{nullptr};
+    std::unique_ptr<opencl::ClPool2d> op{nullptr};
 };
 
-CLPoolingLayer::CLPoolingLayer()
-    : _impl(std::make_unique<Impl>())
+CLPoolingLayer::CLPoolingLayer() : _impl(std::make_unique<Impl>())
 {
 }
 CLPoolingLayer::~CLPoolingLayer() = default;
 
-void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
+void CLPoolingLayer::configure(ICLTensor              *input,
+                               ICLTensor              *output,
+                               const PoolingLayerInfo &pool_info,
+                               ICLTensor              *indices)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, pool_info, indices);
 }
 
-void CLPoolingLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info, ICLTensor *indices)
+void CLPoolingLayer::configure(const CLCompileContext &compile_context,
+                               ICLTensor              *input,
+                               ICLTensor              *output,
+                               const PoolingLayerInfo &pool_info,
+                               ICLTensor              *indices)
 {
     _impl->src     = input;
     _impl->dst     = output;
     _impl->indices = indices;
 
     _impl->op = std::make_unique<opencl::ClPool2d>();
-    _impl->op->configure(compile_context, input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
+    _impl->op->configure(compile_context, input->info(), output->info(), pool_info,
+                         (indices) ? indices->info() : nullptr);
 }
 
-Status CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status CLPoolingLayer::validate(const ITensorInfo      *input,
+                                const ITensorInfo      *output,
+                                const PoolingLayerInfo &pool_info,
+                                const ITensorInfo      *indices)
 {
     return opencl::ClPool2d::validate(input, output, pool_info, indices);
 }
diff --git a/src/runtime/CL/functions/CLPriorBoxLayer.cpp b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
index 019f0a7..cfd0ec4 100644
--- a/src/runtime/CL/functions/CLPriorBoxLayer.cpp
+++ b/src/runtime/CL/functions/CLPriorBoxLayer.cpp
@@ -29,31 +29,40 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLPriorBoxLayerKernel.h"
 
-#include "src/common/utils/Log.h"
-
 using namespace arm_compute;
 
-CLPriorBoxLayer::CLPriorBoxLayer()
-    : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr)
+CLPriorBoxLayer::CLPriorBoxLayer() : _min(nullptr), _max(nullptr), _aspect_ratios(nullptr)
 {
 }
 
-void CLPriorBoxLayer::configure(const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+void CLPriorBoxLayer::configure(const ICLTensor         *input1,
+                                const ICLTensor         *input2,
+                                ICLTensor               *output,
+                                const PriorBoxLayerInfo &info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input1, input2, output, info);
 }
 
-void CLPriorBoxLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input1, const ICLTensor *input2, ICLTensor *output, const PriorBoxLayerInfo &info)
+void CLPriorBoxLayer::configure(const CLCompileContext  &compile_context,
+                                const ICLTensor         *input1,
+                                const ICLTensor         *input2,
+                                ICLTensor               *output,
+                                const PriorBoxLayerInfo &info)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info);
-    _min           = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.min_sizes().size() * sizeof(float));
-    _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.aspect_ratios().size() * sizeof(float));
-    if(!info.max_sizes().empty())
+    _min           = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+                                info.min_sizes().size() * sizeof(float));
+    _aspect_ratios = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+                                info.aspect_ratios().size() * sizeof(float));
+    if (!info.max_sizes().empty())
     {
-        _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE, info.max_sizes().size() * sizeof(float));
+        _max = cl::Buffer(CLScheduler::get().context(), CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_WRITE,
+                          info.max_sizes().size() * sizeof(float));
     }
 
     auto k = std::make_unique<CLPriorBoxLayerKernel>();
@@ -61,7 +70,10 @@
     _kernel = std::move(k);
 }
 
-Status CLPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status CLPriorBoxLayer::validate(const ITensorInfo       *input1,
+                                 const ITensorInfo       *input2,
+                                 const ITensorInfo       *output,
+                                 const PriorBoxLayerInfo &info)
 {
     return CLPriorBoxLayerKernel::validate(input1, input2, output, info);
 }
diff --git a/src/runtime/CL/functions/CLQLSTMLayer.cpp b/src/runtime/CL/functions/CLQLSTMLayer.cpp
index 7fbb866..12f6f89 100644
--- a/src/runtime/CL/functions/CLQLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLQLSTMLayer.cpp
@@ -26,29 +26,36 @@
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/QuantizationInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLQLSTMLayerNormalizationKernel.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 using namespace arm_compute::utils::info_helpers;
 using namespace arm_compute::opencl::kernels;
 namespace
 {
-Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias,
-                   float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info)
+Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info,
+                   const ITensorInfo       *mm_input,
+                   const ITensorInfo       *mm_weights,
+                   const ITensorInfo       *bias,
+                   float                    gemmlowp_scale,
+                   const TensorInfo        *mm_res_info,
+                   const TensorInfo        *outstage_tensor_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+        gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
     return Status{};
 }
 } // namespace
@@ -78,14 +85,12 @@
     _src->map(q, true);
     _dst->map(q, true);
 
-    Iterator input_iter{ _src, _window };
-    Iterator output_iter{ _dst, _window };
+    Iterator input_iter{_src, _window};
+    Iterator output_iter{_dst, _window};
 
-    execute_window_loop(_window, [&](const Coordinates &)
-    {
-        memcpy(output_iter.ptr(), input_iter.ptr(), _row_size);
-    },
-    input_iter, output_iter);
+    execute_window_loop(
+        _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter,
+        output_iter);
 
     _src->unmap(q);
     _dst->unmap(q);
@@ -104,7 +109,7 @@
       _layer_norms(),
       _copy_output()
 {
-    for(auto &norm : _layer_norms)
+    for (auto &norm : _layer_norms)
     {
         norm = std::make_unique<CLQLSTMLayerNormalizationKernel>();
     }
@@ -129,17 +134,22 @@
 {
     // Output quantization scale will be different, but ignored here
     // since it will be configured at configure() stage.
-    const TensorInfo out
-    {
-        in
-    };
+    const TensorInfo out{in};
     return CLQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
 }
 
-void CLQLSTMLayer::configure_mm(const CLCompileContext &compile_context, CLGEMMLowpMatrixMultiplyCore &mm, CLGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
-                                const ICLTensor *mm_input, const ICLTensor *mm_weights, const ICLTensor *bias,
-                                CLTensor *mm_res, CLTensor *outstage_res, float gemmlowp_scale,
-                                const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)
+void CLQLSTMLayer::configure_mm(const CLCompileContext       &compile_context,
+                                CLGEMMLowpMatrixMultiplyCore &mm,
+                                CLGEMMLowpOutputStage        &outstage,
+                                GEMMLowpOutputStageInfo      &gemmlowp_info,
+                                const ICLTensor              *mm_input,
+                                const ICLTensor              *mm_weights,
+                                const ICLTensor              *bias,
+                                CLTensor                     *mm_res,
+                                CLTensor                     *outstage_res,
+                                float                         gemmlowp_scale,
+                                const TensorInfo             &mm_res_info,
+                                const TensorInfo             &outstage_tensor_info)
 {
     _memory_group.manage(mm_res);
     _memory_group.manage(outstage_res);
@@ -151,30 +161,51 @@
     mm.configure(compile_context, mm_input, mm_weights, nullptr, mm_res);
 
     // Configure output stage
-    quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
+    quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                 &gemmlowp_info.gemmlowp_shift);
     outstage.configure(compile_context, mm_res, bias, outstage_res, gemmlowp_info);
     mm_res->allocator()->allocate();
 }
 
-void CLQLSTMLayer::configure(const ICLTensor *input,
-                             const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
-                             const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
-                             const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                             ICLTensor *cell_state_in, ICLTensor *output_state_in,
-                             ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+void CLQLSTMLayer::configure(const ICLTensor             *input,
+                             const ICLTensor             *input_to_forget_weights,
+                             const ICLTensor             *input_to_cell_weights,
+                             const ICLTensor             *input_to_output_weights,
+                             const ICLTensor             *recurrent_to_forget_weights,
+                             const ICLTensor             *recurrent_to_cell_weights,
+                             const ICLTensor             *recurrent_to_output_weights,
+                             const ICLTensor             *forget_gate_bias,
+                             const ICLTensor             *cell_bias,
+                             const ICLTensor             *output_gate_bias,
+                             ICLTensor                   *cell_state_in,
+                             ICLTensor                   *output_state_in,
+                             ICLTensor                   *cell_state_out,
+                             ICLTensor                   *output_state_out,
+                             ICLTensor                   *output,
                              const LSTMParams<ICLTensor> &lstm_params)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-              recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
-              cell_state_in, output_state_in, cell_state_out, output_state_out, output, lstm_params);
+    configure(CLKernelLibrary::get().get_compile_context(), input, input_to_forget_weights, input_to_cell_weights,
+              input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
+              recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
+              output_state_in, cell_state_out, output_state_out, output, lstm_params);
 }
 
-void CLQLSTMLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input,
-                             const ICLTensor *input_to_forget_weights, const ICLTensor *input_to_cell_weights, const ICLTensor *input_to_output_weights,
-                             const ICLTensor *recurrent_to_forget_weights, const ICLTensor *recurrent_to_cell_weights, const ICLTensor *recurrent_to_output_weights,
-                             const ICLTensor *forget_gate_bias, const ICLTensor *cell_bias, const ICLTensor *output_gate_bias,
-                             ICLTensor *cell_state_in, ICLTensor *output_state_in,
-                             ICLTensor *cell_state_out, ICLTensor *output_state_out, ICLTensor *output,
+void CLQLSTMLayer::configure(const CLCompileContext      &compile_context,
+                             const ICLTensor             *input,
+                             const ICLTensor             *input_to_forget_weights,
+                             const ICLTensor             *input_to_cell_weights,
+                             const ICLTensor             *input_to_output_weights,
+                             const ICLTensor             *recurrent_to_forget_weights,
+                             const ICLTensor             *recurrent_to_cell_weights,
+                             const ICLTensor             *recurrent_to_output_weights,
+                             const ICLTensor             *forget_gate_bias,
+                             const ICLTensor             *cell_bias,
+                             const ICLTensor             *output_gate_bias,
+                             ICLTensor                   *cell_state_in,
+                             ICLTensor                   *output_state_in,
+                             ICLTensor                   *cell_state_out,
+                             ICLTensor                   *output_state_out,
+                             ICLTensor                   *output,
                              const LSTMParams<ICLTensor> &lstm_params)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
@@ -191,11 +222,11 @@
     build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
 
     // Validate
-    ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
-                                                      recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
-                                                      forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
-                                                      cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
-                                                      lstm_params_info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLQLSTMLayer::validate(
+        input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+        recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+        forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+        output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(), lstm_params_info));
 
     const int batch_size  = input->info()->dimension(1);
     const int num_units   = input_to_output_weights->info()->dimension(1);
@@ -216,7 +247,7 @@
 
     // Layer normalization
     _has_layer_norm = lstm_params.use_layer_norm();
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget);
         set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell);
@@ -238,53 +269,75 @@
 
     // Calculate quantized parameters for clipping.
     int16_t quantized_cell_clip = 0;
-    if(lstm_params.cell_clip() > 0.0f)
+    if (lstm_params.cell_clip() > 0.0f)
     {
         quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
     }
     _has_cell_clipping = quantized_cell_clip > 0;
 
     // Precompute effective bias for optimizing the matmul computations.
-    if(!_has_cifg)
+    if (!_has_cifg)
     {
         _input_to_input_weights     = lstm_params.input_to_input_weights();
         _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
 
-        _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-        _recurrent_to_input_reduction->configure(compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
-                                                 -qoutput_state_in.offset, true));
+        _input_to_input_reduction->configure(compile_context, _input_to_input_weights->info(),
+                                             _input_to_input_eff_bias.info(),
+                                             GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+        _recurrent_to_input_reduction->configure(
+            compile_context, _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(),
+            GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
     }
-    _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_forget_reduction->configure(compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
-                                              -qoutput_state_in.offset, true));
-    _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_cell_reduction->configure(compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
-                                            true));
-    _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_output_reduction->configure(compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false,
-                                              -qoutput_state_in.offset, true));
-    if(_has_projection)
+    _input_to_forget_reduction->configure(compile_context, input_to_forget_weights->info(),
+                                          _input_to_forget_eff_bias.info(),
+                                          GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_forget_reduction->configure(
+        compile_context, recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(),
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_cell_reduction->configure(compile_context, input_to_cell_weights->info(), _input_to_cell_eff_bias.info(),
+                                        GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_cell_reduction->configure(
+        compile_context, recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(),
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_output_reduction->configure(compile_context, input_to_output_weights->info(),
+                                          _input_to_output_eff_bias.info(),
+                                          GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_output_reduction->configure(
+        compile_context, recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(),
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    if (_has_projection)
     {
-        _projection_reduction->configure(compile_context, _projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
-        if(_projection_bias != nullptr)
+        _projection_reduction->configure(
+            compile_context, _projection_weights->info(), _projection_eff_bias.info(),
+            GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+        if (_projection_bias != nullptr)
         {
-            _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
+            _projection_bias_add.configure(compile_context, _projection_bias, &_projection_eff_bias,
+                                           &_projection_eff_bias, ConvertPolicy::SATURATE);
         }
     }
 
     // Pre-transpose weights to be used in GEMM.
-    _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights, &_input_to_forget_weights_transposed);
-    _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights, &_input_to_cell_weights_transposed);
-    _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights, &_input_to_output_weights_transposed);
-    _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);
-    _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);
-    _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);
-    if(!_has_cifg)
+    _transpose_input_to_forget_weights.configure(compile_context, input_to_forget_weights,
+                                                 &_input_to_forget_weights_transposed);
+    _transpose_input_to_cell_weights.configure(compile_context, input_to_cell_weights,
+                                               &_input_to_cell_weights_transposed);
+    _transpose_input_to_output_weights.configure(compile_context, input_to_output_weights,
+                                                 &_input_to_output_weights_transposed);
+    _transpose_recurrent_to_forget_weights.configure(compile_context, recurrent_to_forget_weights,
+                                                     &_recurrent_to_forget_weights_transposed);
+    _transpose_recurrent_to_cell_weights.configure(compile_context, recurrent_to_cell_weights,
+                                                   &_recurrent_to_cell_weights_transposed);
+    _transpose_recurrent_to_output_weights.configure(compile_context, recurrent_to_output_weights,
+                                                     &_recurrent_to_output_weights_transposed);
+    if (!_has_cifg)
     {
-        _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);
-        _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);
+        _transpose_input_to_input_weights.configure(compile_context, lstm_params.input_to_input_weights(),
+                                                    &_input_to_input_weights_transposed);
+        _transpose_recurrent_to_input_weights.configure(compile_context, lstm_params.recurrent_to_input_weights(),
+                                                        &_recurrent_to_input_weights_transposed);
     }
-    if(_has_projection)
+    if (_has_projection)
     {
         _transpose_projection_weights.configure(compile_context, _projection_weights, &_projection_weights_transposed);
     }
@@ -297,42 +350,55 @@
 
     const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
     // Forget gate.
-    const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
-    const float      input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
-    configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,
-                 input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,
-                 &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,
-                 mm_out_info, forget_gate_outstage_info);
+    const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+                                               QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+    const float      input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale *
+                                        qinput.scale / lstm_params.forget_intermediate_scale();
+    configure_mm(compile_context, _mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input,
+                 &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res,
+                 &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info);
 
-    const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+    const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
     configure_mm(compile_context, _mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,
                  output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,
                  &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,
                  mm_out_info, forget_gate_outstage_info);
 
-    _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+    _accumulate_input_recurrent_forget.configure(compile_context, &_input_to_forget_outstage_res,
+                                                 &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
                                                  ConvertPolicy::SATURATE);
     _input_to_forget_outstage_res.allocator()->allocate();
 
-    if(_has_peephole)
+    if (_has_peephole)
     {
         _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
         _memory_group.manage(&_mul_cell_to_forget_res);
-        _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-        _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
+        _pixelwise_mul_cell_to_forget.configure(compile_context, cell_state_in, lstm_params.cell_to_forget_weights(),
+                                                &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE,
+                                                RoundingPolicy::TO_ZERO);
+        _cell_to_forget_outstage_res.allocator()->init(
+            TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+                       QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
         _memory_group.manage(&_cell_to_forget_outstage_res);
-        const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
-        quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
-        _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
+        const float cell_to_forget_scale =
+            std::pow(2, cell_shift) *
+            lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale /
+            lstm_params.forget_intermediate_scale();
+        quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                     &gemmlowp_info.gemmlowp_shift);
+        _cell_to_forget_outstage.configure(compile_context, &_mul_cell_to_forget_res, nullptr,
+                                           &_cell_to_forget_outstage_res, gemmlowp_info);
         _mul_cell_to_forget_res.allocator()->allocate();
-        _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+        _accumulate_cell_forget.configure(compile_context, &_recurrent_to_forget_outstage_res,
+                                          &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
                                           ConvertPolicy::SATURATE);
         _cell_to_forget_outstage_res.allocator()->allocate();
     }
 
     CLTensor *forget_activation_input = &_recurrent_to_forget_outstage_res;
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         configure_layer_norm(LayerNormGate::Forget, &_recurrent_to_forget_outstage_res);
         _recurrent_to_forget_outstage_res.allocator()->allocate();
@@ -345,30 +411,33 @@
     const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _memory_group.manage(&_forget_gate);
     _forget_gate.allocator()->init(forget_gate_info);
-    _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _forget_gate_sigmoid.configure(compile_context, forget_activation_input, &_forget_gate,
+                                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     forget_activation_input->allocator()->allocate();
 
     // Modulation gate.
-    const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
-    const float      input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
-    configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
-                 input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,
-                 &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
-                 mm_out_info, cell_outstage_info);
+    const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+                                        QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+    const float      input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale *
+                                      qinput.scale / lstm_params.cell_intermediate_scale();
+    configure_mm(compile_context, _mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input,
+                 &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias, &_mm_input_to_cell_res,
+                 &_input_to_cell_outstage_res, input_to_cell_scale, mm_out_info, cell_outstage_info);
 
-    const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
-    configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
-                 output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,
-                 &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
-                 mm_out_info, cell_outstage_info);
+    const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale *
+                                          qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+    configure_mm(compile_context, _mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in,
+                 &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res,
+                 &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info);
 
-    _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
+    _accumulate_input_recurrent_modulation.configure(compile_context, &_input_to_cell_outstage_res,
+                                                     &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
                                                      ConvertPolicy::SATURATE);
     _input_to_cell_outstage_res.allocator()->allocate();
 
     CLTensor *cell_activation_input = &_recurrent_to_cell_outstage_res;
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         configure_layer_norm(LayerNormGate::Cell, &_recurrent_to_cell_outstage_res);
         _recurrent_to_cell_outstage_res.allocator()->allocate();
@@ -378,14 +447,15 @@
     const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _memory_group.manage(&_cell_gate);
     _cell_gate.allocator()->init(cell_gate_info);
-    _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+    _cell_gate_tanh.configure(compile_context, cell_activation_input, &_cell_gate,
+                              ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
     cell_activation_input->allocator()->allocate();
 
     // Input gate.
     const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _input_gate.allocator()->init(input_gate_info);
     _memory_group.manage(&_input_gate);
-    if(_has_cifg)
+    if (_has_cifg)
     {
         _ones.allocator()->init(*_forget_gate.info());
         _input_gate_sub.configure(compile_context, &_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
@@ -393,107 +463,142 @@
     }
     else
     {
-        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
-        const float      input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
-        configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
-                     input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,
-                     &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,
-                     mm_out_info, input_outstage_info);
+        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                             QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+        const float      input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale *
+                                           qinput.scale / lstm_params.input_intermediate_scale();
+        configure_mm(compile_context, _mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input,
+                     &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res,
+                     &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info);
 
-        const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
+        const float recurrent_to_input_scale =
+            _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale /
+            lstm_params.input_intermediate_scale();
         configure_mm(compile_context, _mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
                      output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
                      &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
                      mm_out_info, input_outstage_info);
-        _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res,
-                                                    ConvertPolicy::SATURATE);
+        _accumulate_input_recurrent_input.configure(compile_context, &_input_to_input_outstage_res,
+                                                    &_recurrent_to_input_outstage_res,
+                                                    &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
         _input_to_input_outstage_res.allocator()->allocate();
 
-        if(_has_peephole)
+        if (_has_peephole)
         {
-            _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
+            _mul_cell_to_input_res.allocator()->init(
+                TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
             _memory_group.manage(&_mul_cell_to_input_res);
-            _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-            const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
-            quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
-            _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
+            _pixelwise_mul_cell_to_input.configure(compile_context, cell_state_in, lstm_params.cell_to_input_weights(),
+                                                   &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE,
+                                                   RoundingPolicy::TO_ZERO);
+            const float cell_to_input_scale =
+                std::pow(2, cell_shift) *
+                lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale /
+                lstm_params.input_intermediate_scale();
+            quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                         &gemmlowp_info.gemmlowp_shift);
+            _cell_to_input_outstage_res.allocator()->init(
+                TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+                           QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
             _memory_group.manage(&_cell_to_input_outstage_res);
-            _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
+            _cell_to_input_outstage.configure(compile_context, &_mul_cell_to_input_res, nullptr,
+                                              &_cell_to_input_outstage_res, gemmlowp_info);
             _mul_cell_to_input_res.allocator()->allocate();
-            _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+            _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res,
+                                             &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
             _cell_to_input_outstage_res.allocator()->allocate();
         }
 
         CLTensor *input_activation_input = &_recurrent_to_input_outstage_res;
 
-        if(_has_layer_norm)
+        if (_has_layer_norm)
         {
             configure_layer_norm(LayerNormGate::Input, &_recurrent_to_input_outstage_res);
             _recurrent_to_input_outstage_res.allocator()->allocate();
             input_activation_input = &get_layer_norm_output(LayerNormGate::Input);
         }
 
-        _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        _input_gate_sigmoid.configure(compile_context, input_activation_input, &_input_gate,
+                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
         input_activation_input->allocator()->allocate();
     }
     // Cell.
     // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
-    _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_forget_cell.configure(compile_context, &_forget_gate, cell_state_in, &_forget_gate, 1.f,
+                                         ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     const float      cell_gate_scale      = _cell_gate.info()->quantization_info().uniform().scale;
     const float      mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
-    const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));
+    const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                         QuantizationInfo(mul_input_cell_scale, 0));
     _memory_group.manage(&_mul_input_cell_res);
     _mul_input_cell_res.allocator()->init(mul_input_cell_info);
-    _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_input_cell.configure(compile_context, &_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f,
+                                        ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _cell_gate.allocator()->allocate();
-    _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
+    _add_forget_cell.configure(compile_context, &_forget_gate, &_mul_input_cell_res, cell_state_out,
+                               ConvertPolicy::SATURATE);
     _mul_input_cell_res.allocator()->allocate();
     _forget_gate.allocator()->allocate();
-    if(_has_cell_clipping)
+    if (_has_cell_clipping)
     {
-        _cell_clip.configure(compile_context, cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
+        _cell_clip.configure(compile_context, cell_state_out, nullptr,
+                             ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                 -quantized_cell_clip, quantized_cell_clip));
     }
     // Output gate.
-    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
-    const float      input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
-    configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
-                 input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,
-                 &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,
-                 mm_out_info, output_outstage_info);
+    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+    const float      input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale *
+                                        qinput.scale / lstm_params.output_intermediate_scale();
+    configure_mm(compile_context, _mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input,
+                 &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res,
+                 &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info);
 
-    const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+    const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.output_intermediate_scale();
     configure_mm(compile_context, _mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
                  output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,
                  &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
                  mm_out_info, output_outstage_info);
 
-    _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+    _accumulate_input_recurrent_output.configure(compile_context, &_recurrent_to_output_outstage_res,
+                                                 &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res,
                                                  ConvertPolicy::SATURATE);
     _input_to_output_outstage_res.allocator()->allocate();
 
-    if(_has_peephole)
+    if (_has_peephole)
     {
         // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
         // Here we are not using the output stage because all operations are done in float
         _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
         _memory_group.manage(&_mul_cell_to_output_res);
-        _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _pixelwise_mul_cell_to_output.configure(compile_context, cell_state_out, lstm_params.cell_to_output_weights(),
+                                                &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE,
+                                                RoundingPolicy::TO_ZERO);
 
-        const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
-        quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
-        _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
+        const float cell_to_output_scale =
+            std::pow(2, cell_shift) *
+            lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale /
+            lstm_params.output_intermediate_scale();
+        quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                     &gemmlowp_info.gemmlowp_shift);
+        _cell_to_output_outstage_res.allocator()->init(
+            TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+                       QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
         _memory_group.manage(&_cell_to_output_outstage_res);
-        _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info);
+        _cell_to_output_outstage.configure(compile_context, &_mul_cell_to_output_res, nullptr,
+                                           &_cell_to_output_outstage_res, gemmlowp_info);
         _mul_cell_to_output_res.allocator()->allocate();
 
-        _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
+        _accumulate_cell_to_output.configure(compile_context, &_recurrent_to_output_outstage_res,
+                                             &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res,
                                              ConvertPolicy::SATURATE);
         _cell_to_output_outstage_res.allocator()->allocate();
     }
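
Annotation: throughout this function, real-valued rescale factors such as cell_to_output_scale above are handed to quantization::calculate_quantized_multiplier, which splits them into an integer multiplier and a power-of-two shift for the GEMMLowp output stage. A minimal sketch of that decomposition, assuming the usual gemmlowp/TFLite-style contract (real ~= quantized * 2^shift / 2^31); the helper name is hypothetical:

    // Sketch only, assuming the gemmlowp-style contract:
    //   real ~= quantized * 2^shift / 2^31, with quantized in [2^30, 2^31).
    #include <cmath>
    #include <cstdint>

    void quantize_multiplier_sketch(double real, int32_t *quantized, int32_t *shift)
    {
        int exp = 0;
        const double q = std::frexp(real, &exp); // real = q * 2^exp, q in [0.5, 1)
        int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1LL << 31)));
        if (q_fixed == (1LL << 31)) // rounding pushed q up to exactly 1.0
        {
            q_fixed /= 2;
            ++exp;
        }
        *quantized = static_cast<int32_t>(q_fixed);
        *shift     = exp;
    }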
 
     CLTensor *output_activation_input = &_recurrent_to_output_outstage_res;
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         configure_layer_norm(LayerNormGate::Output, &_recurrent_to_output_outstage_res);
         _recurrent_to_output_outstage_res.allocator()->allocate();
@@ -503,20 +608,24 @@
     const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _memory_group.manage(&_output_gate);
     _output_gate.allocator()->init(output_gate_info);
-    _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _output_gate_sigmoid.configure(compile_context, output_activation_input, &_output_gate,
+                                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     output_activation_input->allocator()->allocate();
 
     // Hidden.
-    _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+    _hidden_tanh.configure(compile_context, cell_state_out, &_input_gate,
+                           ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
     // TODO(COMPMID-3396): Perform multiplication in the quantized domain in CLPixelWiseMultiplication
     _memory_group.manage(&_hidden_mul_res);
     const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
     _hidden_mul_res.allocator()->init(hidden_mul_res);
-    _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_hidden.configure(compile_context, &_output_gate, &_input_gate, &_hidden_mul_res, 1.f,
+                                    ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _output_gate.allocator()->allocate();
     _input_gate.allocator()->allocate();
     const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
-    quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
+    quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                 &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
     gemmlowp_info.gemmlowp_offset  = lstm_params.hidden_state_zero();
     gemmlowp_info.output_data_type = output_state_in->info()->data_type();
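
Annotation: the hidden_state_scale expression above is not arbitrary. Both gate activations are emitted in QSYMM16 with the fixed Q0.15 scale 1/32768 (sigmoid_tanh_outqinfo), so their element-wise product carries scale 2^-30, which the output stage then requantizes into lstm_params.hidden_state_scale(). An equivalent, perhaps clearer, spelling of the same factor:

    // The product of two Q0.15 tensors carries scale 2^-15 * 2^-15 = 2^-30;
    // the requantization factor into the hidden state's domain is therefore:
    const float hidden_requant_scale =
        (std::pow(2.f, -15) * std::pow(2.f, -15)) / lstm_params.hidden_state_scale();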
 
@@ -525,7 +634,7 @@
 
     _memory_group.manage(&_hidden_gate);
 
-    if(_projection_tensor_copy_required)
+    if (_projection_tensor_copy_required)
     {
         _hidden_gate.allocator()->init(*output_state_out->info());
         _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape());
@@ -536,27 +645,26 @@
     _hidden_mul_res.allocator()->allocate();
 
     // Projection.
-    if(_has_projection)
+    if (_has_projection)
     {
         const TensorInfo              projection_outstage_info(*output_state_out->info());
-        const UniformQuantizationInfo qprojection      = _projection_weights->info()->quantization_info().uniform();
-        const float                   projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
-        gemmlowp_info.gemmlowp_offset                  = qoutput_state_in.offset;
-        gemmlowp_info.gemmlowp_min_bound               = std::numeric_limits<int8_t>::lowest();
-        gemmlowp_info.gemmlowp_max_bound               = std::numeric_limits<int8_t>::max();
-        gemmlowp_info.output_data_type                 = DataType::QASYMM8_SIGNED;
+        const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
+        const float projection_scale  = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+        gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
+        gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
+        gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
+        gemmlowp_info.output_data_type   = DataType::QASYMM8_SIGNED;
 
-        TensorInfo projection_mm_out_info{ mm_out_info };
+        TensorInfo projection_mm_out_info{mm_out_info};
         projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
 
-        configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info,
-                     hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias,
-                     &_mm_projection_res, &_projection_outstage_res, projection_scale,
-                     projection_mm_out_info, projection_outstage_info);
+        configure_mm(compile_context, _mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result,
+                     &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res,
+                     &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info);
 
         ICLTensor *accumulate_destination = output_state_out;
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_gate.allocator()->allocate();
             _projection_accumulate_res.allocator()->init(*output_state_in->info());
@@ -565,31 +673,34 @@
             accumulate_destination = &_projection_accumulate_res;
         }
 
-        _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
+        _accumulate_projection.configure(compile_context, &_projection_outstage_res, accumulate_destination,
+                                         accumulate_destination, ConvertPolicy::SATURATE);
         _projection_outstage_res.allocator()->allocate();
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out);
             _projection_accumulate_res.allocator()->allocate();
         }
 
-        int8_t quantized_projection_clip{ 0 };
-        if(lstm_params.projection_clip() > 0.0f)
+        int8_t quantized_projection_clip{0};
+        if (lstm_params.projection_clip() > 0.0f)
         {
-            quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
+            quantized_projection_clip =
+                utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
         }
 
-        if(quantized_projection_clip > 0)
+        if (quantized_projection_clip > 0)
         {
-            _projection_clip.configure(compile_context, output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
-                                                                                                       quantized_projection_clip));
+            _projection_clip.configure(compile_context, output_state_out, nullptr,
+                                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                           -quantized_projection_clip, quantized_projection_clip));
             _has_projection_clipping = true;
         }
     }
     else
     {
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
             _hidden_gate.allocator()->allocate();
@@ -600,17 +711,27 @@
     _copy_output.configure(compile_context, output_state_out, output);
 }
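
Annotation: every configure_mm/validate_mm pair in this file follows the same scale algebra. A quantized matmul accumulates in S32 with scale w_scale * x_scale, and the fused output stage must requantize that accumulator into the gate's intermediate scale. A worked instance, with hypothetical numbers:

    // Hypothetical values, for illustration only.
    const float w_scale            = 0.003f; // QSYMM8 weights
    const float x_scale            = 0.050f; // QASYMM8_SIGNED input activations
    const float intermediate_scale = 0.007f; // QSYMM16 gate outstage target

    // acc_real = acc_int * (w_scale * x_scale); requiring
    // out_int * intermediate_scale == acc_real gives the single effective
    // multiplier handed to the output stage:
    const float effective_scale = w_scale * x_scale / intermediate_scale; // ~0.0214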
 
-Status CLQLSTMLayer::validate(const ITensorInfo *input,
-                              const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                              const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                              const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                              const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
-                              const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+Status CLQLSTMLayer::validate(const ITensorInfo             *input,
+                              const ITensorInfo             *input_to_forget_weights,
+                              const ITensorInfo             *input_to_cell_weights,
+                              const ITensorInfo             *input_to_output_weights,
+                              const ITensorInfo             *recurrent_to_forget_weights,
+                              const ITensorInfo             *recurrent_to_cell_weights,
+                              const ITensorInfo             *recurrent_to_output_weights,
+                              const ITensorInfo             *forget_gate_bias,
+                              const ITensorInfo             *cell_bias,
+                              const ITensorInfo             *output_gate_bias,
+                              const ITensorInfo             *cell_state_in,
+                              const ITensorInfo             *output_state_in,
+                              const ITensorInfo             *cell_state_out,
+                              const ITensorInfo             *output_state_out,
+                              const ITensorInfo             *output,
                               const LSTMParams<ITensorInfo> &lstm_params)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
-                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
-                                        cell_state_out, output_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                        recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+                                        cell_state_in, output_state_in, cell_state_out, output_state_out, output);
 
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
@@ -622,13 +743,16 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights,
+                                                   input_to_cell_weights);
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights,
+                                                   recurrent_to_cell_weights);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QSYMM8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                                       recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights,
+                                                       input_to_output_weights, recurrent_to_forget_weights,
+                                                       recurrent_to_cell_weights, recurrent_to_output_weights);
 
     ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
@@ -647,20 +771,25 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);
 
     // Check whether peephole weights are all there or none
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+                                                             DataType::QSYMM16);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+                                                           lstm_params.cell_to_output_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+                                                       lstm_params.cell_to_output_weights());
 
-        if(!lstm_params.has_cifg_opt())
+        if (!lstm_params.has_cifg_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+                                                               lstm_params.cell_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+                                                           lstm_params.cell_to_input_weights());
         }
     }
 
@@ -674,7 +803,7 @@
 
     // Calculate quantized parameters for clipping.
     int16_t quantized_cell_clip = 0;
-    if(lstm_params.cell_clip() > 0.0f)
+    if (lstm_params.cell_clip() > 0.0f)
     {
         quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
     }
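
Annotation: quantize_qsymm16 maps the float clip threshold into the cell state's symmetric 16-bit domain. A minimal sketch of that mapping, assuming round-to-nearest with saturation (QSYMM16 carries no zero-point); the function below is an illustration, not the library implementation:

    // Sketch only: symmetric 16-bit quantization, assuming round-to-nearest
    // and saturation to the int16_t range.
    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    int16_t quantize_qsymm16_sketch(float value, float scale)
    {
        const int q = static_cast<int>(std::lround(value / scale));
        return static_cast<int16_t>(std::clamp(q, -32768, 32767));
    }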
@@ -682,33 +811,50 @@
     // Precompute effective bias for optimizing the matmul computations.
     const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
     const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
-                                                                               true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.input_to_input_weights(), &eff_bias_info,
+            GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.recurrent_to_input_weights(), &eff_bias_info,
+            GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
-    if(lstm_params.has_projection())
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_forget_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_cell_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_output_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    if (lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
-                                                                               lstm_params.hidden_state_zero(),
-                                                                               true)));
-        if(lstm_params.projection_bias() != nullptr)
+        ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.projection_weights(), &projection_eff_bias_info,
+            GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)));
+        if (lstm_params.projection_bias() != nullptr)
         {
             ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
-            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
-                                                                       &projection_eff_bias_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
+                                               &projection_eff_bias_info, ConvertPolicy::SATURATE));
         }
     }
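
Annotation: the matrix-A reductions validated above are what make the effective-bias trick work. For a quantized matmul y[j] = sum_i (x[i] - zp_x) * W[i][j] + b[j], the input-independent term -zp_x * sum_i W[i][j] can be folded into the bias once, which is why the kernels are validated with -qinput.offset (resp. -qoutput_state_in.offset) as the reduction scalar. A sketch of the folded bias, with hypothetical container types:

    // Sketch only: eff_bias[j] = b[j] - zp_x * sum_i W[i][j]
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<int32_t> effective_bias_sketch(const std::vector<std::vector<int8_t>> &W, // [in][out]
                                               const std::vector<int32_t>             &bias,
                                               int32_t                                 zp_x)
    {
        std::vector<int32_t> eff(bias);
        for (const auto &row : W)
            for (std::size_t j = 0; j < eff.size(); ++j)
                eff[j] -= zp_x * row[j];
        return eff;
    }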
 
-    const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_forget_weights->data_type(), input_to_forget_weights->quantization_info());
-    const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
+    const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1,
+                                              input_to_forget_weights->data_type(),
+                                              input_to_forget_weights->quantization_info());
+    const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1,
+                                                  recurrent_to_forget_weights->data_type(),
+                                                  recurrent_to_forget_weights->quantization_info());
 
     // Validate weights transpose
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(input_to_forget_weights, &input_weights_transposed));
@@ -717,15 +863,20 @@
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_forget_weights, &recurrent_weights_transposed));
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_cell_weights, &recurrent_weights_transposed));
     ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(recurrent_to_output_weights, &recurrent_weights_transposed));
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLTranspose::validate(lstm_params.input_to_input_weights(), &input_weights_transposed));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLTranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_weights_transposed));
     }
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
-        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
-        ARM_COMPUTE_RETURN_ON_ERROR(CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
+        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+                                                       lstm_params.projection_weights()->data_type(),
+                                                       lstm_params.projection_weights()->quantization_info());
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLTranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
     }
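
Annotation: all weight tensors are transposed once up front (dimension 0 is the fastest-moving in ACL, so a [input_size, num_units] tensor becomes [num_units, input_size]) and every GEMM then consumes the transposed copy. A plain row-major sketch of the operation being validated here:

    // Sketch only: out is the row-major transpose of a row-major R x C matrix.
    #include <cstddef>
    #include <cstdint>
    #include <vector>

    std::vector<int8_t> transpose_sketch(const std::vector<int8_t> &w, std::size_t R, std::size_t C)
    {
        std::vector<int8_t> t(w.size());
        for (std::size_t r = 0; r < R; ++r)
            for (std::size_t c = 0; c < C; ++c)
                t[c * R + r] = w[r * C + c];
        return t;
    }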
 
     GEMMLowpOutputStageInfo gemmlowp_info;
@@ -738,28 +889,42 @@
 
     // Forget gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0);
-    const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+    const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
     const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
-    const float      input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));
+    const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale /
+                                        lstm_params.forget_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_forget_scale, &mm_out_info, &forget_outstage_info));
 
-    const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
+    const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_forget_scale, &mm_out_info,
+                                            &forget_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+                                                               &forget_outstage_info, ConvertPolicy::SATURATE));
 
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+                                                             DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+        const float cell_to_forget_scale = std::pow(2, cell_shift) *
+                                           lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale /
+                                           lstm_params.forget_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+            cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+                                                                   &forget_outstage_info, ConvertPolicy::SATURATE));
     }
 
-    if(has_layer_norm)
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
         const ITensorInfo *b_info = forget_gate_bias;
@@ -770,20 +935,29 @@
     const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
 
     const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&forget_outstage_info, &forget_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Modulation gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0);
-    const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
-    const float      input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
+    const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                        QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+    const float      input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale /
+                                      lstm_params.cell_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_cell_scale, &mm_out_info, &cell_outstage_info));
 
-    const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
+    const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale *
+                                          qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_cell_scale, &mm_out_info,
+                                            &cell_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info,
+                                                               &cell_outstage_info, ConvertPolicy::SATURATE));
 
-    if(has_layer_norm)
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
         const ITensorInfo *b_info = cell_bias;
@@ -791,85 +965,123 @@
     }
 
     const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&cell_outstage_info, &cell_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
 
     // Input gate.
     const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    if(lstm_params.has_cifg_opt())
+    if (lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr,
+                                        "Input gate bias must not be present when CIFG is used");
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info,
+                                                                      &forget_gate_info, ConvertPolicy::SATURATE));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+                                            lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+            input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights,
+                                                       lstm_params.recurrent_to_input_weights());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
 
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0);
-        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
-        const float      input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
+        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                             QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+        const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale *
+                                           qinput.scale / lstm_params.input_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                                input_to_input_scale, &mm_out_info, &input_outstage_info));
 
-        const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
+        const float recurrent_to_input_scale =
+            lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale /
+            lstm_params.input_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                                &eff_bias_info, recurrent_to_input_scale, &mm_out_info,
+                                                &input_outstage_info));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+                                                                   &input_outstage_info, ConvertPolicy::SATURATE));
 
-        if(lstm_params.has_peephole_opt())
+        if (lstm_params.has_peephole_opt())
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                            RoundingPolicy::TO_ZERO));
-            const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
-            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
-            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info,
+                                                    1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+            const float cell_to_input_scale = std::pow(2, cell_shift) *
+                                              lstm_params.cell_to_input_weights()->quantization_info().uniform().scale /
+                                              lstm_params.input_intermediate_scale();
+            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+                cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+                                                                       &input_outstage_info, ConvertPolicy::SATURATE));
         }
 
-        if(has_layer_norm)
+        if (has_layer_norm)
         {
             const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
             const ITensorInfo *b_info = lstm_params.input_gate_bias();
             ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(cell_outstage_info, *w_info, *b_info));
         }
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+            &input_outstage_info, &input_gate_info,
+            ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC, 1.f, 1.f)));
     }
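
Annotation: in the CIFG branch above the input gate is never computed from weights; it is coupled to the forget gate as input_gate = 1 - forget_gate, evaluated directly in the Q0.15 domain by the subtraction validated there. A sketch, under the assumption that 1.0 is represented by the saturated Q0.15 constant 32767:

    // Sketch only: input_gate = 1 - forget_gate in Q0.15, with saturation.
    #include <algorithm>
    #include <cstdint>

    int16_t cifg_input_gate_sketch(int16_t forget_gate_q15)
    {
        const int32_t one_q15 = 32767; // assumed Q0.15 representation of 1.0
        const int32_t diff    = one_q15 - static_cast<int32_t>(forget_gate_q15);
        return static_cast<int16_t>(std::clamp<int32_t>(diff, -32768, 32767));
    }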
     // Cell.
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
-    if(quantized_cell_clip > 0)
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+    if (quantized_cell_clip > 0)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
-                                                                                                             quantized_cell_clip)));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            CLActivationLayer::validate(cell_state_out, nullptr,
+                                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                            -quantized_cell_clip, quantized_cell_clip)));
     }
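
Annotation: the activation validated above implements the cell clip. Element-wise, the freshly updated cell state is clamped into [-quantized_cell_clip, +quantized_cell_clip]; functionally (parameter plumbing aside) it behaves like:

    // Sketch only: the intended effect of the bounded ReLU used as cell clip.
    #include <algorithm>
    #include <cstdint>

    int16_t cell_clip_sketch(int16_t x, int16_t clip)
    {
        return static_cast<int16_t>(std::clamp<int32_t>(x, -clip, clip));
    }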
     // Output gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0);
-    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
-    const float      input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
+    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+    const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale /
+                                        lstm_params.output_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_output_scale, &mm_out_info, &output_outstage_info));
 
-    const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
+    const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_output_scale, &mm_out_info,
+                                            &output_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
-    if(lstm_params.has_peephole_opt())
+    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+                                                               &output_outstage_info, ConvertPolicy::SATURATE));
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1,
+                                                             DataType::QSYMM16);
         // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplicationKernel
         // Here we are not using the output stage because all operations are done in float
         // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
         // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+            cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+            RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+                                                                   &output_outstage_info, ConvertPolicy::SATURATE));
     }
 
-    if(has_layer_norm)
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
         const ITensorInfo *b_info = output_gate_bias;
@@ -877,85 +1089,103 @@
     }
 
     const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(&output_outstage_info, &output_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Hidden.
-    ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLActivationLayer::validate(cell_state_out, &input_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
     const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
     const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
 
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
-    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplication::validate(
+        &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
     const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
-    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                     &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
     gemmlowp_info.gemmlowp_offset  = lstm_params.hidden_state_zero();
     gemmlowp_info.output_data_type = hidden_out_info.data_type();
-    ARM_COMPUTE_RETURN_ON_ERROR(CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
 
     const bool projection_tensor_copy_required = num_units != output_size;
 
     // Projection.
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights,
+                                                           lstm_params.projection_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0);
 
-        const UniformQuantizationInfo qprojection      = lstm_params.projection_weights()->quantization_info().uniform();
-        const float                   projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+        const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
+        const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+            projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
         gemmlowp_info.gemmlowp_offset    = qoutput_state_in.offset;
         gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
         gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
         gemmlowp_info.output_data_type   = DataType::QASYMM8_SIGNED;
 
         const TensorInfo projection_outstage_info(*output_state_out);
-        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
+        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+                                                       lstm_params.projection_weights()->data_type(),
+                                                       lstm_params.projection_weights()->quantization_info());
 
-        TensorInfo projection_mm_out_info{ mm_out_info };
+        TensorInfo projection_mm_out_info{mm_out_info};
         projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed,
+                                                &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
                                                 &projection_outstage_info));
 
-        if(projection_tensor_copy_required)
+        if (projection_tensor_copy_required)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
         }
 
-        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(output_state_out, output_state_out, output_state_out,
+                                                                   ConvertPolicy::SATURATE));
 
-        if(projection_tensor_copy_required)
+        if (projection_tensor_copy_required)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                CLQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
         }
 
-        int8_t quantized_projection_clip{ 0 };
-        if(lstm_params.projection_clip() > 0.0f)
+        int8_t quantized_projection_clip{0};
+        if (lstm_params.projection_clip() > 0.0f)
         {
             quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
         }
 
-        if(quantized_projection_clip > 0)
+        if (quantized_projection_clip > 0)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
-                                                                                                                   quantized_projection_clip)));
+            ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(
+                output_state_out, nullptr,
+                ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                    -quantized_projection_clip, quantized_projection_clip)));
         }
     }
     else
     {
-        if(projection_tensor_copy_required)
+        if (projection_tensor_copy_required)
         {
             ARM_COMPUTE_RETURN_ON_ERROR(CLQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out));
         }
     }
 
-    if(cell_state_out->total_size() > 0)
+    if (cell_state_out->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
     }
 
-    if(output_state_out->total_size() > 0)
+    if (output_state_out->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
@@ -980,14 +1210,14 @@
     _recurrent_to_forget_outstage.run();
     _accumulate_input_recurrent_forget.run();
 
-    if(_has_peephole)
+    if (_has_peephole)
     {
         _pixelwise_mul_cell_to_forget.run();
         _cell_to_forget_outstage.run();
         _accumulate_cell_forget.run();
     }
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Forget));
     }
@@ -1002,7 +1232,7 @@
     _recurrent_to_cell_outstage.run();
     _accumulate_input_recurrent_modulation.run();
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Cell));
     }
@@ -1010,7 +1240,7 @@
     _cell_gate_tanh.run();
 
     // Input gate
-    if(_has_cifg)
+    if (_has_cifg)
     {
         _input_gate_sub.run();
     }
@@ -1022,14 +1252,14 @@
         _recurrent_to_input_outstage.run();
         _accumulate_input_recurrent_input.run();
 
-        if(_has_peephole)
+        if (_has_peephole)
         {
             _pixelwise_mul_cell_to_input.run();
             _cell_to_input_outstage.run();
             _accumulate_cell_input.run();
         }
 
-        if(_has_layer_norm)
+        if (_has_layer_norm)
         {
             CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Input));
         }
@@ -1041,7 +1271,7 @@
     _pixelwise_mul_forget_cell.run();
     _pixelwise_mul_input_cell.run();
     _add_forget_cell.run();
-    if(_has_cell_clipping)
+    if (_has_cell_clipping)
     {
         _cell_clip.run();
     }
@@ -1052,14 +1282,14 @@
     _mm_recurrent_to_output.run();
     _recurrent_to_output_outstage.run();
     _accumulate_input_recurrent_output.run();
-    if(_has_peephole)
+    if (_has_peephole)
     {
         _pixelwise_mul_cell_to_output.run();
         _cell_to_output_outstage.run();
         _accumulate_cell_to_output.run();
     }
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         CLScheduler::get().enqueue(get_layer_norm(LayerNormGate::Output));
     }
@@ -1072,31 +1302,31 @@
     _hidden_outstage.run();
 
     // Projection.
-    if(_has_projection)
+    if (_has_projection)
     {
         _mm_projection.run();
         _projection_outstage.run();
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _projection_output_to_accumulate_copy.run();
         }
 
         _accumulate_projection.run();
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _projection_accumulate_to_output_copy.run();
         }
 
-        if(_has_projection_clipping)
+        if (_has_projection_clipping)
         {
             _projection_clip.run();
         }
     }
     else
     {
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_to_output_copy.run();
         }
@@ -1108,7 +1338,7 @@
 
 void CLQLSTMLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         // Pre-transpose weights to be used in GEMM.
         _input_to_forget_weights_transposed.allocator()->allocate();
@@ -1125,10 +1355,11 @@
         _transpose_recurrent_to_output_weights.run();
 
         // Precompute effective biases
-        if(_has_cifg)
+        if (_has_cifg)
         {
             _ones.map(true);
-            std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);
+            std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()),
+                        _ones.info()->total_size() / _ones.info()->element_size(), 32767);
             _ones.unmap();
         }
         else
@@ -1136,10 +1367,12 @@
             _input_to_input_eff_bias.allocator()->allocate();
             _recurrent_to_input_eff_bias.allocator()->allocate();
 
-            ITensorPack input_to_input_red_pack = { { ACL_SRC, _input_to_input_weights }, { ACL_DST, &_input_to_input_eff_bias } };
+            ITensorPack input_to_input_red_pack = {{ACL_SRC, _input_to_input_weights},
+                                                   {ACL_DST, &_input_to_input_eff_bias}};
             CLScheduler::get().enqueue_op(*_input_to_input_reduction, input_to_input_red_pack, false);
 
-            ITensorPack rec_to_input_red_pack = { { ACL_SRC, _recurrent_to_input_weights }, { ACL_DST, &_recurrent_to_input_eff_bias } };
+            ITensorPack rec_to_input_red_pack = {{ACL_SRC, _recurrent_to_input_weights},
+                                                 {ACL_DST, &_recurrent_to_input_eff_bias}};
             CLScheduler::get().enqueue_op(*_recurrent_to_input_reduction, rec_to_input_red_pack, false);
 
             _input_to_input_weights_transposed.allocator()->allocate();
@@ -1156,30 +1389,35 @@
         _input_to_output_eff_bias.allocator()->allocate();
         _recurrent_to_output_eff_bias.allocator()->allocate();
 
-        ITensorPack input_to_forget_red_pack = { { ACL_SRC, _input_to_forget_weights }, { ACL_DST, &_input_to_forget_eff_bias } };
+        ITensorPack input_to_forget_red_pack = {{ACL_SRC, _input_to_forget_weights},
+                                                {ACL_DST, &_input_to_forget_eff_bias}};
         CLScheduler::get().enqueue_op(*_input_to_forget_reduction, input_to_forget_red_pack, false);
 
-        ITensorPack rec_to_forget_red_pack = { { ACL_SRC, _recurrent_to_forget_weights }, { ACL_DST, &_recurrent_to_forget_eff_bias } };
+        ITensorPack rec_to_forget_red_pack = {{ACL_SRC, _recurrent_to_forget_weights},
+                                              {ACL_DST, &_recurrent_to_forget_eff_bias}};
         CLScheduler::get().enqueue_op(*_recurrent_to_forget_reduction, rec_to_forget_red_pack, false);
 
-        ITensorPack input_to_cell_red_pack = { { ACL_SRC, _input_to_cell_weights }, { ACL_DST, &_input_to_cell_eff_bias } };
+        ITensorPack input_to_cell_red_pack = {{ACL_SRC, _input_to_cell_weights}, {ACL_DST, &_input_to_cell_eff_bias}};
         CLScheduler::get().enqueue_op(*_input_to_cell_reduction, input_to_cell_red_pack, false);
 
-        ITensorPack rec_to_cell_red_pack = { { ACL_SRC, _recurrent_to_cell_weights }, { ACL_DST, &_recurrent_to_cell_eff_bias } };
+        ITensorPack rec_to_cell_red_pack = {{ACL_SRC, _recurrent_to_cell_weights},
+                                            {ACL_DST, &_recurrent_to_cell_eff_bias}};
         CLScheduler::get().enqueue_op(*_recurrent_to_cell_reduction, rec_to_cell_red_pack, false);
 
-        ITensorPack input_to_output_red_pack = { { ACL_SRC, _input_to_output_weights }, { ACL_DST, &_input_to_output_eff_bias } };
+        ITensorPack input_to_output_red_pack = {{ACL_SRC, _input_to_output_weights},
+                                                {ACL_DST, &_input_to_output_eff_bias}};
         CLScheduler::get().enqueue_op(*_input_to_output_reduction, input_to_output_red_pack, false);
 
-        ITensorPack rec_to_output_red_pack = { { ACL_SRC, _recurrent_to_output_weights }, { ACL_DST, &_recurrent_to_output_eff_bias } };
+        ITensorPack rec_to_output_red_pack = {{ACL_SRC, _recurrent_to_output_weights},
+                                              {ACL_DST, &_recurrent_to_output_eff_bias}};
         CLScheduler::get().enqueue_op(*_recurrent_to_output_reduction, rec_to_output_red_pack, false);
 
-        if(_has_projection)
+        if (_has_projection)
         {
             _projection_eff_bias.allocator()->allocate();
-            ITensorPack proj_red_pack{ { ACL_SRC, _projection_weights }, { ACL_DST, &_projection_eff_bias } };
+            ITensorPack proj_red_pack{{ACL_SRC, _projection_weights}, {ACL_DST, &_projection_eff_bias}};
             CLScheduler::get().enqueue_op(*_projection_reduction, proj_red_pack, false);
-            if(_projection_bias != nullptr)
+            if (_projection_bias != nullptr)
             {
                 _projection_bias_add.run();
                 _projection_bias->mark_as_unused();
@@ -1189,7 +1427,7 @@
             _transpose_projection_weights.run();
             _projection_weights->mark_as_unused();
 
-            if(!_projection_tensor_copy_required)
+            if (!_projection_tensor_copy_required)
             {
                 _hidden_gate.mark_as_unused();
                 _projection_accumulate_res.mark_as_unused();
diff --git a/src/runtime/CL/functions/CLQuantizationLayer.cpp b/src/runtime/CL/functions/CLQuantizationLayer.cpp
index b249bdd..6edef29 100644
--- a/src/runtime/CL/functions/CLQuantizationLayer.cpp
+++ b/src/runtime/CL/functions/CLQuantizationLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClQuantize.h"
 
@@ -32,13 +33,12 @@
 {
 struct CLQuantizationLayer::Impl
 {
-    const ICLTensor                    *src{ nullptr };
-    ICLTensor                          *dst{ nullptr };
-    std::unique_ptr<opencl::ClQuantize> op{ nullptr };
+    const ICLTensor                    *src{nullptr};
+    ICLTensor                          *dst{nullptr};
+    std::unique_ptr<opencl::ClQuantize> op{nullptr};
 };
 
-CLQuantizationLayer::CLQuantizationLayer()
-    : _impl(std::make_unique<Impl>())
+CLQuantizationLayer::CLQuantizationLayer() : _impl(std::make_unique<Impl>())
 {
 }
 CLQuantizationLayer::~CLQuantizationLayer() = default;
diff --git a/src/runtime/CL/functions/CLRNNLayer.cpp b/src/runtime/CL/functions/CLRNNLayer.cpp
index 6f12286..34b78ee 100644
--- a/src/runtime/CL/functions/CLRNNLayer.cpp
+++ b/src/runtime/CL/functions/CLRNNLayer.cpp
@@ -28,24 +28,37 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
 
 namespace arm_compute
 {
 using namespace arm_compute::misc::shape_calculator;
 
 CLRNNLayer::CLRNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_kernel(), _activation(), _fully_connected_kernel(), _copy(), _fully_connected_out(), _gemm_output(), _add_output(),
+    : _memory_group(std::move(memory_manager)),
+      _gemm_state_f(),
+      _add_kernel(),
+      _activation(),
+      _fully_connected_kernel(),
+      _copy(),
+      _fully_connected_out(),
+      _gemm_output(),
+      _add_output(),
       _is_prepared(false)
 {
 }
 
 CLRNNLayer::~CLRNNLayer() = default;
 
-Status CLRNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
-                            const ITensorInfo *output, const ActivationLayerInfo &info)
+Status CLRNNLayer::validate(const ITensorInfo         *input,
+                            const ITensorInfo         *weights,
+                            const ITensorInfo         *recurrent_weights,
+                            const ITensorInfo         *bias,
+                            const ITensorInfo         *hidden_state,
+                            const ITensorInfo         *output,
+                            const ActivationLayerInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
@@ -63,28 +76,42 @@
     ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
 
-    auto shape_info = TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+    auto shape_info =
+        TensorInfo(compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, weights, bias, &shape_info));
     ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(hidden_state, recurrent_weights, nullptr, &shape_info, 1.f, 0.f));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
     ARM_COMPUTE_RETURN_ON_ERROR(CLActivationLayer::validate(&shape_info, &shape_info, info));
 
     return Status{};
 }
 
-void CLRNNLayer::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias, ICLTensor *hidden_state, ICLTensor *output,
+void CLRNNLayer::configure(const ICLTensor     *input,
+                           const ICLTensor     *weights,
+                           const ICLTensor     *recurrent_weights,
+                           const ICLTensor     *bias,
+                           ICLTensor           *hidden_state,
+                           ICLTensor           *output,
                            ActivationLayerInfo &info)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state, output, info);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, recurrent_weights, bias, hidden_state,
+              output, info);
 }
 
-void CLRNNLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *recurrent_weights, const ICLTensor *bias,
-                           ICLTensor *hidden_state,
-                           ICLTensor *output, ActivationLayerInfo &info)
+void CLRNNLayer::configure(const CLCompileContext &compile_context,
+                           const ICLTensor        *input,
+                           const ICLTensor        *weights,
+                           const ICLTensor        *recurrent_weights,
+                           const ICLTensor        *bias,
+                           ICLTensor              *hidden_state,
+                           ICLTensor              *output,
+                           ActivationLayerInfo    &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
-    ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(CLRNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(),
+                                                    bias->info(), hidden_state->info(), output->info(), info));
     ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info);
 
     const int   idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
@@ -133,7 +160,7 @@
 
 void CLRNNLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         _fully_connected_kernel.prepare();
         _gemm_state_f.prepare();
diff --git a/src/runtime/CL/functions/CLROIAlignLayer.cpp b/src/runtime/CL/functions/CLROIAlignLayer.cpp
index 867ef7c..1939d1d 100644
--- a/src/runtime/CL/functions/CLROIAlignLayer.cpp
+++ b/src/runtime/CL/functions/CLROIAlignLayer.cpp
@@ -24,26 +24,36 @@
 #include "arm_compute/runtime/CL/functions/CLROIAlignLayer.h"
 
 #include "arm_compute/core/CL/ICLArray.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLROIAlignLayerKernel.h"
 #include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
-Status CLROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIAlignLayer::validate(const ITensorInfo         *input,
+                                 const ITensorInfo         *rois,
+                                 ITensorInfo               *output,
+                                 const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(CLROIAlignLayerKernel::validate(input, rois, output, pool_info));
 
     return Status{};
 }
 
-void CLROIAlignLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayer::configure(const ICLTensor           *input,
+                                const ICLTensor           *rois,
+                                ICLTensor                 *output,
+                                const ROIPoolingLayerInfo &pool_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
 }
 
-void CLROIAlignLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIAlignLayer::configure(const CLCompileContext    &compile_context,
+                                const ICLTensor           *input,
+                                const ICLTensor           *rois,
+                                ICLTensor                 *output,
+                                const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
 
diff --git a/src/runtime/CL/functions/CLROIPoolingLayer.cpp b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
index 239a1c6..0d2eab0 100644
--- a/src/runtime/CL/functions/CLROIPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLROIPoolingLayer.cpp
@@ -22,24 +22,35 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLROIPoolingLayer.h"
+
 #include "arm_compute/core/CL/ICLArray.h"
-#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLROIPoolingLayerKernel.h"
 
 using namespace arm_compute;
 
-Status CLROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status CLROIPoolingLayer::validate(const ITensorInfo         *input,
+                                   const ITensorInfo         *rois,
+                                   ITensorInfo               *output,
+                                   const ROIPoolingLayerInfo &pool_info)
 {
     return CLROIPoolingLayerKernel::validate(input, rois, output, pool_info);
 }
 
-void CLROIPoolingLayer::configure(const ICLTensor *input, const ICLTensor *rois, ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const ICLTensor           *input,
+                                  const ICLTensor           *rois,
+                                  ICLTensor                 *output,
+                                  const ROIPoolingLayerInfo &pool_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, rois, output, pool_info);
 }
 
-void CLROIPoolingLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *rois, const ICLTensor *output, const ROIPoolingLayerInfo &pool_info)
+void CLROIPoolingLayer::configure(const CLCompileContext    &compile_context,
+                                  const ICLTensor           *input,
+                                  const ICLTensor           *rois,
+                                  const ICLTensor           *output,
+                                  const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
 
diff --git a/src/runtime/CL/functions/CLRange.cpp b/src/runtime/CL/functions/CLRange.cpp
index 3fbbd5f..5c3f7f9 100644
--- a/src/runtime/CL/functions/CLRange.cpp
+++ b/src/runtime/CL/functions/CLRange.cpp
@@ -27,9 +27,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLRangeKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLRangeKernel.h"
 
 using namespace arm_compute;
 
@@ -38,7 +38,8 @@
     configure(CLKernelLibrary::get().get_compile_context(), output, start, end, step);
 }
 
-void CLRange::configure(const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
+void CLRange::configure(
+    const CLCompileContext &compile_context, ICLTensor *output, const float start, const float end, const float step)
 {
     ARM_COMPUTE_LOG_PARAMS(output, start, end, step);
     auto k = std::make_unique<CLRangeKernel>();
diff --git a/src/runtime/CL/functions/CLReduceMean.cpp b/src/runtime/CL/functions/CLReduceMean.cpp
index cddbf77..6c6daff 100644
--- a/src/runtime/CL/functions/CLReduceMean.cpp
+++ b/src/runtime/CL/functions/CLReduceMean.cpp
@@ -27,23 +27,25 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/CL/kernels/CLFillBorderKernel.h"
 #include "src/core/CL/kernels/CLReductionOperationKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 namespace
 {
-Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status
+validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
     ARM_COMPUTE_UNUSED(keep_dims);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
@@ -51,29 +53,29 @@
     const int          input_dims    = input->num_dimensions();
     Coordinates        axis_local    = reduction_axis;
 
-    for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
     {
         //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
         ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
         ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
     }
 
-    if(output->tensor_shape().total_size() != 0)
+    if (output->tensor_shape().total_size() != 0)
     {
         // Only validate if not using auto_init for the output tensor
         TensorShape out_shape = input->tensor_shape();
         // Validate output_shape only if not using auto_init
         convert_negative_axis(axis_local, input_dims);
         std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
-        for(unsigned int i = 0; i < reduction_ops; ++i)
+        for (unsigned int i = 0; i < reduction_ops; ++i)
         {
             ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
             ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
-            if(output->total_size() > 0 && keep_dims)
+            if (output->total_size() > 0 && keep_dims)
             {
                 ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
             }
-            if(keep_dims)
+            if (keep_dims)
             {
                 out_shape.set(axis_local[i], 1);
             }
@@ -87,8 +89,9 @@
         }
         const TensorInfo out_info = input->clone()->set_tensor_shape(out_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &out_info);
-        const bool requant = is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
-        if(requant)
+        const bool requant =
+            is_data_type_quantized(input->data_type()) && input->quantization_info() != output->quantization_info();
+        if (requant)
         {
             TensorInfo input_no_quant(input->clone()->set_data_type(DataType::F32));
             CLDequantizationLayer::validate(input, &input_no_quant);
@@ -98,10 +101,19 @@
     }
     return Status{};
 }
-}
+} // namespace
 
 CLReduceMean::CLReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _dequant(), _requant(), _reduction_ops(), _keep_dims(), _do_requant(), _input_no_quant(),
+    : _memory_group(std::move(memory_manager)),
+      _reduction_kernels(),
+      _reduced_outs(),
+      _reshape(),
+      _dequant(),
+      _requant(),
+      _reduction_ops(),
+      _keep_dims(),
+      _do_requant(),
+      _input_no_quant(),
       _output_no_quant()
 {
 }
@@ -111,17 +123,23 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, reduction_axis, keep_dims, output);
 }
 
-void CLReduceMean::configure(const CLCompileContext &compile_context, ICLTensor *input, const Coordinates &reduction_axis, bool keep_dims, ICLTensor *output)
+void CLReduceMean::configure(const CLCompileContext &compile_context,
+                             ICLTensor              *input,
+                             const Coordinates      &reduction_axis,
+                             bool                    keep_dims,
+                             ICLTensor              *output)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_THROW_ON(CLReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
     ARM_COMPUTE_LOG_PARAMS(input, reduction_axis, keep_dims, output);
 
     // Output auto initialization if not yet initialized
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
+    const TensorShape output_shape =
+        arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
-    _do_requant    = is_data_type_quantized(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info();
+    _do_requant = is_data_type_quantized(input->info()->data_type()) &&
+                  input->info()->quantization_info() != output->info()->quantization_info();
     _reduction_ops = reduction_axis.num_dimensions();
     _reduction_kernels.resize(_reduction_ops);
     _reduced_outs.resize(_reduction_ops - (keep_dims ? 1 : 0));
@@ -129,7 +147,7 @@
 
     ICLTensor *tmp_input  = input;
     ICLTensor *tmp_output = output;
-    if(_do_requant)
+    if (_do_requant)
     {
         _memory_group.manage(&_input_no_quant);
         _memory_group.manage(&_output_no_quant);
@@ -148,46 +166,51 @@
     convert_negative_axis(axis_local, input_dims);
 
     // Perform reduction for every axis
-    for(int i = 0; i < _reduction_ops; ++i)
+    for (int i = 0; i < _reduction_ops; ++i)
     {
-        TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+        TensorShape out_shape =
+            i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
         out_shape.set(axis_local[i], 1);
         auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]);
 
-        if(i == _reduction_ops - 1 && keep_dims)
+        if (i == _reduction_ops - 1 && keep_dims)
         {
-            _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(compile_context, in, tmp_output, axis_local[i],
+                                            ReductionOperation::MEAN_SUM);
         }
         else
         {
-            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(), tmp_input->info()->data_type(), tmp_input->info()->quantization_info()));
+            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_input->info()->num_channels(),
+                                                          tmp_input->info()->data_type(),
+                                                          tmp_input->info()->quantization_info()));
             _memory_group.manage(&_reduced_outs[i]);
-            _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
+            _reduction_kernels[i].configure(compile_context, in, &_reduced_outs[i], axis_local[i],
+                                            ReductionOperation::MEAN_SUM);
         }
     }
 
     // Allocate intermediate tensors
-    for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+    for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
     {
         _reduced_outs[i].allocator()->allocate();
     }
 
     // Configure reshape layer if we want to drop the dimensions
-    if(!_keep_dims)
+    if (!_keep_dims)
     {
         TensorShape out_shape = tmp_input->info()->tensor_shape();
 
         // We have to sort the reduction axis vectors in order for remove_dimension
         // to work properly
         std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
-        for(int i = 0; i < _reduction_ops; ++i)
+        for (int i = 0; i < _reduction_ops; ++i)
         {
             out_shape.remove_dimension(axis_local[i] - i, false);
         }
         auto_init_if_empty(*tmp_output->info(), tmp_input->info()->clone()->set_tensor_shape(out_shape));
         _reshape.configure(compile_context, &_reduced_outs[_reduction_ops - 1], tmp_output);
     }
-    if(_do_requant)
+    if (_do_requant)
     {
         _requant.configure(compile_context, &_output_no_quant, output);
         _input_no_quant.allocator()->allocate();
@@ -195,7 +218,10 @@
     }
 }
 
-Status CLReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status CLReduceMean::validate(const ITensorInfo *input,
+                              const Coordinates &reduction_axis,
+                              bool               keep_dims,
+                              const ITensorInfo *output)
 {
     return validate_config(input, reduction_axis, keep_dims, output);
 }
@@ -204,19 +230,19 @@
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    if(_do_requant)
+    if (_do_requant)
     {
         _dequant.run();
     }
-    for(auto &kernel : _reduction_kernels)
+    for (auto &kernel : _reduction_kernels)
     {
         kernel.run();
     }
-    if(!_keep_dims)
+    if (!_keep_dims)
     {
         _reshape.run();
     }
-    if(_do_requant)
+    if (_do_requant)
     {
         _requant.run();
     }
diff --git a/src/runtime/CL/functions/CLReductionOperation.cpp b/src/runtime/CL/functions/CLReductionOperation.cpp
index cdc7fec..ba54890 100644
--- a/src/runtime/CL/functions/CLReductionOperation.cpp
+++ b/src/runtime/CL/functions/CLReductionOperation.cpp
@@ -27,35 +27,43 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
+#include "src/common/utils/Log.h"
 #include "src/core/CL/kernels/CLReductionOperationKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/runtime/Utils.h"
 
-#include "src/common/utils/Log.h"
-
 namespace arm_compute
 {
 CLReductionOperation::CLReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _unreshaped_output(), _reduction_kernel(), _reshape(), _reduction_axis(), _is_reshape_required(false)
+    : _memory_group(std::move(memory_manager)),
+      _unreshaped_output(),
+      _reduction_kernel(),
+      _reshape(),
+      _reduction_axis(),
+      _is_reshape_required(false)
 {
 }
 
 CLReductionOperation::~CLReductionOperation() = default;
 
-Status CLReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+Status CLReductionOperation::validate(
+    const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+                                    "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
 
     const bool is_reshape_required = !keep_dims;
 
-    if(is_reshape_required && output->total_size() != 0)
+    if (is_reshape_required && output->total_size() != 0)
     {
-        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+            arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
     }
 
@@ -67,22 +75,23 @@
     const auto input_qinfo        = input->quantization_info();
     const auto output_data_type   = output->data_type();
 
-    auto initialize_tensorinfo = [](TensorInfo & ti, TensorShape shape, DataType data_type, int num_channels, QuantizationInfo qinfo)
-    {
+    auto initialize_tensorinfo = [](TensorInfo &ti, TensorShape shape, DataType data_type, int num_channels,
+                                    QuantizationInfo qinfo) {
         ti.set_data_type(data_type).set_tensor_shape(shape).set_num_channels(num_channels).set_quantization_info(qinfo);
     };
 
-    if(is_reshape_required)
+    if (is_reshape_required)
     {
         auto shape_before_reshape = input_shape;
         shape_before_reshape.set(axis, 1);
-        initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles, input_qinfo);
+        initialize_tensorinfo(output_before_reshape, shape_before_reshape, output_data_type, input_num_channles,
+                              input_qinfo);
         output_internal = &output_before_reshape;
     }
 
     ARM_COMPUTE_RETURN_ON_ERROR(CLReductionOperationKernel::validate(input, output_internal, axis, op));
 
-    if(is_reshape_required)
+    if (is_reshape_required)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(CLReshapeLayer::validate(output_internal, output));
     }
@@ -92,7 +101,7 @@
 
 ICLTensor *CLReductionOperation::configure_intermediate_result_vector(ICLTensor *input, ICLTensor *output)
 {
-    if(!_is_reshape_required)
+    if (!_is_reshape_required)
     {
         return output;
     }
@@ -103,12 +112,18 @@
     return &_unreshaped_output;
 }
 
-void CLReductionOperation::configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void CLReductionOperation::configure(
+    ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, output, axis, op, keep_dims);
 }
 
-void CLReductionOperation::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void CLReductionOperation::configure(const CLCompileContext &compile_context,
+                                     ICLTensor              *input,
+                                     ICLTensor              *output,
+                                     unsigned int            axis,
+                                     ReductionOperation      op,
+                                     bool                    keep_dims)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims);
@@ -117,11 +132,17 @@
 
     auto *output_internal = configure_intermediate_result_vector(input, output);
 
-    if(_is_reshape_required)
+    if (_is_reshape_required)
     {
-        const TensorShape output_shape     = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
-        const auto        output_data_type = input->info()->data_type();
-        auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape).set_data_type(output_data_type).reset_padding().set_is_resizable(true));
+        const TensorShape output_shape =
+            arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+        const auto output_data_type = input->info()->data_type();
+        auto_init_if_empty(*output->info(), input->info()
+                                                ->clone()
+                                                ->set_tensor_shape(output_shape)
+                                                .set_data_type(output_data_type)
+                                                .reset_padding()
+                                                .set_is_resizable(true));
 
         _memory_group.manage(&_unreshaped_output);
     }
@@ -129,7 +150,7 @@
     _reduction_kernel = std::make_unique<CLReductionOperationKernel>();
     _reduction_kernel->configure(compile_context, input, output_internal, axis, op);
 
-    if(_is_reshape_required)
+    if (_is_reshape_required)
     {
         _reshape.configure(compile_context, &_unreshaped_output, output);
         _unreshaped_output.allocator()->allocate();
@@ -142,7 +163,7 @@
 
     CLScheduler::get().enqueue(*_reduction_kernel, false);
 
-    if(_is_reshape_required)
+    if (_is_reshape_required)
     {
         _reshape.run();
     }
diff --git a/src/runtime/CL/functions/CLReorgLayer.cpp b/src/runtime/CL/functions/CLReorgLayer.cpp
index 15de959..156e9b9 100644
--- a/src/runtime/CL/functions/CLReorgLayer.cpp
+++ b/src/runtime/CL/functions/CLReorgLayer.cpp
@@ -27,9 +27,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/CL/kernels/CLReorgLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLReorgLayerKernel.h"
 
 #include <utility>
 
@@ -40,7 +40,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, stride);
 }
 
-void CLReorgLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, int32_t stride)
+void CLReorgLayer::configure(const CLCompileContext &compile_context,
+                             ICLTensor              *input,
+                             ICLTensor              *output,
+                             int32_t                 stride)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, stride);
     auto k = std::make_unique<CLReorgLayerKernel>();
diff --git a/src/runtime/CL/functions/CLReshapeLayer.cpp b/src/runtime/CL/functions/CLReshapeLayer.cpp
index c51a329..3d6349f 100644
--- a/src/runtime/CL/functions/CLReshapeLayer.cpp
+++ b/src/runtime/CL/functions/CLReshapeLayer.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClReshape.h"
 
@@ -35,17 +36,16 @@
 {
 struct CLReshapeLayer::Impl
 {
-    const ICLTensor                   *src{ nullptr };
-    ICLTensor                         *dst{ nullptr };
-    std::unique_ptr<opencl::ClReshape> op{ nullptr };
+    const ICLTensor                   *src{nullptr};
+    ICLTensor                         *dst{nullptr};
+    std::unique_ptr<opencl::ClReshape> op{nullptr};
 };
 
-CLReshapeLayer::CLReshapeLayer()
-    : _impl(std::make_unique<Impl>())
+CLReshapeLayer::CLReshapeLayer() : _impl(std::make_unique<Impl>())
 {
 }
 
-CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&) = default;
+CLReshapeLayer::CLReshapeLayer(CLReshapeLayer &&)            = default;
 CLReshapeLayer &CLReshapeLayer::operator=(CLReshapeLayer &&) = default;
 CLReshapeLayer::~CLReshapeLayer()                            = default;
 
@@ -78,4 +78,4 @@
     _impl->op->run(pack);
 }
 } // namespace arm_compute
-/** [CLReshapeLayer snippet] **/
\ No newline at end of file
+  /** [CLReshapeLayer snippet] **/
diff --git a/src/runtime/CL/functions/CLReverse.cpp b/src/runtime/CL/functions/CLReverse.cpp
index 1fc9357..415de52 100644
--- a/src/runtime/CL/functions/CLReverse.cpp
+++ b/src/runtime/CL/functions/CLReverse.cpp
@@ -24,9 +24,9 @@
 #include "arm_compute/runtime/CL/functions/CLReverse.h"
 
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLReverseKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLReverseKernel.h"
 
 namespace arm_compute
 {
@@ -35,7 +35,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, axis);
 }
 
-void CLReverse::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const ICLTensor *axis)
+void CLReverse::configure(const CLCompileContext &compile_context,
+                          const ICLTensor        *input,
+                          ICLTensor              *output,
+                          const ICLTensor        *axis)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, axis);
     auto k = std::make_unique<CLReverseKernel>();
diff --git a/src/runtime/CL/functions/CLScale.cpp b/src/runtime/CL/functions/CLScale.cpp
index 5b78989..abff072 100644
--- a/src/runtime/CL/functions/CLScale.cpp
+++ b/src/runtime/CL/functions/CLScale.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClScale.h"
 
@@ -33,13 +34,12 @@
 {
 struct CLScale::Impl
 {
-    const ICLTensor                 *src{ nullptr };
-    ICLTensor                       *dst{ nullptr };
-    std::unique_ptr<opencl::ClScale> op{ nullptr };
+    const ICLTensor                 *src{nullptr};
+    ICLTensor                       *dst{nullptr};
+    std::unique_ptr<opencl::ClScale> op{nullptr};
 };
 
-CLScale::CLScale()
-    : _impl(std::make_unique<Impl>())
+CLScale::CLScale() : _impl(std::make_unique<Impl>())
 {
 }
 CLScale::~CLScale() = default;
@@ -49,7 +49,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, info);
 }
 
-void CLScale::configure(const CLCompileContext &compile_context, ICLTensor *input, ICLTensor *output, const ScaleKernelInfo &info)
+void CLScale::configure(const CLCompileContext &compile_context,
+                        ICLTensor              *input,
+                        ICLTensor              *output,
+                        const ScaleKernelInfo  &info)
 {
     _impl->src = input;
     _impl->dst = output;
diff --git a/src/runtime/CL/functions/CLSelect.cpp b/src/runtime/CL/functions/CLSelect.cpp
index c4ab3dc..b4897d9 100644
--- a/src/runtime/CL/functions/CLSelect.cpp
+++ b/src/runtime/CL/functions/CLSelect.cpp
@@ -25,9 +25,9 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLSelectKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLSelectKernel.h"
 
 using namespace arm_compute;
 
@@ -38,7 +38,11 @@
     configure(CLKernelLibrary::get().get_compile_context(), c, x, y, output);
 }
 
-void CLSelect::configure(const CLCompileContext &compile_context, const ICLTensor *c, const ICLTensor *x, const ICLTensor *y, ICLTensor *output)
+void CLSelect::configure(const CLCompileContext &compile_context,
+                         const ICLTensor        *c,
+                         const ICLTensor        *x,
+                         const ICLTensor        *y,
+                         ICLTensor              *output)
 {
     ARM_COMPUTE_LOG_PARAMS(c, x, y, output);
     auto k = std::make_unique<CLSelectKernel>();
diff --git a/src/runtime/CL/functions/CLSlice.cpp b/src/runtime/CL/functions/CLSlice.cpp
index 7e3ac7d..f79c6a1 100644
--- a/src/runtime/CL/functions/CLSlice.cpp
+++ b/src/runtime/CL/functions/CLSlice.cpp
@@ -26,15 +26,19 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
-#include "src/core/CL/kernels/CLStridedSliceKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLStridedSliceKernel.h"
 
 namespace arm_compute
 {
 namespace experimental
 {
-void CLSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+void CLSlice::configure(const CLCompileContext &compile_context,
+                        const ITensorInfo      *input,
+                        ITensorInfo            *output,
+                        const Coordinates      &starts,
+                        const Coordinates      &ends)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends);
@@ -47,15 +51,16 @@
     _kernel = std::move(k);
 }
 
-Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status CLSlice::validate(const ITensorInfo *input,
+                         const ITensorInfo *output,
+                         const Coordinates &starts,
+                         const Coordinates &ends)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
 
     // Check start dimensions for being non-negative
-    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
-    {
-        return i < 0;
-    }));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; }));
 
     // Get absolute end coordinates
     const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
@@ -66,20 +71,22 @@
 
 struct CLSlice::Impl
 {
-    const ICLTensor                       *src{ nullptr };
-    ICLTensor                             *dst{ nullptr };
-    std::unique_ptr<experimental::CLSlice> op{ nullptr };
+    const ICLTensor                       *src{nullptr};
+    ICLTensor                             *dst{nullptr};
+    std::unique_ptr<experimental::CLSlice> op{nullptr};
 };
 
-CLSlice::CLSlice()
-    : _impl(std::make_unique<Impl>())
+CLSlice::CLSlice() : _impl(std::make_unique<Impl>())
 {
 }
-CLSlice::CLSlice(CLSlice &&) = default;
+CLSlice::CLSlice(CLSlice &&)            = default;
 CLSlice &CLSlice::operator=(CLSlice &&) = default;
 CLSlice::~CLSlice()                     = default;
 
-Status CLSlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status CLSlice::validate(const ITensorInfo *input,
+                         const ITensorInfo *output,
+                         const Coordinates &starts,
+                         const Coordinates &ends)
 {
     return experimental::CLSlice::validate(input, output, starts, ends);
 }
@@ -89,7 +96,11 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends);
 }
 
-void CLSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
+void CLSlice::configure(const CLCompileContext &compile_context,
+                        const ICLTensor        *input,
+                        ICLTensor              *output,
+                        const Coordinates      &starts,
+                        const Coordinates      &ends)
 {
     _impl->src = input;
     _impl->dst = output;
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index d52352f..2e70e2a 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -22,12 +22,14 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/CL/functions/CLSoftmaxLayer.h"
+
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Utils.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
 #include "src/gpu/cl/operators/ClPermute.h"
@@ -40,9 +42,9 @@
 template <bool IS_LOG>
 struct CLSoftmaxLayerGeneric<IS_LOG>::Impl
 {
-    const ICLTensor              *src{ nullptr };
-    ICLTensor                    *dst{ nullptr };
-    std::unique_ptr<OperatorType> op{ nullptr };
+    const ICLTensor              *src{nullptr};
+    ICLTensor                    *dst{nullptr};
+    std::unique_ptr<OperatorType> op{nullptr};
     MemoryGroup                   memory_group{};
     ITensorPack                   run_pack{};
     WorkspaceData<CLTensor>       workspace_tensors{};
@@ -65,28 +67,30 @@
 }
 
 template <bool IS_LOG>
-void CLSoftmaxLayerGeneric<IS_LOG>::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis)
+void CLSoftmaxLayerGeneric<IS_LOG>::configure(
+    const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, float beta, int32_t axis)
 {
     _impl->src = input;
     _impl->dst = output;
     _impl->op  = std::make_unique<OperatorType>();
 
-    SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->info()->data_type(), axis };
+    SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->info()->data_type(), axis};
     _impl->op->configure(compile_context, *input->info(), *output->info(), softmax_info);
 
-    _impl->run_pack          = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } };
+    _impl->run_pack          = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}};
     _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
 }
 
 template <bool IS_LOG>
-Status CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
+Status
+CLSoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
 {
-    SoftmaxKernelInfo softmax_info{ beta, IS_LOG, input->data_type(), axis };
+    SoftmaxKernelInfo softmax_info{beta, IS_LOG, input->data_type(), axis};
     return OperatorType::validate(*input, *output, softmax_info);
 }
 
 template <bool IS_LOG>
-void           CLSoftmaxLayerGeneric<IS_LOG>::run()
+void CLSoftmaxLayerGeneric<IS_LOG>::run()
 {
     // Acquire all the temporaries
     MemoryGroupResourceScope scope_mg(_impl->memory_group);
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index 3b70834..37f7288 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -29,71 +29,100 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
 
 namespace arm_compute
 {
 CLSpaceToBatchLayer::CLSpaceToBatchLayer()
-    : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()),
-      _fill(),
-      _has_padding(false)
+    : _space_to_batch_kernel(std::make_unique<CLSpaceToBatchLayerKernel>()), _fill(), _has_padding(false)
 {
 }
 
 CLSpaceToBatchLayer::~CLSpaceToBatchLayer() = default;
 
-void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const ICLTensor *input,
+                                    const ICLTensor *block_shape,
+                                    const ICLTensor *paddings,
+                                    ICLTensor       *output)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, block_shape, paddings, output);
 }
 
-void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    const ICLTensor        *block_shape,
+                                    const ICLTensor        *paddings,
+                                    ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
     ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output);
 
-    if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+    if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
-        _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _fill.configure(compile_context, output,
+                        PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
     _space_to_batch_kernel->configure(compile_context, input, block_shape, paddings, output);
 }
 
-void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const ICLTensor *input,
+                                    const int        block_shape_x,
+                                    const int        block_shape_y,
+                                    const Size2D    &padding_left,
+                                    const Size2D    &padding_right,
+                                    ICLTensor       *output)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    configure(CLKernelLibrary::get().get_compile_context(), input, block_shape_x, block_shape_y, padding_left,
+              padding_right, output);
 }
 
-void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left,
-                                    const Size2D &padding_right, ICLTensor *output)
+void CLSpaceToBatchLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    const int               block_shape_x,
+                                    const int               block_shape_y,
+                                    const Size2D           &padding_left,
+                                    const Size2D           &padding_right,
+                                    ICLTensor              *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
 
-    if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+    if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
-        _fill.configure(compile_context, output, PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
+        _fill.configure(compile_context, output,
+                        PixelValue(0, input->info()->data_type(), input->info()->quantization_info()));
     }
-    _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    _space_to_batch_kernel->configure(compile_context, input, block_shape_x, block_shape_y, padding_left, padding_right,
+                                      output);
 }
 
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+                                     const ITensorInfo *block_shape,
+                                     const ITensorInfo *paddings,
+                                     const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
     ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
 
     return Status{};
 }
 
-Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status CLSpaceToBatchLayer::validate(const ITensorInfo *input,
+                                     const int          block_shape_x,
+                                     const int          block_shape_y,
+                                     const Size2D      &padding_left,
+                                     const Size2D      &padding_right,
                                      const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
-    ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLFill::validate(output, PixelValue(0, input->data_type(), input->quantization_info())));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
 
     return Status{};
 }
@@ -101,7 +130,7 @@
 void CLSpaceToBatchLayer::run()
 {
     // Zero out the output only if we have padding
-    if(_has_padding)
+    if (_has_padding)
     {
         //CLScheduler::get().enqueue(*_fill, true);
         _fill.run();
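
A note on the declaration style visible throughout this file: the revised .clang-format is not part of this delivery, so the exact options are assumptions, but the output is consistent with a 120-column limit, parameters that are no longer bin-packed (one per line once a declaration overflows), and parameter names aligned into a column. A minimal compilable sketch, with hypothetical stand-in types:

    struct Tensor;         // hypothetical stand-ins, not ACL types
    struct CompileContext;

    class Example
    {
    public:
        // Fits on one line, so it stays on one line:
        void configure(const Tensor *input, Tensor *output);
        // When a declaration would overflow the column limit, it is laid out
        // one parameter per line (shown here at reduced width), with the
        // types and the '*'/'&'-qualified names aligned:
        void configure(const CompileContext &compile_context,
                       const Tensor         *input,
                       Tensor               *output);
    };

    int main()
    {
        return 0;
    }
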
diff --git a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
index 67dafff..22695c9 100644
--- a/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToDepthLayer.cpp
@@ -29,14 +29,13 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLSpaceToDepthLayerKernel.h"
 
 namespace arm_compute
 {
-CLSpaceToDepthLayer::CLSpaceToDepthLayer()
-    : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>())
+CLSpaceToDepthLayer::CLSpaceToDepthLayer() : _space_to_depth_kernel(std::make_unique<CLSpaceToDepthLayerKernel>())
 {
 }
 
@@ -47,7 +46,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, block_shape);
 }
 
-void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, int32_t block_shape)
+void CLSpaceToDepthLayer::configure(const CLCompileContext &compile_context,
+                                    const ICLTensor        *input,
+                                    ICLTensor              *output,
+                                    int32_t                 block_shape)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, block_shape);
     _space_to_depth_kernel->configure(compile_context, input, output, block_shape);
diff --git a/src/runtime/CL/functions/CLSplit.cpp b/src/runtime/CL/functions/CLSplit.cpp
index 0b27371..6be43cc 100644
--- a/src/runtime/CL/functions/CLSplit.cpp
+++ b/src/runtime/CL/functions/CLSplit.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
@@ -38,7 +39,7 @@
 {
     cl::CommandQueue q = CLScheduler::get().queue();
 
-    for(unsigned i = 0; i < _num_outputs; ++i)
+    for (unsigned i = 0; i < _num_outputs; ++i)
     {
         _slice_functions[i].run();
     }
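
The bulk of the mechanical churn in these hunks is a single rule: a space after control-flow keywords. This is consistent with clang-format's default SpaceBeforeParens: ControlStatements (an assumption, since the configuration file is not included), under which if, for, while and switch gain a space while function calls do not:

    #include <cstdio>

    int main()
    {
        for (unsigned i = 0; i < 3; ++i) // was: for(unsigned i = 0; ...)
        {
            if (i % 2 == 0)              // was: if(i % 2 == 0)
            {
                std::printf("%u\n", i);  // calls are unchanged: no space
            }
        }
        return 0;
    }
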
diff --git a/src/runtime/CL/functions/CLStackLayer.cpp b/src/runtime/CL/functions/CLStackLayer.cpp
index 6a335da..c15496f 100644
--- a/src/runtime/CL/functions/CLStackLayer.cpp
+++ b/src/runtime/CL/functions/CLStackLayer.cpp
@@ -21,8 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include <complex>
-
 #include "arm_compute/runtime/CL/functions/CLStackLayer.h"
 
 #include "arm_compute/core/CL/ICLTensor.h"
@@ -32,16 +30,16 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "src/core/CL/kernels/CLStackLayerKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLStackLayerKernel.h"
+
+#include <complex>
 
 namespace arm_compute
 {
 CLStackLayer::CLStackLayer() // NOLINT
-    : _input(),
-      _stack_kernels(),
-      _num_inputs(0)
+    : _input(), _stack_kernels(), _num_inputs(0)
 {
 }
 
@@ -52,7 +50,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, axis, output);
 }
 
-void CLStackLayer::configure(const CLCompileContext &compile_context, const std::vector<ICLTensor *> &input, int axis, ICLTensor *output)
+void CLStackLayer::configure(const CLCompileContext         &compile_context,
+                             const std::vector<ICLTensor *> &input,
+                             int                             axis,
+                             ICLTensor                      *output)
 {
     ARM_COMPUTE_LOG_PARAMS(input, axis, output);
     _num_inputs = input.size();
@@ -61,7 +62,7 @@
     // Wrap around negative values
     const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
 
-    for(unsigned int i = 0; i < _num_inputs; i++)
+    for (unsigned int i = 0; i < _num_inputs; i++)
     {
         _stack_kernels.emplace_back(std::make_unique<CLStackLayerKernel>());
         _stack_kernels.back()->configure(compile_context, input[i], axis_u, i, _num_inputs, output);
@@ -79,7 +80,7 @@
 
     const unsigned int num_inputs = input.size();
 
-    for(unsigned int i = 0; i < num_inputs; i++)
+    for (unsigned int i = 0; i < num_inputs; i++)
     {
         // All the tensors must have the same rank
         ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
@@ -92,7 +93,7 @@
 
 void CLStackLayer::run()
 {
-    for(unsigned i = 0; i < _num_inputs; i++)
+    for (unsigned i = 0; i < _num_inputs; i++)
     {
         CLScheduler::get().enqueue(*_stack_kernels[i], false);
     }
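
Include blocks are regrouped as well. The observable ordering is: the file's own header first, then public arm_compute/ headers, then internal src/ headers as a separate alphabetized block (which is why Log.h now precedes the kernel headers, "common" sorting before "core"), then C++ standard headers such as <complex> last. The exact SortIncludes/IncludeCategories settings are assumptions; the reconstructed ordering is sketched below as comments, since the project headers only exist inside the repository:

    //   #include "arm_compute/runtime/CL/functions/CLStackLayer.h"  // own header
    //
    //   #include "arm_compute/core/CL/ICLTensor.h"                  // public API block
    //   ...
    //
    //   #include "src/common/utils/Log.h"                           // internal block,
    //   #include "src/core/CL/kernels/CLStackLayerKernel.h"         // alphabetized
    //
    //   #include <complex>                                          // system headers last
    int main()
    {
        return 0;
    }
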
diff --git a/src/runtime/CL/functions/CLStridedSlice.cpp b/src/runtime/CL/functions/CLStridedSlice.cpp
index 261bdc1..c1953cc 100644
--- a/src/runtime/CL/functions/CLStridedSlice.cpp
+++ b/src/runtime/CL/functions/CLStridedSlice.cpp
@@ -25,17 +25,23 @@
 
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
-#include "src/core/CL/kernels/CLStridedSliceKernel.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLStridedSliceKernel.h"
 
 namespace arm_compute
 {
 namespace experimental
 {
-void CLStridedSlice::configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output,
-                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const CLCompileContext &compile_context,
+                               const ITensorInfo      *input,
+                               ITensorInfo            *output,
+                               const Coordinates      &starts,
+                               const Coordinates      &ends,
+                               const BiStrides        &strides,
+                               int32_t                 begin_mask,
+                               int32_t                 end_mask,
+                               int32_t                 shrink_axis_mask)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
     auto k = std::make_unique<CLStridedSliceKernel>();
@@ -43,9 +49,14 @@
     _kernel = std::move(k);
 }
 
-Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSlice::validate(const ITensorInfo *input,
+                                const ITensorInfo *output,
+                                const Coordinates &starts,
+                                const Coordinates &ends,
+                                const BiStrides   &strides,
+                                int32_t            begin_mask,
+                                int32_t            end_mask,
+                                int32_t            shrink_axis_mask)
 {
     return CLStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
 }
@@ -53,32 +64,43 @@
 
 struct CLStridedSlice::Impl
 {
-    const ICLTensor                              *src{ nullptr };
-    ICLTensor                                    *dst{ nullptr };
-    CLRuntimeContext                             *ctx{ nullptr };
-    std::unique_ptr<experimental::CLStridedSlice> op{ nullptr };
+    const ICLTensor                              *src{nullptr};
+    ICLTensor                                    *dst{nullptr};
+    CLRuntimeContext                             *ctx{nullptr};
+    std::unique_ptr<experimental::CLStridedSlice> op{nullptr};
 };
 
-CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx)
-    : _impl(std::make_unique<Impl>())
+CLStridedSlice::CLStridedSlice(CLRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
 {
     _impl->ctx = ctx;
 }
 
-CLStridedSlice::CLStridedSlice(CLStridedSlice &&) = default;
+CLStridedSlice::CLStridedSlice(CLStridedSlice &&)            = default;
 CLStridedSlice &CLStridedSlice::operator=(CLStridedSlice &&) = default;
 CLStridedSlice::~CLStridedSlice()                            = default;
 
-void CLStridedSlice::configure(const ICLTensor *input, ICLTensor *output,
-                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const ICLTensor   *input,
+                               ICLTensor         *output,
+                               const Coordinates &starts,
+                               const Coordinates &ends,
+                               const BiStrides   &strides,
+                               int32_t            begin_mask,
+                               int32_t            end_mask,
+                               int32_t            shrink_axis_mask)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    configure(CLKernelLibrary::get().get_compile_context(), input, output, starts, ends, strides, begin_mask, end_mask,
+              shrink_axis_mask);
 }
 
-void CLStridedSlice::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output,
-                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void CLStridedSlice::configure(const CLCompileContext &compile_context,
+                               const ICLTensor        *input,
+                               ICLTensor              *output,
+                               const Coordinates      &starts,
+                               const Coordinates      &ends,
+                               const BiStrides        &strides,
+                               int32_t                 begin_mask,
+                               int32_t                 end_mask,
+                               int32_t                 shrink_axis_mask)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
 
@@ -86,14 +108,21 @@
     _impl->dst = output;
 
     _impl->op = std::make_unique<experimental::CLStridedSlice>();
-    _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    _impl->op->configure(compile_context, _impl->src->info(), _impl->dst->info(), starts, ends, strides, begin_mask,
+                         end_mask, shrink_axis_mask);
 }
 
-Status CLStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status CLStridedSlice::validate(const ITensorInfo *input,
+                                const ITensorInfo *output,
+                                const Coordinates &starts,
+                                const Coordinates &ends,
+                                const BiStrides   &strides,
+                                int32_t            begin_mask,
+                                int32_t            end_mask,
+                                int32_t            shrink_axis_mask)
 {
-    return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    return experimental::CLStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask,
+                                                  shrink_axis_mask);
 }
 
 void CLStridedSlice::run()
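
Two more rules are visible in this file: braced member initializers lose their inner spaces ({ nullptr } becomes {nullptr}), and consecutive '=' signs, including '= default' on special members, are aligned. Presumably Cpp11BracedListStyle and AlignConsecutiveAssignments; both are assumptions. A runnable toy:

    #include <cstdio>
    #include <memory>

    struct Impl
    {
        const char           *src{nullptr}; // was: src{ nullptr }
        int                   id{0};
        std::unique_ptr<Impl> next{nullptr};
    };

    struct Wrapper
    {
        Wrapper()                      = default; // '=' aligned across the
        Wrapper(Wrapper &&)            = default; // consecutive defaulted
        Wrapper &operator=(Wrapper &&) = default; // special members
    };

    int main()
    {
        Impl i{};
        std::printf("%d\n", i.id);
        return 0;
    }
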
diff --git a/src/runtime/CL/functions/CLTile.cpp b/src/runtime/CL/functions/CLTile.cpp
index ef79099..4f86c4a 100644
--- a/src/runtime/CL/functions/CLTile.cpp
+++ b/src/runtime/CL/functions/CLTile.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/CL/functions/CLTile.h"
 
-#include "src/core/CL/kernels/CLTileKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/CL/kernels/CLTileKernel.h"
 
 namespace arm_compute
 {
@@ -34,7 +33,10 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output, multiples);
 }
 
-void CLTile::configure(const CLCompileContext &compile_context, const ICLTensor *input, ICLTensor *output, const Multiples &multiples)
+void CLTile::configure(const CLCompileContext &compile_context,
+                       const ICLTensor        *input,
+                       ICLTensor              *output,
+                       const Multiples        &multiples)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, multiples);
     auto k = std::make_unique<CLTileKernel>();
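
CLTile also shows the configure-then-transfer idiom used by these thin function wrappers: the kernel is created with std::make_unique, configured while held locally, and only then moved into the member that owns it. A toy version with stand-in types (not the ACL classes):

    #include <memory>
    #include <utility>

    struct Kernel
    {
        void configure(int multiples) { _multiples = multiples; }
        int  _multiples{0};
    };

    struct Function
    {
        void configure(int multiples)
        {
            auto k = std::make_unique<Kernel>(); // build and configure locally...
            k->configure(multiples);
            _kernel = std::move(k);              // ...then hand over ownership
        }
        std::unique_ptr<Kernel> _kernel;
    };

    int main()
    {
        Function f;
        f.configure(4);
        return static_cast<int>(f._kernel == nullptr); // 0 on success
    }
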
diff --git a/src/runtime/CL/functions/CLTranspose.cpp b/src/runtime/CL/functions/CLTranspose.cpp
index e63c92e..5a738f4 100644
--- a/src/runtime/CL/functions/CLTranspose.cpp
+++ b/src/runtime/CL/functions/CLTranspose.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/gpu/cl/operators/ClTranspose.h"
 
@@ -34,12 +35,11 @@
 {
 struct CLTranspose::Impl
 {
-    const ICLTensor                     *src{ nullptr };
-    ICLTensor                           *dst{ nullptr };
-    std::unique_ptr<opencl::ClTranspose> op{ nullptr };
+    const ICLTensor                     *src{nullptr};
+    ICLTensor                           *dst{nullptr};
+    std::unique_ptr<opencl::ClTranspose> op{nullptr};
 };
-CLTranspose::CLTranspose()
-    : _impl(std::make_unique<Impl>())
+CLTranspose::CLTranspose() : _impl(std::make_unique<Impl>())
 {
 }
 CLTranspose::~CLTranspose() = default;
@@ -70,4 +70,4 @@
     pack.add_tensor(TensorType::ACL_DST, _impl->dst);
     _impl->op->run(pack);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/functions/CLUnstack.cpp b/src/runtime/CL/functions/CLUnstack.cpp
index 98d4781..ddd83e7 100644
--- a/src/runtime/CL/functions/CLUnstack.cpp
+++ b/src/runtime/CL/functions/CLUnstack.cpp
@@ -40,13 +40,15 @@
     return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
 }
 
-inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+inline void setup_slice_coordinates_and_mask(Coordinates       &slice_start,
+                                             int32_t           &slice_end_mask,
+                                             const unsigned int input_num_dimensions)
 {
     // Sets up coordinates to slice the input tensor: start coordinates are all 0s, and the unstacking axis of both start/end is set to slice just one 2D tensor at a time.
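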
     Coordinates slice_end;
     slice_start.set_num_dimensions(input_num_dimensions);
     slice_end.set_num_dimensions(input_num_dimensions);
-    for(size_t k = 0; k < input_num_dimensions; ++k)
+    for (size_t k = 0; k < input_num_dimensions; ++k)
     {
         slice_start.set(k, 0);
         slice_end.set(k, -1);
@@ -56,8 +58,7 @@
 } // namespace
 
 CLUnstack::CLUnstack() // NOLINT
-    : _num_slices(0),
-      _strided_slice_vector()
+    : _num_slices(0), _strided_slice_vector()
 {
 }
 
@@ -66,15 +67,19 @@
     configure(CLKernelLibrary::get().get_compile_context(), input, output_vector, axis);
 }
 
-void CLUnstack::configure(const CLCompileContext &compile_context, const ICLTensor *input, const std::vector<ICLTensor *> &output_vector, int axis)
+void CLUnstack::configure(const CLCompileContext         &compile_context,
+                          const ICLTensor                *input,
+                          const std::vector<ICLTensor *> &output_vector,
+                          int                             axis)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output_vector, axis);
     std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
-    std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ICLTensor * t)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(t);
-        return t->info();
-    });
+    std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(),
+                   [](ICLTensor *t)
+                   {
+                       ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+                       return t->info();
+                   });
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_ERROR_THROW_ON(CLUnstack::validate(input->info(), outputs_vector_info, axis));
@@ -87,11 +92,12 @@
     Coordinates slice_start;
     int32_t     slice_end_mask;
     setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
-    for(unsigned int slice = 0; slice < _num_slices; ++slice)
+    for (unsigned int slice = 0; slice < _num_slices; ++slice)
     {
         // Adjusts start and end coordinates to take a 2D slice at a time
         slice_start.set(axis_u, slice);
-        _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+        _strided_slice_vector[slice].configure(compile_context, input, output_vector[slice], slice_start, Coordinates(),
+                                               BiStrides(), 0, slice_end_mask, (1 << axis_u));
     }
 }
 
@@ -106,18 +112,20 @@
     ARM_COMPUTE_RETURN_ERROR_ON(num_slices > output_vector.size());
     Coordinates slice_start;
     int32_t     slice_end_mask;
-    for(size_t k = 0; k < num_slices; ++k)
+    for (size_t k = 0; k < num_slices; ++k)
     {
         slice_start.set(wrap_axis(axis, input), k);
         setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
-        ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+        ARM_COMPUTE_RETURN_ON_ERROR(CLStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(),
+                                                             BiStrides(), 0, slice_end_mask,
+                                                             (1 << wrap_axis(axis, input))));
     }
     return Status{};
 }
 
 void CLUnstack::run()
 {
-    for(unsigned i = 0; i < _num_slices; ++i)
+    for (unsigned i = 0; i < _num_slices; ++i)
     {
         _strided_slice_vector[i].run();
     }
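
The lambda in CLUnstack::configure shows how the formatter treats callables once the enclosing call no longer fits: the lambda introducer moves to its own line under the argument column and the body is indented from there, consistent with the default LambdaBodyIndentation: Signature (again an assumption). A small runnable analogue:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    int main()
    {
        std::vector<int> in{1, 2, 3}, out(3);
        std::transform(in.begin(), in.end(), out.begin(),
                       [](int v)
                       {
                           // body indented relative to the lambda introducer
                           return v * v;
                       });
        std::printf("%d\n", out.back()); // prints 9
        return 0;
    }
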
diff --git a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
index b416d0f..645f817 100644
--- a/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLWinogradConvolutionLayer.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/KernelDescriptors.h"
+
 #include "src/core/CL/ICLKernel.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/gpu/cl/operators/ClWinogradConv2d.h"
@@ -35,15 +36,15 @@
 {
 struct CLWinogradConvolutionLayer::Impl
 {
-    const ICLTensor                          *src{ nullptr };
-    const ICLTensor                          *weights{ nullptr };
-    const ICLTensor                          *biases{ nullptr };
-    ICLTensor                                *dst{ nullptr };
-    std::unique_ptr<opencl::ClWinogradConv2d> op{ nullptr };
+    const ICLTensor                          *src{nullptr};
+    const ICLTensor                          *weights{nullptr};
+    const ICLTensor                          *biases{nullptr};
+    ICLTensor                                *dst{nullptr};
+    std::unique_ptr<opencl::ClWinogradConv2d> op{nullptr};
     ITensorPack                               run_pack{};
     MemoryGroup                               memory_group{};
     WorkspaceData<CLTensor>                   workspace_tensors{};
-    bool                                      is_prepared{ false };
+    bool                                      is_prepared{false};
 };
 
 CLWinogradConvolutionLayer::CLWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -54,15 +55,26 @@
 
 CLWinogradConvolutionLayer::~CLWinogradConvolutionLayer() = default;
 
-void CLWinogradConvolutionLayer::configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
-                                           bool enable_fast_math)
+void CLWinogradConvolutionLayer::configure(ICLTensor                 *input,
+                                           const ICLTensor           *weights,
+                                           const ICLTensor           *biases,
+                                           ICLTensor                 *output,
+                                           const PadStrideInfo       &conv_info,
+                                           const ActivationLayerInfo &act_info,
+                                           bool                       enable_fast_math)
 {
-    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info, enable_fast_math);
+    configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info, act_info,
+              enable_fast_math);
 }
 
-void CLWinogradConvolutionLayer::configure(const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
+void CLWinogradConvolutionLayer::configure(const CLCompileContext    &compile_context,
+                                           ICLTensor                 *input,
+                                           const ICLTensor           *weights,
+                                           const ICLTensor           *biases,
+                                           ICLTensor                 *output,
                                            const PadStrideInfo       &conv_info,
-                                           const ActivationLayerInfo &act_info, bool enable_fast_math)
+                                           const ActivationLayerInfo &act_info,
+                                           bool                       enable_fast_math)
 {
     _impl->src     = input;
     _impl->weights = weights;
@@ -70,20 +82,25 @@
     _impl->dst     = output;
 
     _impl->op = std::make_unique<opencl::ClWinogradConv2d>();
-    _impl->op->configure(compile_context, input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info, enable_fast_math);
+    _impl->op->configure(compile_context, input->info(), weights->info(),
+                         (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, act_info,
+                         enable_fast_math);
 
-    _impl->run_pack =
-    {
-        { TensorType::ACL_SRC_0, _impl->src },
-        { TensorType::ACL_SRC_1, _impl->weights },
-        { TensorType::ACL_SRC_2, _impl->biases },
-        { TensorType::ACL_DST, _impl->dst }
-    };
-    _impl->workspace_tensors = manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
+    _impl->run_pack = {{TensorType::ACL_SRC_0, _impl->src},
+                       {TensorType::ACL_SRC_1, _impl->weights},
+                       {TensorType::ACL_SRC_2, _impl->biases},
+                       {TensorType::ACL_DST, _impl->dst}};
+    _impl->workspace_tensors =
+        manage_workspace<CLTensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->run_pack);
 }
 
-Status CLWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                            const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status CLWinogradConvolutionLayer::validate(const ITensorInfo         *input,
+                                            const ITensorInfo         *weights,
+                                            const ITensorInfo         *biases,
+                                            const ITensorInfo         *output,
+                                            const PadStrideInfo       &conv_info,
+                                            const ActivationLayerInfo &act_info,
+                                            bool                       enable_fast_math)
 {
     return opencl::ClWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math);
 }
@@ -97,7 +114,7 @@
 
 void CLWinogradConvolutionLayer::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->run_pack);
 
@@ -107,4 +124,4 @@
         _impl->is_prepared = true;
     }
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
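
Beyond formatting, the prepare() hunk above is worth a note because the pattern recurs across these function files: expensive one-time work (weight transforms, workspace release) is guarded by an is_prepared flag so that it runs on first use only and later calls short-circuit. A minimal sketch with hypothetical names, not the ACL API:

    #include <cstdio>

    class Function
    {
    public:
        void prepare()
        {
            if (!_is_prepared)
            {
                std::puts("transforming weights once"); // stands in for op->prepare(run_pack)
                _is_prepared = true;
            }
        }
        void run()
        {
            prepare(); // safe to call on every run(); the work happens once
            std::puts("enqueue kernels");
        }

    private:
        bool _is_prepared{false};
    };

    int main()
    {
        Function f;
        f.run();
        f.run(); // "transforming weights once" is not printed again
        return 0;
    }
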
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
index 18ade97..4270165 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 
 #include <map>
@@ -34,8 +35,7 @@
 {
 namespace cl_gemm
 {
-CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu)
-    : ICLGEMMKernelSelection(gpu)
+CLGEMMDefaultTypeBifrost::CLGEMMDefaultTypeBifrost(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
 {
 }
 
@@ -44,109 +44,109 @@
     // _target could be used in the future to have a dedicated heuristic for each GPU IP
     ARM_COMPUTE_UNUSED(_target);
 
-    using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeBifrost::*)(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
 
     // Default configurations for Bifrost architectures
-    static std::map<DataType, FunctionExecutorPtr> gemm_default_configs =
-    {
-        { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 },
-        { DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16 },
-        { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
-        { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
-        { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
-        { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
-    };
+    static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32},
+        {DataType::F16, &CLGEMMDefaultTypeBifrost::default_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
 
     // Mali-G71 configurations
-    static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs =
-    {
-        { DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32 },
-        { DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16 },
-        { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
-        { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
-        { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
-        { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
-    };
+    static std::map<DataType, FunctionExecutorPtr> gemm_g71_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeBifrost::default_f32},
+        {DataType::F16, &CLGEMMDefaultTypeBifrost::g71_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
 
     // Mali-G52 configurations
-    static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs =
-    {
-        { DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32 },
-        { DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16 },
-        { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
-        { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
-        { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
-        { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
-    };
+    static std::map<DataType, FunctionExecutorPtr> gemm_g52_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeBifrost::g52_f32},
+        {DataType::F16, &CLGEMMDefaultTypeBifrost::g52_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
 
     // Mali-G76 configurations
-    static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs =
-    {
-        { DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32 },
-        { DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16 },
-        { DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
-        { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8 },
-        { DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8 },
-        { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8 }
-    };
+    static std::map<DataType, FunctionExecutorPtr> gemm_g76_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeBifrost::g76_f32},
+        {DataType::F16, &CLGEMMDefaultTypeBifrost::g76_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeBifrost::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeBifrost::default_q8}};
 
     const DataType data_type = params.data_type;
 
-    switch(_target)
+    switch (_target)
     {
         case GPUTarget::G71:
-            if(gemm_g71_configs.find(data_type) != gemm_g71_configs.end())
+            if (gemm_g71_configs.find(data_type) != gemm_g71_configs.end())
             {
-                return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+                return (this->*gemm_g71_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                            params.is_rhs_constant);
             }
             ARM_COMPUTE_ERROR("Not supported data type");
         case GPUTarget::G76:
-            if(gemm_g76_configs.find(data_type) != gemm_g76_configs.end())
+            if (gemm_g76_configs.find(data_type) != gemm_g76_configs.end())
             {
-                return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+                return (this->*gemm_g76_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                            params.is_rhs_constant);
             }
             ARM_COMPUTE_ERROR("Not supported data type");
         case GPUTarget::G52:
-            if(gemm_g52_configs.find(data_type) != gemm_g52_configs.end())
+            if (gemm_g52_configs.find(data_type) != gemm_g52_configs.end())
             {
-                return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+                return (this->*gemm_g52_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                            params.is_rhs_constant);
             }
             ARM_COMPUTE_ERROR("Not supported data type");
         default:
-            if(gemm_default_configs.find(data_type) != gemm_default_configs.end())
+            if (gemm_default_configs.find(data_type) != gemm_default_configs.end())
             {
-                return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+                return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                                params.is_rhs_constant);
             }
             ARM_COMPUTE_ERROR("Not supported data type");
     }
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(b);
 
     CLGEMMKernelType gemm_type = CLGEMMKernelType::NATIVE;
 
-    if(is_rhs_constant)
+    if (is_rhs_constant)
     {
-        if((m > 1) && (n < 16))
+        if ((m > 1) && (n < 16))
         {
             gemm_type = CLGEMMKernelType::RESHAPED;
         }
-        else if(m == 1)
+        else if (m == 1)
         {
             gemm_type = CLGEMMKernelType::RESHAPED_ONLY_RHS;
         }
         else
         {
-            if((k > 256) && (m > 4))
+            if ((k > 256) && (m > 4))
             {
                 constexpr float alpha = 3.2f;
                 constexpr float fact0 = 1.51f;
                 constexpr float fact1 = 1.66f;
                 constexpr float ops   = 12.0f;
                 const float     scale = k > 1024 ? 1.07f : 1.0f;
-                gemm_type             = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops)) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::RESHAPED_ONLY_RHS;
+                gemm_type             = (alpha + ((n * fact0) / ops) < ((fact1 * n * scale) / ops))
+                                            ? CLGEMMKernelType::RESHAPED
+                                            : CLGEMMKernelType::RESHAPED_ONLY_RHS;
             }
             else
             {
@@ -156,19 +156,21 @@
 
         const auto workload = static_cast<float>((m * n) / 20.0f);
 
-        gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED : gemm_type;
+        gemm_type = ((workload > 1600.0f) && (gemm_type == CLGEMMKernelType::RESHAPED)) ? CLGEMMKernelType::RESHAPED
+                                                                                        : gemm_type;
     }
 
     return gemm_type;
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(n, k, b);
 
-    if(is_rhs_constant)
+    if (is_rhs_constant)
     {
-        if(m == 1)
+        if (m == 1)
         {
             return CLGEMMKernelType::RESHAPED_ONLY_RHS;
         }
@@ -183,11 +185,12 @@
     }
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeBifrost::default_q8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(m, n, k, b);
 
-    if(is_rhs_constant)
+    if (is_rhs_constant)
     {
         return CLGEMMKernelType::RESHAPED_ONLY_RHS;
     }
@@ -197,21 +200,22 @@
     }
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(b);
 
-    if(!is_rhs_constant)
+    if (!is_rhs_constant)
     {
         return CLGEMMKernelType::NATIVE;
     }
-    if(m == 1)
+    if (m == 1)
     {
         return CLGEMMKernelType::RESHAPED_ONLY_RHS;
     }
-    if(k <= 496)
+    if (k <= 496)
     {
-        if(n <= 544)
+        if (n <= 544)
         {
             return CLGEMMKernelType::RESHAPED_ONLY_RHS;
         }
@@ -222,17 +226,17 @@
     }
     else
     {
-        if(k <= 588)
+        if (k <= 588)
         {
-            if(k <= 552)
+            if (k <= 552)
             {
-                if(m <= 148)
+                if (m <= 148)
                 {
                     return CLGEMMKernelType::RESHAPED_ONLY_RHS;
                 }
                 else
                 {
-                    if(m <= 278)
+                    if (m <= 278)
                     {
                         return CLGEMMKernelType::RESHAPED;
                     }
@@ -254,16 +258,17 @@
     }
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(b);
 
-    if(!is_rhs_constant)
+    if (!is_rhs_constant)
     {
         return CLGEMMKernelType::NATIVE;
     }
 
-    if(m == 1)
+    if (m == 1)
     {
         return CLGEMMKernelType::RESHAPED_ONLY_RHS;
     }
@@ -273,13 +278,13 @@
     const float r_nk  = static_cast<float>(n) / static_cast<float>(k);
     const float r_mnk = static_cast<float>(m) / (static_cast<float>(n) * static_cast<float>(k));
 
-    if(r_mn <= 1.5469f)
+    if (r_mn <= 1.5469f)
     {
-        if(r_mk <= 0.8766f)
+        if (r_mk <= 0.8766f)
         {
-            if(r_mk <= 0.0211f)
+            if (r_mk <= 0.0211f)
             {
-                if(r_mnk <= 77.5833f)
+                if (r_mnk <= 77.5833f)
                 {
                     return CLGEMMKernelType::RESHAPED_ONLY_RHS;
                 }
@@ -290,7 +295,7 @@
             }
             else
             {
-                if(r_nk <= 0.0832f)
+                if (r_nk <= 0.0832f)
                 {
                     return CLGEMMKernelType::RESHAPED_ONLY_RHS;
                 }
@@ -302,11 +307,11 @@
         }
         else
         {
-            if(r_mnk <= 193.0000f)
+            if (r_mnk <= 193.0000f)
             {
-                if(r_mn <= 0.9948f)
+                if (r_mn <= 0.9948f)
                 {
-                    if(r_mk <= 2.5453f)
+                    if (r_mk <= 2.5453f)
                     {
                         return CLGEMMKernelType::RESHAPED;
                     }
@@ -328,17 +333,17 @@
     }
     else
     {
-        if(r_mn <= 17.7370f)
+        if (r_mn <= 17.7370f)
         {
-            if(r_mnk <= 1391.2875f)
+            if (r_mnk <= 1391.2875f)
             {
-                if(r_mk <= 2.9724f)
+                if (r_mk <= 2.9724f)
                 {
                     return CLGEMMKernelType::RESHAPED_ONLY_RHS;
                 }
                 else
                 {
-                    if(r_mnk <= 470.0000f)
+                    if (r_mnk <= 470.0000f)
                     {
                         return CLGEMMKernelType::RESHAPED_ONLY_RHS;
                     }
@@ -350,9 +355,9 @@
             }
             else
             {
-                if(r_nk <= 0.1381f)
+                if (r_nk <= 0.1381f)
                 {
-                    if(r_mnk <= 9040.5000f)
+                    if (r_mnk <= 9040.5000f)
                     {
                         return CLGEMMKernelType::RESHAPED_ONLY_RHS;
                     }
@@ -363,7 +368,7 @@
                 }
                 else
                 {
-                    if(r_mn <= 5.6790f)
+                    if (r_mn <= 5.6790f)
                     {
                         return CLGEMMKernelType::RESHAPED;
                     }
@@ -381,16 +386,17 @@
     }
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(b);
 
-    if(!is_rhs_constant)
+    if (!is_rhs_constant)
     {
         return CLGEMMKernelType::NATIVE;
     }
 
-    if(m == 1)
+    if (m == 1)
     {
         return CLGEMMKernelType::RESHAPED_ONLY_RHS;
     }
@@ -398,21 +404,21 @@
     const float r_mn = static_cast<float>(m) / static_cast<float>(n);
     const float r_nk = static_cast<float>(n) / static_cast<float>(k);
 
-    if(k <= 212)
+    if (k <= 212)
     {
         return CLGEMMKernelType::RESHAPED_ONLY_RHS;
     }
     else
     {
-        if(r_nk <= 0.4990234375f)
+        if (r_nk <= 0.4990234375f)
         {
-            if(k <= 1392)
+            if (k <= 1392)
             {
                 return CLGEMMKernelType::RESHAPED_ONLY_RHS;
             }
             else
             {
-                if(m <= 325)
+                if (m <= 325)
                 {
                     return CLGEMMKernelType::RESHAPED_ONLY_RHS;
                 }
@@ -424,13 +430,13 @@
         }
         else
         {
-            if(k <= 471)
+            if (k <= 471)
             {
                 return CLGEMMKernelType::RESHAPED_ONLY_RHS;
             }
             else
             {
-                if(r_mn <= 0.04475911520421505f)
+                if (r_mn <= 0.04475911520421505f)
                 {
                     return CLGEMMKernelType::RESHAPED;
                 }
@@ -443,37 +449,38 @@
     }
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
-    if(!is_rhs_constant)
+    if (!is_rhs_constant)
     {
         return CLGEMMKernelType::NATIVE;
     }
 
-    if(m == 1)
+    if (m == 1)
     {
         return CLGEMMKernelType::RESHAPED_ONLY_RHS;
     }
 
-    if(n <= 127.0000f)
+    if (n <= 127.0000f)
     {
-        if(n <= 63.5000f)
+        if (n <= 63.5000f)
         {
             return CLGEMMKernelType::RESHAPED_ONLY_RHS;
         }
         else
         {
-            if(m <= 3616.0000f)
+            if (m <= 3616.0000f)
             {
-                if(b <= 18.5000f)
+                if (b <= 18.5000f)
                 {
-                    if(m <= 2970.5000f)
+                    if (m <= 2970.5000f)
                     {
                         return CLGEMMKernelType::RESHAPED_ONLY_RHS;
                     }
                     else
                     {
-                        if(k <= 104.0000f)
+                        if (k <= 104.0000f)
                         {
                             return CLGEMMKernelType::RESHAPED_ONLY_RHS;
                         }
@@ -496,19 +503,19 @@
     }
     else
     {
-        if(m <= 12.5000f)
+        if (m <= 12.5000f)
         {
             return CLGEMMKernelType::RESHAPED_ONLY_RHS;
         }
         else
         {
-            if(k <= 104.0000f)
+            if (k <= 104.0000f)
             {
-                if(b <= 18.5000f)
+                if (b <= 18.5000f)
                 {
-                    if(m <= 490.0000f)
+                    if (m <= 490.0000f)
                     {
-                        if(n <= 272.0000f)
+                        if (n <= 272.0000f)
                         {
                             return CLGEMMKernelType::RESHAPED_ONLY_RHS;
                         }
@@ -529,11 +536,11 @@
             }
             else
             {
-                if(m <= 226.0000f)
+                if (m <= 226.0000f)
                 {
-                    if(n <= 140.0000f)
+                    if (n <= 140.0000f)
                     {
-                        if(m <= 179.5000f)
+                        if (m <= 179.5000f)
                         {
                             return CLGEMMKernelType::RESHAPED;
                         }
@@ -556,15 +563,16 @@
     }
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeBifrost::g71_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(b);
     ARM_COMPUTE_UNUSED(n);
     ARM_COMPUTE_UNUSED(k);
 
-    if(is_rhs_constant)
+    if (is_rhs_constant)
     {
-        if(m == 1)
+        if (m == 1)
         {
             return CLGEMMKernelType::RESHAPED_ONLY_RHS;
         }
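
The GEMM heuristic files reformatted here all share one dispatch mechanism: a map from DataType to a pointer-to-member function, invoked through this->*, with the reformatted FunctionExecutorPtr alias naming that pointer type. A self-contained sketch of the same mechanism (illustrative names only, not the ACL types):

    #include <cstdio>
    #include <map>

    enum class DataType { F32, F16 };

    class Selector
    {
    public:
        const char *select(DataType dt, unsigned m)
        {
            using Fn = const char *(Selector::*)(unsigned);
            static std::map<DataType, Fn> table = {{DataType::F32, &Selector::f32},
                                                   {DataType::F16, &Selector::f16}};
            auto it = table.find(dt);
            if (it == table.end())
            {
                return "unsupported";
            }
            return (this->*it->second)(m); // pointer-to-member call, as in the diff
        }

    private:
        const char *f32(unsigned m) { return m == 1 ? "RESHAPED_ONLY_RHS" : "RESHAPED"; }
        const char *f16(unsigned m) { return m == 1 ? "RESHAPED_ONLY_RHS" : "NATIVE"; }
    };

    int main()
    {
        Selector s;
        std::printf("%s\n", s.select(DataType::F32, 1));
        return 0;
    }
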
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
index ef30b28..673038a 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/GPUTarget.h"
+
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 
 #include <map>
@@ -35,8 +36,7 @@
 {
 namespace cl_gemm
 {
-CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu)
-    : ICLGEMMKernelSelection(gpu)
+CLGEMMDefaultTypeMidgard::CLGEMMDefaultTypeMidgard(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
 {
 }
 
@@ -45,22 +45,21 @@
     // _target could be used in the future to have a dedicated heuristic for each GPU IP
     ARM_COMPUTE_UNUSED(_target);
 
-    using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeMidgard::*)(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
 
     // Configurations for Midgard architectures
-    static std::map<DataType, FunctionExecutorPtr> gemm_configs =
-    {
-        { DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32 },
-        { DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16 },
-        { DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8 },
-        { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8 },
-        { DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8 },
-        { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8 }
-    };
+    static std::map<DataType, FunctionExecutorPtr> gemm_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeMidgard::default_f32},
+        {DataType::F16, &CLGEMMDefaultTypeMidgard::default_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeMidgard::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeMidgard::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeMidgard::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeMidgard::default_q8}};
 
     const DataType data_type = params.data_type;
 
-    if(gemm_configs.find(data_type) != gemm_configs.end())
+    if (gemm_configs.find(data_type) != gemm_configs.end())
     {
         return (this->*gemm_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
     }
@@ -68,7 +67,8 @@
     ARM_COMPUTE_ERROR("Not supported data type");
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(n, k, b);
 
@@ -76,7 +76,8 @@
     return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE;
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(n, k, b);
 
@@ -84,7 +85,8 @@
     return ((m != 1) && is_rhs_constant) ? CLGEMMKernelType::RESHAPED : CLGEMMKernelType::NATIVE;
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeMidgard::default_q8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(m, n, k, b, is_rhs_constant);
 
diff --git a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
index 9e779d3..851e23b 100644
--- a/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
+++ b/src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 
 #include <map>
@@ -34,8 +35,7 @@
 {
 namespace cl_gemm
 {
-CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu)
-    : ICLGEMMKernelSelection(gpu)
+CLGEMMDefaultTypeValhall::CLGEMMDefaultTypeValhall(GPUTarget gpu) : ICLGEMMKernelSelection(gpu)
 {
 }
 
@@ -44,135 +44,136 @@
     // _target could be used in the future to have a dedicated heuristic for each GPU IP
     ARM_COMPUTE_UNUSED(_target);
 
-    using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
+    using FunctionExecutorPtr = CLGEMMKernelType (CLGEMMDefaultTypeValhall::*)(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant);
 
     // Default configurations for Valhall architectures
-    static std::map<DataType, FunctionExecutorPtr> gemm_default_configs =
-    {
-        { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 },
-        { DataType::F16, &CLGEMMDefaultTypeValhall::default_f16 },
-        { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
-    };
+    static std::map<DataType, FunctionExecutorPtr> gemm_default_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32},
+        {DataType::F16, &CLGEMMDefaultTypeValhall::default_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
 
     // Mali-G77 configurations
-    static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs =
-    {
-        { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 },
-        { DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16 },
-        { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
-    };
+    static std::map<DataType, FunctionExecutorPtr> gemm_g77_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32},
+        {DataType::F16, &CLGEMMDefaultTypeValhall::g77_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
 
     // Mali-G78 configurations
-    static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs =
-    {
-        { DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32 },
-        { DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16 },
-        { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
-    };
+    static std::map<DataType, FunctionExecutorPtr> gemm_g78_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeValhall::g78_f32},
+        {DataType::F16, &CLGEMMDefaultTypeValhall::g78_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
 
     // Mali-G710 and Mali-G610 configurations
-    static std::map<DataType, FunctionExecutorPtr> gemm_g710_configs =
-    {
-        { DataType::F32, &CLGEMMDefaultTypeValhall::default_f32 },
-        { DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16 },
-        { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
-    };
+    static std::map<DataType, FunctionExecutorPtr> gemm_g710_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeValhall::default_f32},
+        {DataType::F16, &CLGEMMDefaultTypeValhall::g710_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
 
     // Mali-G715 and Mali-G615 configurations
-    static std::map<DataType, FunctionExecutorPtr> gemm_g715_configs =
-    {
-        { DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32 },
-        { DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16 },
-        { DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8 },
-        { DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8 }
-    };
+    static std::map<DataType, FunctionExecutorPtr> gemm_g715_configs = {
+        {DataType::F32, &CLGEMMDefaultTypeValhall::g715_f32},
+        {DataType::F16, &CLGEMMDefaultTypeValhall::g715_f16},
+        {DataType::QASYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QASYMM8_SIGNED, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QSYMM8, &CLGEMMDefaultTypeValhall::default_q8},
+        {DataType::QSYMM8_PER_CHANNEL, &CLGEMMDefaultTypeValhall::default_q8}};
 
     const DataType data_type = params.data_type;
 
-    switch(_target)
+    switch (_target)
     {
         case GPUTarget::G710:
         case GPUTarget::G610:
-            if(gemm_g710_configs.find(data_type) != gemm_g710_configs.end())
+            if (gemm_g710_configs.find(data_type) != gemm_g710_configs.end())
             {
-                return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+                return (this->*gemm_g710_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                             params.is_rhs_constant);
             }
             ARM_COMPUTE_ERROR("Not supported data type");
         case GPUTarget::G715:
         case GPUTarget::G615:
-            if(gemm_g715_configs.find(data_type) != gemm_g715_configs.end())
+            if (gemm_g715_configs.find(data_type) != gemm_g715_configs.end())
             {
-                return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+                return (this->*gemm_g715_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                             params.is_rhs_constant);
             }
             ARM_COMPUTE_ERROR("Not supported data type");
         case GPUTarget::G78:
-            if(gemm_g78_configs.find(data_type) != gemm_g78_configs.end())
+            if (gemm_g78_configs.find(data_type) != gemm_g78_configs.end())
             {
-                return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+                return (this->*gemm_g78_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                            params.is_rhs_constant);
             }
             ARM_COMPUTE_ERROR("Not supported data type");
         case GPUTarget::G77:
-            if(gemm_g77_configs.find(data_type) != gemm_g77_configs.end())
+            if (gemm_g77_configs.find(data_type) != gemm_g77_configs.end())
             {
-                return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+                return (this->*gemm_g77_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                            params.is_rhs_constant);
             }
             ARM_COMPUTE_ERROR("Not supported data type");
         default:
-            if(gemm_default_configs.find(data_type) != gemm_default_configs.end())
+            if (gemm_default_configs.find(data_type) != gemm_default_configs.end())
             {
-                return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b, params.is_rhs_constant);
+                return (this->*gemm_default_configs[data_type])(params.m, params.n, params.k, params.b,
+                                                                params.is_rhs_constant);
             }
             ARM_COMPUTE_ERROR("Not supported data type");
     }
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(m, n, k, b);
 
     return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeValhall::default_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(m, n, k, b);
 
     return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(m, n, k, b);
 
     return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(m, n, k, b);
 
     return is_rhs_constant ? CLGEMMKernelType::RESHAPED_ONLY_RHS : CLGEMMKernelType::NATIVE;
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType CLGEMMDefaultTypeValhall::default_q8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(m, n, k, b);
 
-    if(is_rhs_constant)
+    if (is_rhs_constant)
     {
         return CLGEMMKernelType::RESHAPED_ONLY_RHS;
     }
@@ -182,47 +183,48 @@
     }
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(b);
 
-    if(!is_rhs_constant)
+    if (!is_rhs_constant)
     {
         return CLGEMMKernelType::NATIVE;
     }
 
-    if(m == 1)
+    if (m == 1)
     {
         return CLGEMMKernelType::RESHAPED_ONLY_RHS;
     }
 
-    if(n <= 272.0000f)
+    if (n <= 272.0000f)
     {
         return CLGEMMKernelType::RESHAPED_ONLY_RHS;
     }
     else
     {
-        if(k <= 471.0000f)
+        if (k <= 471.0000f)
         {
             return CLGEMMKernelType::RESHAPED_ONLY_RHS;
         }
         else
         {
-            if(m <= 72.5000f)
+            if (m <= 72.5000f)
             {
                 return CLGEMMKernelType::RESHAPED_ONLY_RHS;
             }
             else
             {
-                if(m <= 90.5000f)
+                if (m <= 90.5000f)
                 {
                     return CLGEMMKernelType::RESHAPED;
                 }
                 else
                 {
-                    if(k <= 2448.0000f)
+                    if (k <= 2448.0000f)
                     {
-                        if(n <= 756.0000f)
+                        if (n <= 756.0000f)
                         {
                             return CLGEMMKernelType::RESHAPED_ONLY_RHS;
                         }
@@ -241,11 +243,12 @@
     }
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
     ARM_COMPUTE_UNUSED(m, n, k, b);
 
-    if(!is_rhs_constant)
+    if (!is_rhs_constant)
     {
         return CLGEMMKernelType::NATIVE;
     }
@@ -253,9 +256,10 @@
     return CLGEMMKernelType::RESHAPED_ONLY_RHS;
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
-    if(!is_rhs_constant)
+    if (!is_rhs_constant)
     {
         return default_f32(m, n, k, b, is_rhs_constant);
     }
@@ -263,7 +267,7 @@
     unsigned int best_m0;
     unsigned int best_n0;
 
-    if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0))
+    if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0))
     {
         return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL;
     }
@@ -273,9 +277,10 @@
     }
 }
 
-CLGEMMKernelType CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
+CLGEMMKernelType
+CLGEMMDefaultTypeValhall::g715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool is_rhs_constant)
 {
-    if(!is_rhs_constant)
+    if (!is_rhs_constant)
     {
         return g78_f16(m, n, k, b, is_rhs_constant);
     }
@@ -283,7 +288,7 @@
     unsigned int best_m0;
     unsigned int best_n0;
 
-    if(opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0))
+    if (opencl::kernels::gemm::is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0))
     {
         return CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL;
     }
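
The hunks above only reflow whitespace; the logic they touch is a per-GPU-target lookup table of pointer-to-member heuristics keyed by DataType, dispatched via (this->*fn)(...). A minimal standalone sketch of that pattern (Selector and KernelType are hypothetical stand-ins, not library types):

    #include <map>

    enum class DataType   { F32, F16 };
    enum class KernelType { NATIVE, RESHAPED_ONLY_RHS };

    class Selector
    {
    public:
        KernelType select(DataType dt, unsigned int m, bool is_rhs_constant)
        {
            // Table of member-function pointers: one heuristic per data type.
            using Fn = KernelType (Selector::*)(unsigned int, bool);
            static const std::map<DataType, Fn> configs = {{DataType::F32, &Selector::f32},
                                                           {DataType::F16, &Selector::f16}};
            // Invoke the selected heuristic through the pointer-to-member.
            return (this->*configs.at(dt))(m, is_rhs_constant);
        }

    private:
        KernelType f32(unsigned int /*m*/, bool is_rhs_constant)
        {
            return is_rhs_constant ? KernelType::RESHAPED_ONLY_RHS : KernelType::NATIVE;
        }
        KernelType f16(unsigned int /*m*/, bool /*is_rhs_constant*/)
        {
            return KernelType::NATIVE;
        }
    };
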
diff --git a/src/runtime/CL/gemm/CLGEMMKernelSelection.h b/src/runtime/CL/gemm/CLGEMMKernelSelection.h
index 6189a32..c528dbc 100644
--- a/src/runtime/CL/gemm/CLGEMMKernelSelection.h
+++ b/src/runtime/CL/gemm/CLGEMMKernelSelection.h
@@ -25,6 +25,7 @@
 #define SRC_CLGEMMKERNELSELECTION_H
 
 #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+
 #include "src/runtime/CL/gemm/CLGEMMDefaultTypeBifrost.h"
 #include "src/runtime/CL/gemm/CLGEMMDefaultTypeMidgard.h"
 #include "src/runtime/CL/gemm/CLGEMMDefaultTypeValhall.h"
@@ -45,7 +46,7 @@
      */
     static std::unique_ptr<ICLGEMMKernelSelection> create(GPUTarget gpu)
     {
-        switch(get_arch_from_target(gpu))
+        switch (get_arch_from_target(gpu))
         {
             case GPUTarget::MIDGARD:
                 return std::make_unique<CLGEMMDefaultTypeMidgard>(gpu);
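
The same mechanical rewrites repeat across every hunk in this patch. Summarised once, as inferred from the diff itself rather than from the configuration file:

    // Observed rewrites, before -> after:
    //
    //   if(cond) / switch(x) / while(b)  ->  if (cond) / switch (x) / while (b)
    //   { DataType::F32, &fn }           ->  {DataType::F32, &fn}   (braced lists lose inner padding)
    //   unsigned int m0{ 1 };            ->  unsigned int m0{1};
    //   Ctor(args)                       ->  Ctor(args) : _id{id}   (short initializer lists join
    //       : _id{ id }                                              the constructor signature)
    //   multi-line trivial lambdas      ->  [](char c) { return std::tolower(c); }
    //
    // Calls and declarations beyond the column limit (about 120 columns) are
    // rebroken with continuation arguments aligned under the first argument,
    // trailing Doxygen comments are realigned, and files that previously
    // lacked a final newline gain one.
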
diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
index b06c3b0..8df5719 100644
--- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
+++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 #include "arm_compute/runtime/CL/ICLGEMMKernelSelection.h"
+
 #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
 #include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
 #include "src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h"
@@ -51,13 +52,15 @@
     bool             valid = false;
     CLGEMMKernelType gemm_type{};
     const auto       mlgo_heuristics = CLScheduler::get().gemm_heuristics();
-    if(mlgo_heuristics != nullptr)
+    if (mlgo_heuristics != nullptr)
     {
-        std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+        std::tie(valid, gemm_type) = mlgo_heuristics->get()->query_gemm_type(
+            mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
     }
-    if(valid)
+    if (valid)
     {
-        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.", to_string(gemm_type).c_str());
+        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm type: %s.",
+                                                  to_string(gemm_type).c_str());
     }
     else
     {
@@ -87,10 +90,11 @@
 {
     GEMMLHSMatrixInfo                    lhs_info;
     GEMMRHSMatrixInfo                    rhs_info;
-    std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target);
+    std::unique_ptr<IClGemmKernelConfig> gemm_config =
+        ClGemmReshapedOnlyRhsKernelConfigurationFactory::create(query.gpu_target);
     ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
     std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type);
-    return GEMMConfigResult{ true, lhs_info, rhs_info };
+    return GEMMConfigResult{true, lhs_info, rhs_info};
 }
 
 GEMMConfigResult select_mlgo_gemm_config_reshaped_only_rhs(const CommonQuery &query)
@@ -100,32 +104,36 @@
     GEMMRHSMatrixInfo               rhs_info;
     mlgo::GEMMConfigReshapedOnlyRHS config{};
     const auto                      mlgo_heuristics = CLScheduler::get().gemm_heuristics();
-    if(mlgo_heuristics != nullptr)
+    if (mlgo_heuristics != nullptr)
     {
-        std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+        std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped_only_rhs(
+            mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
     }
-    if(valid)
+    if (valid)
     {
-        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str());
+        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.",
+                                                  to_string(config).c_str());
         // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do not matter
-        std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs,
-                                                              config.export_cl_image);
+        std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(
+            query.m, query.n, config.m0, config.n0, config.k0, 1, config.h0, false, config.interleave_rhs,
+            !config.transpose_rhs, config.transpose_rhs, config.export_cl_image);
     }
     else
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed");
     }
-    return GEMMConfigResult{ valid, lhs_info, rhs_info };
+    return GEMMConfigResult{valid, lhs_info, rhs_info};
 }
 
 GEMMConfigResult select_default_gemm_config_reshaped(const CommonQuery &query)
 {
     GEMMLHSMatrixInfo                    lhs_info;
     GEMMRHSMatrixInfo                    rhs_info;
-    std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target);
+    std::unique_ptr<IClGemmKernelConfig> gemm_config =
+        ClGemmReshapedKernelConfigurationFactory::create(query.gpu_target);
     ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
     std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type);
-    return GEMMConfigResult{ true, lhs_info, rhs_info };
+    return GEMMConfigResult{true, lhs_info, rhs_info};
 }
 
 GEMMConfigResult select_mlgo_gemm_config_reshaped(const CommonQuery &query)
@@ -135,21 +143,24 @@
     GEMMRHSMatrixInfo        rhs_info;
     mlgo::GEMMConfigReshaped config{};
     const auto               mlgo_heuristics = CLScheduler::get().gemm_heuristics();
-    if(mlgo_heuristics != nullptr)
+    if (mlgo_heuristics != nullptr)
     {
-        std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+        std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_reshaped(
+            mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
     }
-    if(valid)
+    if (valid)
     {
-        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str());
-        std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs, config.interleave_rhs, !config.transpose_rhs,
-                                                              config.transpose_rhs, config.export_cl_image);
+        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.",
+                                                  to_string(config).c_str());
+        std::tie(lhs_info, rhs_info) = configure_lhs_rhs_info(
+            query.m, query.n, config.m0, config.n0, config.k0, config.v0, config.h0, config.interleave_lhs,
+            config.interleave_rhs, !config.transpose_rhs, config.transpose_rhs, config.export_cl_image);
     }
     else
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed");
     }
-    return GEMMConfigResult{ valid, lhs_info, rhs_info };
+    return GEMMConfigResult{valid, lhs_info, rhs_info};
 }
 
 GEMMConfigResult select_default_gemm_config_native(const CommonQuery &query)
@@ -159,7 +170,7 @@
     std::unique_ptr<IClGemmKernelConfig> gemm_config = ClGemmNativeKernelConfigurationFactory::create(query.gpu_target);
     ARM_COMPUTE_ERROR_ON_NULLPTR(gemm_config.get());
     std::tie(lhs_info, rhs_info) = gemm_config->configure(query.m, query.n, query.k, query.b, query.data_type);
-    return GEMMConfigResult{ true, lhs_info, rhs_info };
+    return GEMMConfigResult{true, lhs_info, rhs_info};
 }
 
 GEMMConfigResult select_mlgo_gemm_config_native(const CommonQuery &query)
@@ -169,23 +180,26 @@
     GEMMRHSMatrixInfo      rhs_info;
     mlgo::GEMMConfigNative config{};
     const auto             mlgo_heuristics = CLScheduler::get().gemm_heuristics();
-    if(mlgo_heuristics != nullptr)
+    if (mlgo_heuristics != nullptr)
     {
-        std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native(mlgo::Query{ string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b });
+        std::tie(valid, config) = mlgo_heuristics->get()->query_gemm_config_native(
+            mlgo::Query{string_from_target(query.gpu_target), query.data_type, query.m, query.n, query.k, query.b});
     }
-    if(valid)
+    if (valid)
     {
-        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.", to_string(config).c_str());
+        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics query returns gemm config: %s.",
+                                                  to_string(config).c_str());
         // Setting irrelevant unsigned int parameters to 1 and bool parameters to false as they do not matter
-        std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false);
+        std::tie(lhs_info, rhs_info) = opencl::kernels::gemm::configure_lhs_rhs_info(
+            query.m, query.n, config.m0, config.n0, config.k0, 1, 1, false, false, false, false, false);
     }
     else
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("MLGOHeuristics query failed");
     }
-    return GEMMConfigResult{ valid, lhs_info, rhs_info };
+    return GEMMConfigResult{valid, lhs_info, rhs_info};
 }
 } // namespace auto_heuristics
 
 } // namespace cl_gemm
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
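
All of the select_mlgo_* functions above share one control flow: query the loaded MLGO heuristics, then fall back to the built-in default configurator when the query reports invalid. A reduced sketch of that flow; the helper names below are placeholders, not library API:

    #include <tuple>
    #include <utility>

    struct Config
    {
        unsigned int m0 = 1, n0 = 1;
    };

    // Placeholder stand-ins for the MLGO query and the default configurator.
    std::pair<bool, Config> query_mlgo()
    {
        return {false, Config{}}; // pretend the heuristics file was not loaded
    }
    Config default_config()
    {
        return Config{8, 4};
    }

    Config select_config()
    {
        bool   valid = false;
        Config cfg{};
        std::tie(valid, cfg) = query_mlgo(); // query returns {valid, config}
        return valid ? cfg : default_config();
    }
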
diff --git a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h
index 020237b..f544715 100644
--- a/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h
+++ b/src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h
@@ -50,8 +50,7 @@
 /** Result of querying about GEMM type ( @ref CLGEMMKernelType) */
 struct GEMMTypeResult
 {
-    GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type)
-        : valid{ valid }, gemm_type{ gemm_type }
+    GEMMTypeResult(bool valid, CLGEMMKernelType gemm_type) : valid{valid}, gemm_type{gemm_type}
     {
     }
     /** Test if the result is valid */
@@ -67,7 +66,7 @@
 struct GEMMConfigResult
 {
     GEMMConfigResult(bool valid, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info)
-        : valid{ valid }, lhs_info{ lhs_info }, rhs_info{ rhs_info }
+        : valid{valid}, lhs_info{lhs_info}, rhs_info{rhs_info}
     {
     }
     /** Test if the result is valid */
@@ -134,4 +133,4 @@
 } // namespace cl_gemm
 } // namespace arm_compute
 
-#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H
\ No newline at end of file
+#endif // SRC_RUNTIME_CL_GEMM_AUTO_HEURISTICS_CL_GEMM_AUTO_HEURISTICS_H
diff --git a/src/runtime/CL/mlgo/Common.h b/src/runtime/CL/mlgo/Common.h
index c451bd9..08a7ee8 100644
--- a/src/runtime/CL/mlgo/Common.h
+++ b/src/runtime/CL/mlgo/Common.h
@@ -45,37 +45,37 @@
 /** GEMM Configuration for Native kernel */
 struct GEMMConfigNative
 {
-    unsigned int m0{ 1 }; /**< Number of rows processed by the matrix multiplication */
-    unsigned int n0{ 1 }; /**< Number of columns processed by the matrix multiplication */
-    unsigned int k0{ 1 }; /**< Number of partial accumulations performed by the matrix multiplication */
+    unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+    unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+    unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
 };
 
 /** GEMM Configuration for Reshaped Only RHS kernel */
 struct GEMMConfigReshapedOnlyRHS
 {
-    unsigned int m0{ 1 };                  /**< Number of rows processed by the matrix multiplication */
-    unsigned int n0{ 1 };                  /**< Number of columns processed by the matrix multiplication */
-    unsigned int k0{ 1 };                  /**< Number of partial accumulations performed by the matrix multiplication */
-    unsigned int h0{ 1 };                  /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
-    bool         interleave_rhs{ false };  /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
-    bool         transpose_rhs{ false };   /**< True if the (k0xn0) block has to be transposed before being stored */
-    bool         export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
+    unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+    unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+    unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
+    unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+    bool         interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
+    bool         transpose_rhs{false};  /**< True if the (k0xn0) block has to be transposed before being stored */
+    bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
 };
 
 /** GEMM Configuration for Reshaped kernel */
 struct GEMMConfigReshaped
 {
-    unsigned int m0{ 1 };                  /**< Number of rows processed by the matrix multiplication */
-    unsigned int n0{ 1 };                  /**< Number of columns processed by the matrix multiplication */
-    unsigned int k0{ 1 };                  /**< Number of partial accumulations performed by the matrix multiplication */
-    unsigned int v0{ 1 };                  /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
-    unsigned int h0{ 1 };                  /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
-    bool         interleave_lhs{ false };  /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
-    bool         interleave_rhs{ false };  /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
-    bool         transpose_rhs{ false };   /**< True if the (k0xn0) block has to be transposed before being stored */
-    bool         export_cl_image{ false }; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
+    unsigned int m0{1}; /**< Number of rows processed by the matrix multiplication */
+    unsigned int n0{1}; /**< Number of columns processed by the matrix multiplication */
+    unsigned int k0{1}; /**< Number of partial accumulations performed by the matrix multiplication */
+    unsigned int v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */
+    unsigned int h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */
+    bool         interleave_lhs{false}; /**< True if the v0 (m0xk0) blocks have to be interleaved in the output row */
+    bool         interleave_rhs{false}; /**< True if the h0 (k0xn0) blocks have to be interleaved in the output row */
+    bool         transpose_rhs{false};  /**< True if the (k0xn0) block has to be transposed before being stored */
+    bool export_cl_image{false}; /**< True if the reshaped rhs has to be exported to cl_image. n0 must be equal to 4 */
 };
 
 } // namespace mlgo
 } // namespace arm_compute
-#endif // SRC_RUNTIME_CL_MLGO_COMMON_H
\ No newline at end of file
+#endif // SRC_RUNTIME_CL_MLGO_COMMON_H
diff --git a/src/runtime/CL/mlgo/HeuristicTree.cpp b/src/runtime/CL/mlgo/HeuristicTree.cpp
index 1c75cdc..f7b7069 100644
--- a/src/runtime/CL/mlgo/HeuristicTree.cpp
+++ b/src/runtime/CL/mlgo/HeuristicTree.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "src/runtime/CL/mlgo/HeuristicTree.h"
+
 #include "arm_compute/core/Log.h"
 
 #include "support/Cast.h"
@@ -40,27 +41,23 @@
     // PRE: all features and ConditionalOps are valid
     constexpr float eps = 0.0001f;
     // Calculate all secondary features
-    std::vector<std::pair<std::string, float>> cond_values
-    {
-        { "m", static_cast<float>(shape.m) },
-        { "n", static_cast<float>(shape.n) },
-        { "k", static_cast<float>(shape.k) },
-        { "b", static_cast<float>(shape.b) },
-        { "r_mn", static_cast<float>(shape.m) / shape.n },
-        { "r_mk", static_cast<float>(shape.m) / shape.k },
-        { "r_nk", static_cast<float>(shape.n) / shape.k },
-        { "r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k) },
-        { "workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0 }
-    };
-    auto cond_value_pair_it = std::find_if(cond_values.begin(), cond_values.end(),
-                                           [&cond](decltype(*cond_values.begin()) it)
-    {
-        return it.first == cond.feature;
-    });
+    std::vector<std::pair<std::string, float>> cond_values{
+        {"m", static_cast<float>(shape.m)},
+        {"n", static_cast<float>(shape.n)},
+        {"k", static_cast<float>(shape.k)},
+        {"b", static_cast<float>(shape.b)},
+        {"r_mn", static_cast<float>(shape.m) / shape.n},
+        {"r_mk", static_cast<float>(shape.m) / shape.k},
+        {"r_nk", static_cast<float>(shape.n) / shape.k},
+        {"r_mnk", static_cast<float>(shape.m) / (static_cast<float>(shape.n) / shape.k)},
+        {"workload", (static_cast<float>(shape.m) * shape.n * shape.b) / 20.0}};
+    auto cond_value_pair_it =
+        std::find_if(cond_values.begin(), cond_values.end(),
+                     [&cond](decltype(*cond_values.begin()) it) { return it.first == cond.feature; });
 
     ARM_COMPUTE_ERROR_ON(cond_value_pair_it == cond_values.end());
     const float cond_value = cond_value_pair_it->second;
-    switch(cond.op)
+    switch (cond.op)
     {
         case ConditionalOp::LT:
         {
@@ -92,13 +89,12 @@
 constexpr size_t                HeuristicTree::_max_query_depth;
 constexpr HeuristicTree::NodeID HeuristicTree::_root;
 
-HeuristicTree::HeuristicTree()
-    : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32)
+HeuristicTree::HeuristicTree() : HeuristicTree(0, HeuristicType::GEMM_Type, "", DataType::F32)
 {
 }
 
 HeuristicTree::HeuristicTree(TreeID id, HeuristicType h_type, const std::string &ip_target, DataType data_type)
-    : _id{ id }, _heuristic_type{ h_type }, _ip_target{ ip_target }, _data_type{ data_type }, _tree{}
+    : _id{id}, _heuristic_type{h_type}, _ip_target{ip_target}, _data_type{data_type}, _tree{}
 {
 }
 
@@ -108,16 +104,17 @@
     // Root ID = 0;
     auto   cur_node = _tree.at(_root).get();
     size_t depth    = 0;
-    while(cur_node->type() != NodeType::Leaf)
+    while (cur_node->type() != NodeType::Leaf)
     {
-        if(depth > _max_query_depth)
+        if (depth > _max_query_depth)
         {
-            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?", _max_query_depth);
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding max query depth: %zu. Is the tree too deep?",
+                                                      _max_query_depth);
             return std::make_pair(false, T{});
         }
         ARM_COMPUTE_ERROR_ON_MSG(cur_node->type() != NodeType::Branch, "Unexpected NodeType");
         auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node);
-        if(evaluate(shape, br_node->condition))
+        if (evaluate(shape, br_node->condition))
         {
             cur_node = _tree.at(br_node->true_node).get();
         }
@@ -135,12 +132,12 @@
 template <typename T>
 bool HeuristicTree::add_leaf(NodeID id, T val)
 {
-    if(_tree.size() >= _max_num_nodes)
+    if (_tree.size() >= _max_num_nodes)
     {
         ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes);
         return false;
     }
-    if(_tree.find(id) != _tree.end())
+    if (_tree.find(id) != _tree.end())
     {
         ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id);
         return false;
@@ -151,28 +148,23 @@
 
 bool HeuristicTree::add_branch(NodeID id, Condition cond, NodeID t_node, NodeID f_node)
 {
-    if(_tree.size() >= _max_num_nodes)
+    if (_tree.size() >= _max_num_nodes)
     {
         ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the maximum number of nodes allowed %zu", _max_num_nodes);
         return false;
     }
 
-    const std::set<std::string> supported_features =
-    {
-        "m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"
-    };
-    const auto orig_feature = cond.feature;
-    std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(), [](char c)
-    {
-        return std::tolower(c);
-    });
-    if(supported_features.find(cond.feature) == supported_features.end())
+    const std::set<std::string> supported_features = {"m", "n", "k", "b", "r_mn", "r_mk", "r_nk", "r_mnk", "workload"};
+    const auto                  orig_feature       = cond.feature;
+    std::transform(cond.feature.begin(), cond.feature.end(), cond.feature.begin(),
+                   [](char c) { return std::tolower(c); });
+    if (supported_features.find(cond.feature) == supported_features.end())
     {
         ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Unsupported feature %s", orig_feature.c_str());
         return false;
     }
 
-    if(_tree.find(id) != _tree.end())
+    if (_tree.find(id) != _tree.end())
     {
         ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add node; node id %zu already exists", id);
         return false;
@@ -184,32 +176,32 @@
 bool HeuristicTree::check_if_structurally_correct() const
 {
     std::set<NodeID>   visited;
-    std::deque<NodeID> to_visit{ _root };
+    std::deque<NodeID> to_visit{_root};
 
-    while(!to_visit.empty())
+    while (!to_visit.empty())
     {
         auto id = to_visit.front();
         to_visit.pop_front();
-        if(_tree.find(id) == _tree.end())
+        if (_tree.find(id) == _tree.end())
         {
             ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing node %zu", id);
             return false;
         }
         auto not_seen_before = visited.insert(id);
-        if(!not_seen_before.second)
+        if (!not_seen_before.second)
         {
             ARM_COMPUTE_LOG_INFO_MSG_CORE("Not a tree; contains cycles or loops");
             return false;
         }
         auto cur_node = _tree.at(id).get();
-        if(cur_node->type() == NodeType::Branch)
+        if (cur_node->type() == NodeType::Branch)
         {
             auto br_node = utils::cast::polymorphic_downcast<BranchNode *>(cur_node);
             to_visit.push_back(br_node->true_node);
             to_visit.push_back(br_node->false_node);
         }
     }
-    if(visited.size() != _tree.size())
+    if (visited.size() != _tree.size())
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Contains disjoint nodes");
         return false;
@@ -219,12 +211,12 @@
 
 bool HeuristicTree::check()
 {
-    if(_tree.empty())
+    if (_tree.empty())
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Empty tree encountered");
         return false;
     }
-    if(_tree.find(_root) == _tree.end())
+    if (_tree.find(_root) == _tree.end())
     {
         ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Missing root. Root must have a Node ID of %zu", _root);
         return false;
@@ -237,7 +229,8 @@
 /** Explicit template instantiation @relates HeuristicTree */
 template std::pair<bool, GEMMConfigNative> HeuristicTree::query<GEMMConfigNative>(GEMMShape shape) const;
 /** Explicit template instantiation @relates HeuristicTree */
-template std::pair<bool, GEMMConfigReshapedOnlyRHS> HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const;
+template std::pair<bool, GEMMConfigReshapedOnlyRHS>
+HeuristicTree::query<GEMMConfigReshapedOnlyRHS>(GEMMShape shape) const;
 /** Explicit template instantiation @relates HeuristicTree */
 template std::pair<bool, GEMMConfigReshaped> HeuristicTree::query<GEMMConfigReshaped>(GEMMShape shape) const;
 
diff --git a/src/runtime/CL/mlgo/HeuristicTree.h b/src/runtime/CL/mlgo/HeuristicTree.h
index d5c7de2..a4f8c11 100644
--- a/src/runtime/CL/mlgo/HeuristicTree.h
+++ b/src/runtime/CL/mlgo/HeuristicTree.h
@@ -25,6 +25,7 @@
 #define SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H
 
 #include "arm_compute/core/Types.h"
+
 #include "src/runtime/CL/mlgo/Common.h"
 
 #include <map>
@@ -84,7 +85,7 @@
     struct BranchNode : public Node
     {
         BranchNode(NodeID id, Condition cond, NodeID t_node, NodeID f_node)
-            : id{ id }, condition{ cond }, true_node{ t_node }, false_node{ f_node }
+            : id{id}, condition{cond}, true_node{t_node}, false_node{f_node}
         {
         }
         NodeType type() const override
@@ -100,8 +101,7 @@
     template <typename T>
     struct LeafNode : public Node
     {
-        LeafNode(NodeID id, T val)
-            : id{ id }, value{ val }
+        LeafNode(NodeID id, T val) : id{id}, value{val}
         {
         }
         NodeType type() const override
@@ -177,22 +177,22 @@
     bool check();
 
 private:
-    static constexpr size_t _max_query_depth{ 1000 }; // Maximum depth of query
-    static constexpr size_t _max_num_nodes{ 100000 }; // Maximum number of nodes contained by the tree
-    static constexpr NodeID _root{ 0 };               // Root tree ID
+    static constexpr size_t _max_query_depth{1000}; // Maximum depth of query
+    static constexpr size_t _max_num_nodes{100000}; // Maximum number of nodes contained by the tree
+    static constexpr NodeID _root{0};               // Root tree ID
 
 private:
     bool check_if_structurally_correct() const;
 
 private:
-    TreeID        _id;                             /**< Heuristic tree ID */
-    HeuristicType _heuristic_type;                 /**< Heuristic type */
-    std::string   _ip_target;                      /**< IP target associated with the tree */
-    DataType      _data_type;                      /**< Data type associated with the tree */
-    std::map<NodeID, std::unique_ptr<Node>> _tree; /**< Tree representation */
+    TreeID                                  _id;             /**< Heuristic tree ID */
+    HeuristicType                           _heuristic_type; /**< Heuristic type */
+    std::string                             _ip_target;      /**< IP target associated with the tree */
+    DataType                                _data_type;      /**< Data type associated with the tree */
+    std::map<NodeID, std::unique_ptr<Node>> _tree;           /**< Tree representation */
 };
 } // namespace mlgo
 
 } // namespace arm_compute
 
-#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H
\ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_HEURISTIC_TREE_H
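
Taken together, HeuristicTree.cpp/.h implement a small binary decision tree: branch nodes hold a condition over the GEMM shape (m, n, k, b, plus derived features such as r_mn = m / n and workload = m * n * b / 20), leaves hold the returned value, and query() walks from root id 0 taking the true/false edge at each branch, bounded by _max_query_depth. A self-contained sketch of that walk, using one flat node struct instead of the library's polymorphic BranchNode/LeafNode:

    #include <cstddef>
    #include <map>
    #include <utility>

    struct Shape
    {
        float m, n, k, b;
    };

    struct Node
    {
        bool   is_leaf;
        int    value;                    // leaf payload (stand-in for a kernel type)
        float (*feature)(const Shape &); // branch: derived feature, e.g. r_mn = m / n
        float  threshold;                // branch condition: feature(shape) < threshold
        size_t true_node, false_node;
    };

    std::pair<bool, int> query(const std::map<size_t, Node> &tree, Shape s)
    {
        constexpr size_t max_depth = 1000; // mirrors _max_query_depth
        size_t           id        = 0;    // root node id
        for (size_t depth = 0; depth <= max_depth; ++depth)
        {
            const Node &n = tree.at(id);
            if (n.is_leaf)
            {
                return {true, n.value};
            }
            id = (n.feature(s) < n.threshold) ? n.true_node : n.false_node;
        }
        return {false, 0}; // too deep (or cyclic): report the query as invalid
    }

    // Example: branch on r_mn at the root, leaves 1 and 2 carry the answers.
    //   tree[0] = {false, 0, [](const Shape &s) { return s.m / s.n; }, 1.f, 1, 2};
    //   tree[1] = {true, 1, nullptr, 0.f, 0, 0};
    //   tree[2] = {true, 0, nullptr, 0.f, 0, 0};
    //   query(tree, {64, 128, 32, 1}) -> {true, 1}, since r_mn = 0.5 < 1
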
diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.cpp b/src/runtime/CL/mlgo/MLGOHeuristics.cpp
index 80f3bb8..aed46cd 100644
--- a/src/runtime/CL/mlgo/MLGOHeuristics.cpp
+++ b/src/runtime/CL/mlgo/MLGOHeuristics.cpp
@@ -24,6 +24,7 @@
 #include "src/runtime/CL/mlgo/MLGOHeuristics.h"
 
 #include "arm_compute/core/Log.h"
+
 #include "src/runtime/CL/mlgo/MLGOParser.h"
 #include "src/runtime/CL/mlgo/Utils.h"
 
@@ -39,19 +40,19 @@
 }
 bool operator==(const GEMMConfigReshapedOnlyRHS &lhs, const GEMMConfigReshapedOnlyRHS &rhs)
 {
-    return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs,
-                                                                                                                            rhs.export_cl_image);
+    return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.h0, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) ==
+           std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.h0, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image);
 }
 bool operator==(const GEMMConfigReshaped &lhs, const GEMMConfigReshaped &rhs)
 {
-    return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs, lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0,
-            rhs.interleave_lhs, rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image);
+    return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.v0, lhs.h0, lhs.interleave_lhs, lhs.interleave_rhs, lhs.transpose_rhs,
+                    lhs.export_cl_image) == std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.v0, rhs.h0, rhs.interleave_lhs,
+                                                     rhs.interleave_rhs, rhs.transpose_rhs, rhs.export_cl_image);
 }
 
 constexpr size_t MLGOHeuristics::_max_num_trees;
 
-MLGOHeuristics::MLGOHeuristics()
-    : _indices{}, _trees{}, _tree_valid{}, _valid{ false }
+MLGOHeuristics::MLGOHeuristics() : _indices{}, _trees{}, _tree_valid{}, _valid{false}
 {
 }
 
@@ -59,71 +60,74 @@
 {
     ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm type. %s.", to_string(query).c_str());
     const auto invalid = GEMMType::RESHAPED;
-    if(!_valid)
+    if (!_valid)
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
-        return { false, invalid };
+        return {false, invalid};
     }
     auto      index = std::make_tuple(HeuristicType::GEMM_Type, query.ip_target, query.data_type);
-    GEMMShape shape_query{ query.m, query.n, query.k, query.b };
-    if(_trees.find(index) == _trees.end())
+    GEMMShape shape_query{query.m, query.n, query.k, query.b};
+    if (_trees.find(index) == _trees.end())
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
-        return { false, invalid };
+        return {false, invalid};
     }
     return _trees.at(index).query<GEMMType>(shape_query);
 }
 std::pair<bool, GEMMConfigNative> MLGOHeuristics::query_gemm_config_native(const Query &query) const
 {
-    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.", to_string(query).c_str());
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config native. %s.",
+                                              to_string(query).c_str());
     const auto invalid = GEMMConfigNative{};
-    if(!_valid)
+    if (!_valid)
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
-        return { false, invalid };
+        return {false, invalid};
     }
     auto      index = std::make_tuple(HeuristicType::GEMM_Config_Native, query.ip_target, query.data_type);
-    GEMMShape shape_query{ query.m, query.n, query.k, query.b };
-    if(_trees.find(index) == _trees.end())
+    GEMMShape shape_query{query.m, query.n, query.k, query.b};
+    if (_trees.find(index) == _trees.end())
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
-        return { false, invalid };
+        return {false, invalid};
     }
     return _trees.at(index).query<GEMMConfigNative>(shape_query);
 }
 std::pair<bool, GEMMConfigReshapedOnlyRHS> MLGOHeuristics::query_gemm_config_reshaped_only_rhs(const Query &query) const
 {
-    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.", to_string(query).c_str());
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped only rhs. %s.",
+                                              to_string(query).c_str());
     const auto invalid = GEMMConfigReshapedOnlyRHS{};
-    if(!_valid)
+    if (!_valid)
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
-        return { false, invalid };
+        return {false, invalid};
     }
     auto      index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped_Only_RHS, query.ip_target, query.data_type);
-    GEMMShape shape_query{ query.m, query.n, query.k, query.b };
-    if(_trees.find(index) == _trees.end())
+    GEMMShape shape_query{query.m, query.n, query.k, query.b};
+    if (_trees.find(index) == _trees.end())
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
-        return { false, invalid };
+        return {false, invalid};
     }
     return _trees.at(index).query<GEMMConfigReshapedOnlyRHS>(shape_query);
 }
 std::pair<bool, GEMMConfigReshaped> MLGOHeuristics::query_gemm_config_reshaped(const Query &query) const
 {
-    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.", to_string(query).c_str());
+    ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("MLGOHeuristics querying gemm config reshaped. %s.",
+                                              to_string(query).c_str());
     const auto invalid = GEMMConfigReshaped{};
-    if(!_valid)
+    if (!_valid)
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Invalid DotMLGO. Use default heuristics instead");
-        return { false, invalid };
+        return {false, invalid};
     }
     auto      index = std::make_tuple(HeuristicType::GEMM_Config_Reshaped, query.ip_target, query.data_type);
-    GEMMShape shape_query{ query.m, query.n, query.k, query.b };
-    if(_trees.find(index) == _trees.end())
+    GEMMShape shape_query{query.m, query.n, query.k, query.b};
+    if (_trees.find(index) == _trees.end())
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
-        return { false, invalid };
+        return {false, invalid};
     }
     return _trees.at(index).query<GEMMConfigReshaped>(shape_query);
 }
@@ -131,14 +135,14 @@
 bool MLGOHeuristics::check_heuristic_tree(HeuristicTree::TreeID id)
 {
     bool           status;
-    HeuristicTree *tree{ nullptr };
+    HeuristicTree *tree{nullptr};
     std::tie(status, tree) = get_heuristic_tree(id);
-    if(!status)
+    if (!status)
     {
         return status;
     }
     status = tree->check();
-    if(!status)
+    if (!status)
     {
         return status;
     }
@@ -149,14 +153,12 @@
 bool MLGOHeuristics::check_all() const
 {
     // Tree validities are already checked and cached.
-    bool all_trees_are_checked = std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v)
+    bool all_trees_are_checked =
+        std::find_if(_tree_valid.begin(), _tree_valid.end(), [](auto v) { return !v.second; }) == _tree_valid.end();
+    if (!all_trees_are_checked)
     {
-        return !v.second;
-    })
-    == _tree_valid.end();
-    if(!all_trees_are_checked)
-    {
-        ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each tree is completed. This could also indicate there are no trees in the dotmlgo");
+        ARM_COMPUTE_LOG_INFO_MSG_CORE("Missing checks on some trees. Make sure to call check_heuristic_tree after each "
+                                      "tree is completed. This could also indicate there are no trees in the dotmlgo");
         return false;
     }
 
@@ -167,14 +169,14 @@
 
 std::pair<bool, HeuristicTree *> MLGOHeuristics::get_heuristic_tree(HeuristicTree::TreeID id)
 {
-    if(_indices.find(id) == _indices.end())
+    if (_indices.find(id) == _indices.end())
     {
         ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot find tree with id %zu", id);
         return std::make_pair(false, nullptr);
     }
     const auto index = _indices[id];
 
-    if(_trees.find(index) == _trees.end())
+    if (_trees.find(index) == _trees.end())
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot find tree index");
         return std::make_pair(false, nullptr);
@@ -186,7 +188,7 @@
 
 bool MLGOHeuristics::add_heuristic_tree(HeuristicTree &&t)
 {
-    if(_indices.size() >= _max_num_trees)
+    if (_indices.size() >= _max_num_trees)
     {
         ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Exceeding the max number of trees allowed: %zu", _max_num_trees);
         return false;
@@ -194,7 +196,7 @@
     // PRE: correctness of t is guaranteed by the tree construction process
     // Ensure unique id
     const auto id = t.id();
-    if(_indices.find(id) != _indices.end())
+    if (_indices.find(id) != _indices.end())
     {
         ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot add redundant trees; tree id %zu already exists", id);
         return false;
@@ -202,7 +204,7 @@
 
     // Ensure unique index
     const auto index = t.index();
-    if(_trees.find(index) != _trees.end())
+    if (_trees.find(index) != _trees.end())
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("Cannot add redundant trees; tree index already exists");
         return false;
@@ -219,9 +221,10 @@
     std::ifstream fs;
     fs.exceptions(std::ifstream::badbit);
     fs.open(filename, std::ios::in);
-    if(!fs.is_open())
+    if (!fs.is_open())
     {
-        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead", filename.c_str());
+        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Cannot open DotMLGO file %s. Use default heuristics instead",
+                                                  filename.c_str());
         return _valid = false;
     }
     return reload_from_stream(fs);
@@ -230,7 +233,7 @@
 bool MLGOHeuristics::reload_from_stream(std::istream &in)
 {
     auto parsed = parser::parse_mlgo(in);
-    if(!parsed.first)
+    if (!parsed.first)
     {
         ARM_COMPUTE_LOG_INFO_MSG_CORE("DotMLGO parsing failed. Use default heuristics instead");
         return _valid = false;
@@ -241,4 +244,4 @@
 }
 
 } // namespace mlgo
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
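
A side note on the operator== overloads reformatted earlier in this file: they use the std::tie idiom, packing each object's members into a tuple of references so that tuple's lexicographic operator== performs the memberwise comparison. A reduced example with a hypothetical config struct:

    #include <tuple>

    struct GEMMConfigExample
    {
        unsigned int m0{1}, n0{1}, k0{1};
        bool         transpose_rhs{false};
    };

    bool operator==(const GEMMConfigExample &lhs, const GEMMConfigExample &rhs)
    {
        // std::tie builds tuples of references; tuple::operator== compares
        // element by element, so adding a member means extending both lists.
        return std::tie(lhs.m0, lhs.n0, lhs.k0, lhs.transpose_rhs) ==
               std::tie(rhs.m0, rhs.n0, rhs.k0, rhs.transpose_rhs);
    }
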
diff --git a/src/runtime/CL/mlgo/MLGOHeuristics.h b/src/runtime/CL/mlgo/MLGOHeuristics.h
index aa21225..6a491c5 100644
--- a/src/runtime/CL/mlgo/MLGOHeuristics.h
+++ b/src/runtime/CL/mlgo/MLGOHeuristics.h
@@ -135,16 +135,16 @@
     bool check_all() const;
 
 private:
-    static constexpr size_t _max_num_trees{ 100 }; /**< Max number of trees that can be added */
+    static constexpr size_t _max_num_trees{100}; /**< Max number of trees that can be added */
 
 private:
     // There exists a one-to-one mapping between TreeID and Index; either can be used to identify a @ref HeuristicTree
     std::map<HeuristicTree::TreeID, HeuristicTree::Index> _indices;    /**< A mapping from TreeID to Index */
     std::map<HeuristicTree::Index, HeuristicTree>         _trees;      /**< A mapping from Index to HeuristicTree */
     std::map<HeuristicTree::TreeID, bool>                 _tree_valid; /**< Result cache of the tree validity checks */
-    bool _valid;                                                       /**< Overall validity */
+    bool                                                  _valid;      /**< Overall validity */
 };
 
 } // namespace mlgo
 } // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H
\ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_MLGO_HEURISTICS_H
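
The parser in the next file threads a single bool &valid flag through every parse function; the CHECK/CHECK_DEFAULT macros evaluate a sub-parse and return early the moment that flag drops. A reduced sketch of the style, without the token-position logging (Stream and int_val are simplified stand-ins):

    // Evaluate a sub-parse; bail out of the enclosing function on failure.
    #define CHECK_DEFAULT(parser_expr, valid_var, default_val) \
        (parser_expr);                                         \
        if (!(valid_var))                                      \
            return default_val;

    struct Stream
    {
        int next; // stand-in for a real token stream
    };

    int int_val(Stream &in, bool &valid)
    {
        valid = true; // a real parser would verify the token type here
        return in.next;
    }

    unsigned int uint_val(Stream &in, bool &valid)
    {
        int val = CHECK_DEFAULT(int_val(in, valid), valid, 0u);
        if (val < 0)
        {
            valid = false; // a negative literal cannot be unsigned
            return 0u;
        }
        return static_cast<unsigned int>(val);
    }
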
diff --git a/src/runtime/CL/mlgo/MLGOParser.cpp b/src/runtime/CL/mlgo/MLGOParser.cpp
index 625739e..893daf2 100644
--- a/src/runtime/CL/mlgo/MLGOParser.cpp
+++ b/src/runtime/CL/mlgo/MLGOParser.cpp
@@ -22,19 +22,21 @@
  * SOFTWARE.
  */
 #include "src/runtime/CL/mlgo/MLGOParser.h"
+
 #include "arm_compute/core/Log.h"
+
 #include "src/runtime/CL/mlgo/Utils.h"
 
 #include <sstream>
 
 #define CHECK(parser_expr, valid_var) \
     (parser_expr);                    \
-    if(!valid_var)                    \
+    if (!valid_var)                   \
         return;
 
 #define CHECK_DEFAULT(parser_expr, valid_var, default_val) \
     (parser_expr);                                         \
-    if(!valid_var)                                         \
+    if (!valid_var)                                        \
         return default_val;
 
 #ifdef ARM_COMPUTE_LOGGING_ENABLED
@@ -53,8 +55,7 @@
     valid_var = false;                                          \
     return default_val;
 
-#define LOG_TOKEN_POS(tokens, pos_var) \
-    const auto pos_var = tokens.current_pos();
+#define LOG_TOKEN_POS(tokens, pos_var) const auto pos_var = tokens.current_pos();
 
 #else // ARM_COMPUTE_LOGGING_ENABLED
 
@@ -73,19 +74,12 @@
 {
 void ltrim(std::string &str)
 {
-    str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch)
-    {
-        return !std::isspace(ch);
-    }));
+    str.erase(str.begin(), std::find_if(str.begin(), str.end(), [](char ch) { return !std::isspace(ch); }));
 }
 
 void rtrim(std::string &str)
 {
-    str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch)
-    {
-        return !std::isspace(ch);
-    }).base(),
-    str.end());
+    str.erase(std::find_if(str.rbegin(), str.rend(), [](char ch) { return !std::isspace(ch); }).base(), str.end());
 }
 
 void trim(std::string &str)
@@ -109,7 +103,7 @@
 };
 
 TokenStream::TokenStream(std::istream &s, const std::string &delims)
-    : _delims{ delims }, _istream{ s }, _tokens{}, _lookahead_pos{}
+    : _delims{delims}, _istream{s}, _tokens{}, _lookahead_pos{}
 {
     read();
 }
@@ -125,7 +119,7 @@
     ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty");
     Token t = _tokens.front();
     _tokens.pop_front();
-    if(_tokens.empty())
+    if (_tokens.empty())
     {
         read();
     }
@@ -136,7 +130,7 @@
     ARM_COMPUTE_ERROR_ON_MSG(_tokens.empty(), "TokenStream can never be empty");
     ARM_COMPUTE_ERROR_ON_MSG(i >= max_look_ahead, "TokenStream: Exceeding max look ahead");
     // NOTE: If i exceeds the stream (_istream.eof()), read() automatically appends a End token at the end
-    while(_istream && _tokens.size() <= i)
+    while (_istream && _tokens.size() <= i)
     {
         read();
     }
@@ -146,7 +140,7 @@
 
 void advance(CharPosition &pos, char ch)
 {
-    if(ch == '\n')
+    if (ch == '\n')
     {
         pos.ln += 1;
         pos.col = 0;
@@ -167,17 +161,16 @@
     do
     {
         // Reached eof
-        if(!_istream.get(ch))
+        if (!_istream.get(ch))
         {
-            if(!reached_end())
+            if (!reached_end())
             {
                 _tokens.emplace_back(TokenType::End, "", _lookahead_pos);
             }
             return;
         }
         advance(_lookahead_pos, ch);
-    }
-    while(std::isspace(ch) || is_delim(ch));
+    } while (std::isspace(ch) || is_delim(ch));
     // Read chars until we hit a delim or eof
     auto orig_pos = _lookahead_pos;
     auto tok      = recognize_tok(ch);
@@ -190,41 +183,41 @@
 
 Token TokenStream::recognize_tok(char ch)
 {
-    if(ch == '[')
+    if (ch == '[')
     {
-        return Token{ TokenType::L_List, "", _lookahead_pos };
+        return Token{TokenType::L_List, "", _lookahead_pos};
     }
-    else if(ch == ']')
+    else if (ch == ']')
     {
-        return Token{ TokenType::R_List, "", _lookahead_pos };
+        return Token{TokenType::R_List, "", _lookahead_pos};
     }
-    else if(ch == '.')
+    else if (ch == '.')
     {
-        return float_after_dp_st(std::string{ ch });
+        return float_after_dp_st(std::string{ch});
     }
-    else if(std::isdigit(ch))
+    else if (std::isdigit(ch))
     {
-        return num_st(std::string{ ch });
+        return num_st(std::string{ch});
     }
     else
     {
-        return text_st(std::string{ ch });
+        return text_st(std::string{ch});
     }
 }
 
 Token TokenStream::num_st(std::string value)
 {
     char ch{};
-    while(_istream.get(ch))
+    while (_istream.get(ch))
     {
         advance(_lookahead_pos, ch);
-        if(ch == '.')
+        if (ch == '.')
         {
             return float_after_dp_st(value + ch);
         }
-        else if(!std::isdigit(ch))
+        else if (!std::isdigit(ch))
         {
-            if(!is_delim(ch) && !std::isspace(ch))
+            if (!is_delim(ch) && !std::isspace(ch))
             {
                 rewind(_lookahead_pos);
                 _istream.unget();
@@ -233,18 +226,18 @@
         }
         value += ch;
     }
-    return Token{ TokenType::Int, value, _lookahead_pos };
+    return Token{TokenType::Int, value, _lookahead_pos};
 }
 
 Token TokenStream::float_after_dp_st(std::string value)
 {
     char ch{};
-    while(_istream.get(ch))
+    while (_istream.get(ch))
     {
         advance(_lookahead_pos, ch);
-        if(!std::isdigit(ch))
+        if (!std::isdigit(ch))
         {
-            if(!is_delim(ch) && !std::isspace(ch))
+            if (!is_delim(ch) && !std::isspace(ch))
             {
                 rewind(_lookahead_pos);
                 _istream.unget();
@@ -253,20 +246,20 @@
         }
         value += ch;
     }
-    return Token{ TokenType::Float, value, _lookahead_pos };
+    return Token{TokenType::Float, value, _lookahead_pos};
 }
 
 Token TokenStream::text_st(std::string value)
 {
     char ch{};
-    while(_istream.get(ch))
+    while (_istream.get(ch))
     {
         advance(_lookahead_pos, ch);
-        if(is_delim(ch))
+        if (is_delim(ch))
         {
             break;
         }
-        if(ch == '[' || ch == ']')
+        if (ch == '[' || ch == ']')
         {
             rewind(_lookahead_pos);
             _istream.unget();
@@ -274,7 +267,7 @@
         }
         value += ch;
     }
-    return Token{ TokenType::Text, value, _lookahead_pos };
+    return Token{TokenType::Text, value, _lookahead_pos};
 }
 
 bool TokenStream::reached_end() const
@@ -291,7 +284,7 @@
 {
     LOG_TOKEN_POS(in, pos);
     auto tok = in.take();
-    if(tok.type != TokenType::End)
+    if (tok.type != TokenType::End)
     {
         FAIL_WITH_MSG(valid, pos, "Unexpected token at the end of stream");
     }
@@ -301,7 +294,7 @@
 {
     LOG_TOKEN_POS(in, pos);
     auto tok = in.take();
-    if(tok.type != TokenType::Int)
+    if (tok.type != TokenType::Int)
     {
         FAIL_WITH_MSG_DEFAULT(valid, false, pos, "Expect bool or int token");
     }
@@ -314,7 +307,7 @@
 {
     LOG_TOKEN_POS(in, pos);
     auto tok = in.take();
-    if(tok.type != TokenType::Int)
+    if (tok.type != TokenType::Int)
     {
         FAIL_WITH_MSG_DEFAULT(valid, -1, pos, "Expect int token");
     }
@@ -327,7 +320,7 @@
 {
     LOG_TOKEN_POS(in, pos);
     int val = CHECK_DEFAULT(int_val(in, valid), valid, 0);
-    if(val < 0)
+    if (val < 0)
     {
         FAIL_WITH_MSG_DEFAULT(valid, 0, pos, "Expect unsigned int token");
     }
@@ -338,7 +331,7 @@
 {
     LOG_TOKEN_POS(in, pos);
     auto tok = in.take();
-    if(tok.type != TokenType::Float)
+    if (tok.type != TokenType::Float)
     {
         FAIL_WITH_MSG_DEFAULT(valid, 0.f, pos, "Expect float token");
     }
@@ -351,7 +344,7 @@
 {
     LOG_TOKEN_POS(in, pos);
     auto tok = in.take();
-    if(tok.type != TokenType::Text || tok.value.empty())
+    if (tok.type != TokenType::Text || tok.value.empty())
     {
         FAIL_WITH_MSG_DEFAULT(valid, "", pos, "Expect a non-empty text token");
     }
@@ -361,9 +354,9 @@
 bool accept_text(TokenStream &in, const std::string &c_str, bool take = true)
 {
     auto tok = in.peek();
-    if(tok.type == TokenType::Text && tok.value == c_str)
+    if (tok.type == TokenType::Text && tok.value == c_str)
     {
-        if(take)
+        if (take)
         {
             in.take();
         }
@@ -375,7 +368,7 @@
 void expect_text(TokenStream &in, const std::string &str, bool &valid)
 {
     LOG_TOKEN_POS(in, pos);
-    if(!accept_text(in, str))
+    if (!accept_text(in, str))
     {
         FAIL_WITH_MSG(valid, pos, std::string("Expect text token: ") + str);
     }
@@ -384,7 +377,7 @@
 bool accept_l_list(TokenStream &in)
 {
     auto tok = in.peek();
-    if(tok.type == TokenType::L_List)
+    if (tok.type == TokenType::L_List)
     {
         in.take();
         return true;
@@ -395,7 +388,7 @@
 void expect_l_list(TokenStream &in, bool &valid)
 {
     LOG_TOKEN_POS(in, pos);
-    if(!accept_l_list(in))
+    if (!accept_l_list(in))
     {
         FAIL_WITH_MSG(valid, pos, "Expect '['");
     }
@@ -404,7 +397,7 @@
 bool accept_r_list(TokenStream &in)
 {
     auto tok = in.peek();
-    if(tok.type == TokenType::R_List)
+    if (tok.type == TokenType::R_List)
     {
         in.take();
         return true;
@@ -415,7 +408,7 @@
 void expect_r_list(TokenStream &in, bool &valid)
 {
     LOG_TOKEN_POS(in, pos);
-    if(!accept_r_list(in))
+    if (!accept_r_list(in))
     {
         FAIL_WITH_MSG(valid, pos, "Expect ']'");
     }
@@ -424,23 +417,23 @@
 ConditionalOp conditional_op(TokenStream &in, bool &valid)
 {
     LOG_TOKEN_POS(in, pos);
-    if(accept_text(in, "<="))
+    if (accept_text(in, "<="))
     {
         return ConditionalOp::LE;
     }
-    else if(accept_text(in, ">="))
+    else if (accept_text(in, ">="))
     {
         return ConditionalOp::GE;
     }
-    else if(accept_text(in, "=="))
+    else if (accept_text(in, "=="))
     {
         return ConditionalOp::EQ;
     }
-    else if(accept_text(in, "<"))
+    else if (accept_text(in, "<"))
     {
         return ConditionalOp::LT;
     }
-    else if(accept_text(in, ">"))
+    else if (accept_text(in, ">"))
     {
         return ConditionalOp::GT;
     }
@@ -464,11 +457,11 @@
 {
     CHECK(expect_text(in, "ip-type", valid), valid);
     LOG_TOKEN_POS(in, pos);
-    if(accept_text(in, "gpu"))
+    if (accept_text(in, "gpu"))
     {
         ;
     }
-    else if(accept_text(in, "cpu"))
+    else if (accept_text(in, "cpu"))
     {
         ;
     }
@@ -489,15 +482,15 @@
 DataType data_type(TokenStream &in, bool &valid)
 {
     LOG_TOKEN_POS(in, pos);
-    if(accept_text(in, "f16"))
+    if (accept_text(in, "f16"))
     {
         return DataType::F16;
     }
-    else if(accept_text(in, "f32"))
+    else if (accept_text(in, "f32"))
     {
         return DataType::F32;
     }
-    else if(accept_text(in, "qasymm8"))
+    else if (accept_text(in, "qasymm8"))
     {
         return DataType::QASYMM8;
     }
@@ -510,15 +503,15 @@
 ComparatorType comparator_type(TokenStream &in, bool &valid)
 {
     LOG_TOKEN_POS(in, pos);
-    if(accept_text(in, "var"))
+    if (accept_text(in, "var"))
     {
         return ComparatorType::Var;
     }
-    else if(accept_text(in, "num"))
+    else if (accept_text(in, "num"))
     {
         return ComparatorType::Num;
     }
-    else if(accept_text(in, "enum"))
+    else if (accept_text(in, "enum"))
     {
         return ComparatorType::Enum;
     }
@@ -531,19 +524,19 @@
 HeuristicType heuristic_type(TokenStream &in, bool &valid, bool take = true)
 {
     LOG_TOKEN_POS(in, pos);
-    if(accept_text(in, "gemm-type", take))
+    if (accept_text(in, "gemm-type", take))
     {
         return HeuristicType::GEMM_Type;
     }
-    else if(accept_text(in, "gemm-config-native", take))
+    else if (accept_text(in, "gemm-config-native", take))
     {
         return HeuristicType::GEMM_Config_Native;
     }
-    else if(accept_text(in, "gemm-config-reshaped-only-rhs", take))
+    else if (accept_text(in, "gemm-config-reshaped-only-rhs", take))
     {
         return HeuristicType::GEMM_Config_Reshaped_Only_RHS;
     }
-    else if(accept_text(in, "gemm-config-reshaped", take))
+    else if (accept_text(in, "gemm-config-reshaped", take))
     {
         return HeuristicType::GEMM_Config_Reshaped;
     }
@@ -557,7 +550,7 @@
 {
     LOG_TOKEN_POS(in, pos);
     auto ht = CHECK(heuristic_type(in, valid, false), valid);
-    if(ht != expected_ht)
+    if (ht != expected_ht)
     {
         FAIL_WITH_MSG(valid, pos, "Unexpected heuristic type");
     }
@@ -567,15 +560,15 @@
 GEMMType gemm_type(TokenStream &in, bool &valid)
 {
     LOG_TOKEN_POS(in, pos);
-    if(accept_text(in, "native"))
+    if (accept_text(in, "native"))
     {
         return GEMMType::NATIVE;
     }
-    else if(accept_text(in, "reshaped-only-rhs"))
+    else if (accept_text(in, "reshaped-only-rhs"))
     {
         return GEMMType::RESHAPED_ONLY_RHS;
     }
-    else if(accept_text(in, "reshaped"))
+    else if (accept_text(in, "reshaped"))
     {
         return GEMMType::RESHAPED;
     }
@@ -593,7 +586,7 @@
     const auto n0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val);
     const auto k0 = CHECK_DEFAULT(uint_val(in, valid), valid, invalid_val);
     CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val);
-    return GEMMConfigNative{ m0, n0, k0 };
+    return GEMMConfigNative{m0, n0, k0};
 }
 
 GEMMConfigReshapedOnlyRHS gemm_config_reshaped_only_rhs(TokenStream &in, bool &valid)
@@ -608,7 +601,7 @@
     const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
     const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
     CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val);
-    return GEMMConfigReshapedOnlyRHS{ m0, n0, k0, h0, ir, tr, ex };
+    return GEMMConfigReshapedOnlyRHS{m0, n0, k0, h0, ir, tr, ex};
 }
 
 GEMMConfigReshaped gemm_config_reshaped(TokenStream &in, bool &valid)
@@ -625,17 +618,17 @@
     const auto tr = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
     const auto ex = CHECK_DEFAULT(bool_val(in, valid), valid, invalid_val);
     CHECK_DEFAULT(expect_r_list(in, valid), valid, invalid_val);
-    return GEMMConfigReshaped{ m0, n0, k0, v0, h0, il, ir, tr, ex };
+    return GEMMConfigReshaped{m0, n0, k0, v0, h0, il, ir, tr, ex};
 }
 
 void gpu_priority(TokenStream &in, bool &valid)
 {
     LOG_TOKEN_POS(in, pos);
-    if(accept_text(in, "best-performance"))
+    if (accept_text(in, "best-performance"))
     {
         ;
     }
-    else if(accept_text(in, "best-memory-usage"))
+    else if (accept_text(in, "best-memory-usage"))
     {
         ;
     }
@@ -648,11 +641,11 @@
 void gpu_behavior(TokenStream &in, bool &valid)
 {
     LOG_TOKEN_POS(in, pos);
-    if(accept_text(in, "static"))
+    if (accept_text(in, "static"))
     {
         ;
     }
-    else if(accept_text(in, "dynamic"))
+    else if (accept_text(in, "dynamic"))
     {
         ;
     }
@@ -665,7 +658,7 @@
 void free_vars(TokenStream &in, bool &valid)
 {
     CHECK(expect_l_list(in, valid), valid);
-    while(!accept_r_list(in))
+    while (!accept_r_list(in))
     {
         CHECK(text_val(in, valid), valid);
     }
@@ -688,7 +681,7 @@
 void heuristics_table(TokenStream &in, MLGOHeuristics &h, bool &valid)
 {
     CHECK(expect_text(in, "<heuristics-table>", valid), valid);
-    while(!accept_text(in, "</heuristics-table>"))
+    while (!accept_text(in, "</heuristics-table>"))
     {
         CHECK(heuristics_table_entry(in, h, valid), valid);
     }
@@ -705,11 +698,12 @@
     const auto c_o         = CHECK_DEFAULT(conditional_op(in, valid), valid, invalid_val);
     const auto r_t         = CHECK_DEFAULT(comparator_type(in, valid), valid, invalid_val);
     const auto r_v         = CHECK_DEFAULT(float_val(in, valid), valid, invalid_val);
-    if(l_t != ComparatorType::Var || r_t != ComparatorType::Num)
+    if (l_t != ComparatorType::Var || r_t != ComparatorType::Num)
     {
-        FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos, "Only accept LHS type to be Var (string) and RHS type to be Num (float)");
+        FAIL_WITH_MSG_DEFAULT(valid, invalid_val, pos,
+                              "Only accept LHS type to be Var (string) and RHS type to be Num (float)");
     }
-    return Condition{ l_v, c_o, r_v };
+    return Condition{l_v, c_o, r_v};
 }
 
 void heuristic_tree(TokenStream &in, MLGOHeuristics &h, bool &valid)
@@ -717,13 +711,13 @@
     CHECK(expect_text(in, "<heuristic", valid), valid);
     const auto tree_id = CHECK(uint_val(in, valid), valid);
     CHECK(expect_text(in, ">", valid), valid);
-    HeuristicTree *t = nullptr;
-    std::tie(valid, t) = CHECK(h.get_heuristic_tree(tree_id), valid);
+    HeuristicTree *t                     = nullptr;
+    std::tie(valid, t)                   = CHECK(h.get_heuristic_tree(tree_id), valid);
     const HeuristicType t_heuristic_type = std::get<0>(t->index());
-    while(!accept_text(in, "</heuristic>"))
+    while (!accept_text(in, "</heuristic>"))
     {
         LOG_TOKEN_POS(in, pos);
-        if(accept_text(in, "b"))
+        if (accept_text(in, "b"))
         {
             // Branch node
             const auto id   = CHECK(uint_val(in, valid), valid);
@@ -732,7 +726,7 @@
             const auto f_id = CHECK(uint_val(in, valid), valid);
             valid           = CHECK(t->add_branch(id, cond, t_id, f_id), valid);
         }
-        else if(accept_text(in, "l"))
+        else if (accept_text(in, "l"))
         {
             // Leaf node
             const auto id = CHECK(uint_val(in, valid), valid);
@@ -740,7 +734,7 @@
             // heuristic table). For now it remains as a step for validation.
             LOG_TOKEN_POS(in, pos);
             CHECK(expect_heuristic_type(in, t_heuristic_type, valid), valid);
-            switch(t_heuristic_type)
+            switch (t_heuristic_type)
             {
                 case HeuristicType::GEMM_Type:
                 {
@@ -786,7 +780,7 @@
     MLGOHeuristics h;
     CHECK_DEFAULT(header(in, valid), valid, h);
     CHECK_DEFAULT(heuristics_table(in, h, valid), valid, h);
-    while(accept_text(in, "<heuristic", false))
+    while (accept_text(in, "<heuristic", false))
     {
         CHECK_DEFAULT(heuristic_tree(in, h, valid), valid, h);
     }
@@ -809,4 +803,4 @@
 #undef CHECK
 #undef CHECK_DEFAULT
 #undef FAIL_WITH_MSG
-#undef FAIL_WITH_MSG_DEFAULT
\ No newline at end of file
+#undef FAIL_WITH_MSG_DEFAULT
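
The num_st()/float_after_dp_st() pair reformatted above is a small hand-rolled lexer state machine: digits accumulate into an Int token until a '.' switches to the after-decimal-point state, and any other non-delimiter character is pushed back with unget(). A minimal standalone sketch of that state machine follows, with hypothetical names and without the TokenStream, position tracking, or delimiter handling:

// Minimal, self-contained sketch of the Int/Float tokenising state machine.
// Hypothetical code: it mirrors num_st()/float_after_dp_st() in spirit only.
#include <cctype>
#include <iostream>
#include <sstream>
#include <string>

enum class TokType { Int, Float };

TokType lex_number(std::istream &in, char first, std::string &value)
{
    value.assign(1, first);
    bool seen_dot = (first == '.');
    char ch{};
    while (in.get(ch))
    {
        if (ch == '.' && !seen_dot)
        {
            seen_dot = true; // switch to the "after decimal point" state
        }
        else if (!std::isdigit(static_cast<unsigned char>(ch)))
        {
            in.unget(); // push the terminator back, as num_st() does
            break;
        }
        value += ch;
    }
    return seen_dot ? TokType::Float : TokType::Int;
}

int main()
{
    std::istringstream in("2.5]");
    char               first{};
    in.get(first);
    std::string value;
    TokType     t = lex_number(in, first, value);
    std::cout << value << " -> " << (t == TokType::Float ? "Float" : "Int") << '\n'; // 2.5 -> Float
}

In the real parser each accepted character also advances _lookahead_pos, which is what lets FAIL_WITH_MSG report an exact line and column.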
diff --git a/src/runtime/CL/mlgo/MLGOParser.h b/src/runtime/CL/mlgo/MLGOParser.h
index 49d8b9c..cffce8d 100644
--- a/src/runtime/CL/mlgo/MLGOParser.h
+++ b/src/runtime/CL/mlgo/MLGOParser.h
@@ -98,15 +98,14 @@
         return ln == other.ln && col == other.col;
     }
 
-    size_t ln{ 0 };
-    size_t col{ 0 };
+    size_t ln{0};
+    size_t col{0};
 };
 
 /** Token */
 struct Token
 {
-    Token(TokenType t, std::string v, CharPosition pos)
-        : type{ t }, value{ v }, pos{ pos }
+    Token(TokenType t, std::string v, CharPosition pos) : type{t}, value{v}, pos{pos}
     {
     }
 
@@ -196,4 +195,4 @@
 } // namespace parser
 } // namespace mlgo
 } // namespace arm_compute
-#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H
\ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_MLGO_PARSER_H
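
CharPosition carries the ln/col pair that advance() (shown in MLGOParser.cpp above) updates for every consumed character: a newline bumps the line and resets the column, anything else bumps the column. A standalone sketch of that bookkeeping, outside the library:

// Line/column bookkeeping as done by advance() above (standalone sketch).
#include <cstddef>
#include <iostream>
#include <string>

struct Pos
{
    std::size_t ln{0};
    std::size_t col{0};
};

void advance(Pos &pos, char ch)
{
    if (ch == '\n')
    {
        pos.ln += 1;
        pos.col = 0;
    }
    else
    {
        pos.col += 1;
    }
}

int main()
{
    Pos pos;
    for (char ch : std::string("ab\ncd"))
    {
        advance(pos, ch);
    }
    std::cout << "ln=" << pos.ln << " col=" << pos.col << '\n'; // ln=1 col=2
}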
diff --git a/src/runtime/CL/mlgo/Utils.cpp b/src/runtime/CL/mlgo/Utils.cpp
index 81d418c..c7e0100 100644
--- a/src/runtime/CL/mlgo/Utils.cpp
+++ b/src/runtime/CL/mlgo/Utils.cpp
@@ -43,40 +43,38 @@
 std::ostream &operator<<(std::ostream &os, const GEMMConfigNative &config)
 {
     return os << "Native:{"
-           << "m0: " << config.m0 << ", "
-           << "n0: " << config.n0 << ", "
-           << "k0: " << config.k0 << ", "
-           << "}";
+              << "m0: " << config.m0 << ", "
+              << "n0: " << config.n0 << ", "
+              << "k0: " << config.k0 << ", "
+              << "}";
 }
 std::ostream &operator<<(std::ostream &os, const GEMMConfigReshapedOnlyRHS &config)
 {
     return os << "ReshapedOnlyRHS:{"
-           << "m0: " << config.m0 << ", "
-           << "n0: " << config.n0 << ", "
-           << "k0: " << config.k0 << ", "
-           << "h0: " << config.h0 << ", "
-           << "interleave_rhs: " << config.interleave_rhs << ", "
-           << "transpose_rhs: " << config.transpose_rhs << ", "
-           << "export_cl_image: " << config.export_cl_image
-           << "}";
+              << "m0: " << config.m0 << ", "
+              << "n0: " << config.n0 << ", "
+              << "k0: " << config.k0 << ", "
+              << "h0: " << config.h0 << ", "
+              << "interleave_rhs: " << config.interleave_rhs << ", "
+              << "transpose_rhs: " << config.transpose_rhs << ", "
+              << "export_cl_image: " << config.export_cl_image << "}";
 }
 std::ostream &operator<<(std::ostream &os, const GEMMConfigReshaped &config)
 {
     return os << "Reshaped:{"
-           << "m0: " << config.m0 << ", "
-           << "n0: " << config.n0 << ", "
-           << "k0: " << config.k0 << ", "
-           << "v0: " << config.v0 << ", "
-           << "h0: " << config.h0 << ", "
-           << "interleave_lhs: " << config.interleave_lhs << ", "
-           << "interleave_rhs: " << config.interleave_rhs << ", "
-           << "transpose_rhs: " << config.transpose_rhs << ", "
-           << "export_cl_image: " << config.export_cl_image
-           << "}";
+              << "m0: " << config.m0 << ", "
+              << "n0: " << config.n0 << ", "
+              << "k0: " << config.k0 << ", "
+              << "v0: " << config.v0 << ", "
+              << "h0: " << config.h0 << ", "
+              << "interleave_lhs: " << config.interleave_lhs << ", "
+              << "interleave_rhs: " << config.interleave_rhs << ", "
+              << "transpose_rhs: " << config.transpose_rhs << ", "
+              << "export_cl_image: " << config.export_cl_image << "}";
 }
 std::ostream &operator<<(std::ostream &os, HeuristicType ht)
 {
-    switch(ht)
+    switch (ht)
     {
         case HeuristicType::GEMM_Type:
         {
@@ -103,7 +101,7 @@
 }
 std::ostream &operator<<(std::ostream &os, DataType dt)
 {
-    switch(dt)
+    switch (dt)
     {
         case DataType::F32:
         {
@@ -184,4 +182,4 @@
 
 } // namespace mlgo
 
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/CL/mlgo/Utils.h b/src/runtime/CL/mlgo/Utils.h
index c634a88..73b537f 100644
--- a/src/runtime/CL/mlgo/Utils.h
+++ b/src/runtime/CL/mlgo/Utils.h
@@ -43,10 +43,10 @@
 std::ostream &operator<<(std::ostream &os, DataType dt);
 std::ostream &operator<<(std::ostream &os, const HeuristicTree::Index &index);
 std::ostream &operator<<(std::ostream &os, const Query &query);
-std::string to_string(const GEMMConfigNative &config);
-std::string to_string(const GEMMConfigReshapedOnlyRHS &config);
-std::string to_string(const GEMMConfigReshaped &config);
-std::string to_string(const Query &query);
+std::string   to_string(const GEMMConfigNative &config);
+std::string   to_string(const GEMMConfigReshapedOnlyRHS &config);
+std::string   to_string(const GEMMConfigReshaped &config);
+std::string   to_string(const Query &query);
 namespace parser
 {
 std::ostream &operator<<(std::ostream &os, const CharPosition &pos);
@@ -54,4 +54,4 @@
 } // namespace mlgo
 } // namespace arm_compute
 
-#endif //SRC_RUNTIME_CL_MLGO_UTILS_H
\ No newline at end of file
+#endif //SRC_RUNTIME_CL_MLGO_UTILS_H
diff --git a/src/runtime/CL/tuners/CLTuningParametersList.cpp b/src/runtime/CL/tuners/CLTuningParametersList.cpp
index 6f3e324..5e3907f 100644
--- a/src/runtime/CL/tuners/CLTuningParametersList.cpp
+++ b/src/runtime/CL/tuners/CLTuningParametersList.cpp
@@ -27,20 +27,20 @@
 {
 namespace cl_tuner
 {
-constexpr unsigned int max_lws_supported_x{ 64u };
-constexpr unsigned int max_lws_supported_y{ 32u };
-constexpr unsigned int max_lws_supported_z{ 32u };
+constexpr unsigned int max_lws_supported_x{64u};
+constexpr unsigned int max_lws_supported_y{32u};
+constexpr unsigned int max_lws_supported_z{32u};
 
 /** Non-instantiable base class for tuning-parameter combinations that use Index2Coord mapping */
 class CLTuningParametersList : public ICLTuningParametersList
 {
 protected:
     /* Shape of 4-D search space */
-    TensorShape               search_space_shape{ 0, 0, 0, 0 };
-    std::vector<unsigned int> _lws_x{ 0 };
-    std::vector<unsigned int> _lws_y{ 0 };
-    std::vector<unsigned int> _lws_z{ 0 };
-    std::vector<int>          _wbsm{ 0 }; /* Modify the batches size of workgroups distributed to compute units.
+    TensorShape               search_space_shape{0, 0, 0, 0};
+    std::vector<unsigned int> _lws_x{0};
+    std::vector<unsigned int> _lws_y{0};
+    std::vector<unsigned int> _lws_z{0};
+    std::vector<int>          _wbsm{0}; /* Modify the batch size of workgroups distributed to compute units.
                                             The value is in the range [-31,+31].
                                             When 0, the runtime-selected wbs is used unmodified. */
 
@@ -116,7 +116,8 @@
      * @param[in]      lws_max     Max LWS value allowed to be tested
      * @param[in]      mod_let_one True if the result of the modulo operation between gws and the lws is allowed to be less than or equal to one.
      */
-    void initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one);
+    void
+    initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one);
 };
 
 /** A minimal subset of LWS values that only have 1,2 and 4/8 */
@@ -170,9 +171,9 @@
     search_space_shape[1] = lws_y_max;
     search_space_shape[2] = lws_z_max;
     search_space_shape[3] = 1;
-    if(tuning_info.tune_wbsm)
+    if (tuning_info.tune_wbsm)
     {
-        _wbsm                 = { -3, -2, -1, 0, 1, 2, 3 };
+        _wbsm                 = {-3, -2, -1, 0, 1, 2, 3};
         search_space_shape[3] = _wbsm.size();
     }
 }
@@ -194,26 +195,31 @@
     _lws_x = {};
     _lws_y = {};
     _lws_z = {};
-    initialize_lws_values(_lws_x, gws[0], lws_x_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
-    initialize_lws_values(_lws_y, gws[1], lws_y_max, gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+    initialize_lws_values(_lws_x, gws[0], lws_x_max,
+                          gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
+    initialize_lws_values(_lws_y, gws[1], lws_y_max,
+                          gws[2] > 16); // Explore lws that are not factors of gws only when gws[2] > 16
     initialize_lws_values(_lws_z, gws[2], lws_z_max, false);
 
     search_space_shape[0] = _lws_x.size();
     search_space_shape[1] = _lws_y.size();
     search_space_shape[2] = _lws_z.size();
     search_space_shape[3] = 1;
-    if(tuning_info.tune_wbsm)
+    if (tuning_info.tune_wbsm)
     {
-        _wbsm                 = { -2, -1, 0, 1, 2 };
+        _wbsm                 = {-2, -1, 0, 1, 2};
         search_space_shape[3] = _wbsm.size();
     }
 }
 
-void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws, unsigned int gws, unsigned int lws_max, bool mod_let_one)
+void CLTuningParametersListNormal::initialize_lws_values(std::vector<unsigned int> &lws,
+                                                         unsigned int               gws,
+                                                         unsigned int               lws_max,
+                                                         bool                       mod_let_one)
 {
     lws.push_back(1);
 
-    for(unsigned int i = 2; i <= lws_max; ++i)
+    for (unsigned int i = 2; i <= lws_max; ++i)
     {
         // Power of two condition
         const bool is_power_of_two = (i & (i - 1)) == 0;
@@ -221,7 +227,7 @@
         // Condition for the modulo, according to the mod_let_one flag
         const bool mod_cond = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
 
-        if(mod_cond || is_power_of_two)
+        if (mod_cond || is_power_of_two)
         {
             lws.push_back(i);
         }
@@ -246,9 +252,9 @@
     search_space_shape[1] = _lws_y.size();
     search_space_shape[2] = _lws_z.size();
     search_space_shape[3] = 1;
-    if(tuning_info.tune_wbsm)
+    if (tuning_info.tune_wbsm)
     {
-        _wbsm                 = { -1, 0, 1 };
+        _wbsm                 = {-1, 0, 1};
         search_space_shape[3] = _wbsm.size();
     }
 }
@@ -257,7 +263,7 @@
 {
     lws.push_back(1);
 
-    for(unsigned int i = 2; i <= lws_max; i *= 4)
+    for (unsigned int i = 2; i <= lws_max; i *= 4)
     {
         lws.push_back(i);
     }
@@ -265,7 +271,7 @@
 
 std::unique_ptr<ICLTuningParametersList> get_tuning_parameters_list(CLTuningInfo tuning_info, const cl::NDRange &gws)
 {
-    switch(tuning_info.tuner_mode)
+    switch (tuning_info.tuner_mode)
     {
         case CLTunerMode::EXHAUSTIVE:
             return std::make_unique<CLTuningParametersListExhaustive>(gws, tuning_info);
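
initialize_lws_values() above encodes the tuner's candidate filter: the LWS list is 1 plus every i in [2, lws_max] that is either a power of two ((i & (i - 1)) == 0) or divides gws, where "divides" is relaxed to a remainder of at most 1 when mod_let_one is set. A standalone sketch of that filter, assuming the same rules but outside any Arm Compute class:

// Hypothetical standalone version of the LWS candidate filter.
#include <iostream>
#include <vector>

std::vector<unsigned int> lws_candidates(unsigned int gws, unsigned int lws_max, bool mod_let_one)
{
    std::vector<unsigned int> lws{1}; // 1 is always a valid local work size
    for (unsigned int i = 2; i <= lws_max; ++i)
    {
        const bool is_power_of_two = (i & (i - 1)) == 0;
        const bool mod_cond        = mod_let_one ? (gws % i) <= 1 : (gws % i) == 0;
        if (mod_cond || is_power_of_two)
        {
            lws.push_back(i);
        }
    }
    return lws;
}

int main()
{
    for (unsigned int v : lws_candidates(12, 8, false))
    {
        std::cout << v << ' '; // prints: 1 2 3 4 6 8
    }
    std::cout << '\n';
}

The power-of-two escape hatch keeps cheap-to-schedule sizes in the search space even when they do not divide the global work size.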
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index 45e8724..9fbdc3a 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/Log.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/Utility.h"
+
 #include "support/Mutex.h"
 
 #include <atomic>
@@ -53,8 +54,7 @@
      * @param[in] start First value that will be returned by the feeder
      * @param[in] end   End condition (The last value returned by get_next() will be end - 1)
      */
-    explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0)
-        : _atomic_counter(start), _end(end)
+    explicit ThreadFeeder(unsigned int start = 0, unsigned int end = 0) : _atomic_counter(start), _end(end)
     {
     }
     /** Return the next element in the range if there is one.
@@ -89,8 +89,7 @@
     {
         ARM_COMPUTE_ERROR_ON(workload_index >= workloads.size());
         workloads[workload_index](info);
-    }
-    while(feeder.get_next(workload_index));
+    } while (feeder.get_next(workload_index));
 }
 
 /** Set thread affinity. Pin current thread to a particular core
@@ -99,7 +98,7 @@
  */
 void set_thread_affinity(int core_id)
 {
-    if(core_id < 0)
+    if (core_id < 0)
     {
         return;
     }
@@ -150,10 +149,10 @@
      */
     explicit Thread(int core_pin = -1);
 
-    Thread(const Thread &) = delete;
+    Thread(const Thread &)            = delete;
     Thread &operator=(const Thread &) = delete;
     Thread(Thread &&)                 = delete;
-    Thread &operator=(Thread &&) = delete;
+    Thread &operator=(Thread &&)      = delete;
 
     /** Destructor. Make the thread join. */
     ~Thread();
@@ -196,21 +195,20 @@
 private:
     std::thread                        _thread{};
     ThreadInfo                         _info{};
-    std::vector<IScheduler::Workload> *_workloads{ nullptr };
-    ThreadFeeder                      *_feeder{ nullptr };
+    std::vector<IScheduler::Workload> *_workloads{nullptr};
+    ThreadFeeder                      *_feeder{nullptr};
     std::mutex                         _m{};
     std::condition_variable            _cv{};
-    bool                               _wait_for_work{ false };
-    bool                               _job_complete{ true };
-    std::exception_ptr                 _current_exception{ nullptr };
-    int                                _core_pin{ -1 };
-    std::list<Thread>                 *_thread_pool{ nullptr };
-    unsigned int                       _wake_beg{ 0 };
-    unsigned int                       _wake_end{ 0 };
+    bool                               _wait_for_work{false};
+    bool                               _job_complete{true};
+    std::exception_ptr                 _current_exception{nullptr};
+    int                                _core_pin{-1};
+    std::list<Thread>                 *_thread_pool{nullptr};
+    unsigned int                       _wake_beg{0};
+    unsigned int                       _wake_end{0};
 };
 
-Thread::Thread(int core_pin)
-    : _core_pin(core_pin)
+Thread::Thread(int core_pin) : _core_pin(core_pin)
 {
     _thread = std::thread(&Thread::worker_thread, this);
 }
@@ -218,7 +216,7 @@
 Thread::~Thread()
 {
     // Make sure worker thread has ended
-    if(_thread.joinable())
+    if (_thread.joinable())
     {
         ThreadFeeder feeder;
         set_workload(nullptr, feeder, ThreadInfo());
@@ -257,7 +255,7 @@
 {
     set_thread_affinity(_core_pin);
 
-    while(true)
+    while (true)
     {
         std::unique_lock<std::mutex> lock(_m);
         _cv.wait(lock, [&] { return _wait_for_work; });
@@ -266,18 +264,18 @@
         _current_exception = nullptr;
 
         // Exit if the worker thread has not been fed with workloads
-        if(_workloads == nullptr || _feeder == nullptr)
+        if (_workloads == nullptr || _feeder == nullptr)
         {
             return;
         }
 
         // Wake up more peer threads from thread pool if this job has been delegated to the current thread
-        if(_thread_pool != nullptr)
+        if (_thread_pool != nullptr)
         {
             auto thread_it = _thread_pool->begin();
             std::advance(thread_it, std::min(static_cast<unsigned int>(_thread_pool->size()), _wake_beg));
             auto wake_end = std::min(_wake_end, static_cast<unsigned int>(_info.num_threads - 1));
-            for(unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it)
+            for (unsigned int t = _wake_beg; t < wake_end; ++t, ++thread_it)
             {
                 thread_it->start();
             }
@@ -291,7 +289,7 @@
 
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
         }
-        catch(...)
+        catch (...)
         {
             _current_exception = std::current_exception();
         }
@@ -322,11 +320,11 @@
         : _num_threads(thread_hint), _threads(_num_threads - 1), _mode(Mode::Linear), _wake_fanout(0U)
     {
         const auto mode_env_v = utility::tolower(utility::getenv("ARM_COMPUTE_CPP_SCHEDULER_MODE"));
-        if(mode_env_v == "linear")
+        if (mode_env_v == "linear")
         {
             _forced_mode = ModeToggle::Linear;
         }
-        else if(mode_env_v == "fanout")
+        else if (mode_env_v == "fanout")
         {
             _forced_mode = ModeToggle::Fanout;
         }
@@ -350,7 +348,7 @@
 
         // Set affinity on worked threads
         _threads.clear();
-        for(auto i = 1U; i < _num_threads; ++i)
+        for (auto i = 1U; i < _num_threads; ++i)
         {
             _threads.emplace_back(func(i, thread_hint));
         }
@@ -359,20 +357,23 @@
     void auto_switch_mode(unsigned int num_threads_to_use)
     {
         // If the environment variable is set to either mode, it overrides the mode that would otherwise be selected from num_threads_to_use
-        if(_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8))
+        if (_forced_mode == ModeToggle::Fanout || (_forced_mode == ModeToggle::None && num_threads_to_use > 8))
         {
             set_fanout_mode(m_default_wake_fanout, num_threads_to_use);
-            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n", this->wake_fanout(), num_threads_to_use);
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+                "Set CPPScheduler to Fanout mode, with wake up fanout : %d and %d threads to use\n",
+                this->wake_fanout(), num_threads_to_use);
         }
         else // Equivalent to (_forced_mode == ModeToggle::Linear || (_forced_mode == ModeToggle::None && num_threads_to_use <= 8))
         {
             set_linear_mode();
-            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n", num_threads_to_use);
+            ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Set CPPScheduler to Linear mode, with %d threads to use\n",
+                                                      num_threads_to_use);
         }
     }
     void set_linear_mode()
     {
-        for(auto &thread : _threads)
+        for (auto &thread : _threads)
         {
             thread.set_linear_mode();
         }
@@ -384,14 +385,14 @@
         ARM_COMPUTE_ERROR_ON(num_threads_to_use > _threads.size() + 1);
         const auto actual_wake_fanout = std::max(2U, std::min(wake_fanout, num_threads_to_use - 1));
         auto       thread_it          = _threads.begin();
-        for(auto i = 1U; i < num_threads_to_use; ++i, ++thread_it)
+        for (auto i = 1U; i < num_threads_to_use; ++i, ++thread_it)
         {
             const auto wake_begin = i * actual_wake_fanout - 1;
             const auto wake_end   = std::min((i + 1) * actual_wake_fanout - 1, num_threads_to_use - 1);
             thread_it->set_fanout_mode(&_threads, wake_begin, wake_end);
         }
         // Reset the remaining threads' wake-up schedule
-        while(thread_it != _threads.end())
+        while (thread_it != _threads.end())
         {
             thread_it->set_fanout_mode(&_threads, 0U, 0U);
             ++thread_it;
@@ -417,9 +418,9 @@
     unsigned int       _num_threads;
     std::list<Thread>  _threads;
     arm_compute::Mutex _run_workloads_mutex{};
-    Mode               _mode{ Mode::Linear };
-    ModeToggle         _forced_mode{ ModeToggle::None };
-    unsigned int       _wake_fanout{ 0 };
+    Mode               _mode{Mode::Linear};
+    ModeToggle         _forced_mode{ModeToggle::None};
+    unsigned int       _wake_fanout{0};
 };
 
 /*
@@ -431,8 +432,7 @@
     return scheduler;
 }
 
-CPPScheduler::CPPScheduler()
-    : _impl(std::make_unique<Impl>(num_threads_hint()))
+CPPScheduler::CPPScheduler() : _impl(std::make_unique<Impl>(num_threads_hint()))
 {
 }
 
@@ -465,15 +465,15 @@
     // This is not great because different threads' workloads won't run in parallel, but at least they
     // won't interfere with each other or deadlock.
     arm_compute::lock_guard<std::mutex> lock(_impl->_run_workloads_mutex);
-    const unsigned int                  num_threads_to_use = std::min(_impl->num_threads(), static_cast<unsigned int>(workloads.size()));
-    if(num_threads_to_use < 1)
+    const unsigned int num_threads_to_use = std::min(_impl->num_threads(), static_cast<unsigned int>(workloads.size()));
+    if (num_threads_to_use < 1)
     {
         return;
     }
     // Re-adjust the mode if the actual number of threads to use is different from the number of threads created
     _impl->auto_switch_mode(num_threads_to_use);
     int num_threads_to_start = 0;
-    switch(_impl->mode())
+    switch (_impl->mode())
     {
         case CPPScheduler::Impl::Mode::Fanout:
         {
@@ -494,22 +494,22 @@
     unsigned int t         = 0;
     auto         thread_it = _impl->_threads.begin();
     // Set num_threads_to_use - 1 workloads to the threads as the remaining 1 is left to the main thread
-    for(; t < num_threads_to_use - 1; ++t, ++thread_it)
+    for (; t < num_threads_to_use - 1; ++t, ++thread_it)
     {
         info.thread_id = t;
         thread_it->set_workload(&workloads, feeder, info);
     }
     thread_it = _impl->_threads.begin();
-    for(int i = 0; i < num_threads_to_start; ++i, ++thread_it)
+    for (int i = 0; i < num_threads_to_start; ++i, ++thread_it)
     {
         thread_it->start();
     }
-    info.thread_id = t;                         // Set main thread's thread_id
+    info.thread_id                    = t; // Set main thread's thread_id
     std::exception_ptr last_exception = nullptr;
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     try
     {
-#endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
+#endif                                              /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
         process_workloads(workloads, feeder, info); // Main thread processes workloads
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
@@ -522,7 +522,7 @@
     {
 #endif /* ARM_COMPUTE_EXCEPTIONS_DISABLED */
         thread_it = _impl->_threads.begin();
-        for(unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it)
+        for (unsigned int i = 0; i < num_threads_to_use - 1; ++i, ++thread_it)
         {
             std::exception_ptr current_exception = thread_it->wait();
             if (current_exception)
@@ -536,7 +536,7 @@
         }
 #ifndef ARM_COMPUTE_EXCEPTIONS_DISABLED
     }
-    catch(const std::system_error &e)
+    catch (const std::system_error &e)
     {
         std::cerr << "Caught system_error with code " << e.code() << " meaning " << e.what() << '\n';
     }
diff --git a/src/runtime/CPP/SingleThreadScheduler.cpp b/src/runtime/CPP/SingleThreadScheduler.cpp
index 5890553..c46a273 100644
--- a/src/runtime/CPP/SingleThreadScheduler.cpp
+++ b/src/runtime/CPP/SingleThreadScheduler.cpp
@@ -39,10 +39,10 @@
 {
     const Window &max_window = kernel->window();
 
-    if(hints.split_dimension() != IScheduler::split_dimensions_all)
+    if (hints.split_dimension() != IScheduler::split_dimensions_all)
     {
         const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
-        if(num_iterations < 1)
+        if (num_iterations < 1)
         {
             return;
         }
@@ -53,7 +53,10 @@
     kernel->run(kernel->window(), info);
 }
 
-void SingleThreadScheduler::schedule_op(ICPPKernel *kernel, const Hints &hints, const Window &window, ITensorPack &tensors)
+void SingleThreadScheduler::schedule_op(ICPPKernel   *kernel,
+                                        const Hints  &hints,
+                                        const Window &window,
+                                        ITensorPack  &tensors)
 {
     ARM_COMPUTE_UNUSED(hints);
     ThreadInfo info;
@@ -65,7 +68,7 @@
 {
     ThreadInfo info;
     info.cpu_info = &cpu_info();
-    for(auto &wl : workloads)
+    for (auto &wl : workloads)
     {
         wl(info);
     }
diff --git a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
index dccbe40..94a1673 100644
--- a/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
+++ b/src/runtime/CPP/functions/CPPBoxWithNonMaximaSuppressionLimit.cpp
@@ -42,28 +42,37 @@
     Iterator input_it(input, window);
     Iterator output_it(output, window);
 
-    switch(data_type)
+    switch (data_type)
     {
         case DataType::QASYMM8:
-            execute_window_loop(window, [&](const Coordinates &)
-            {
-                *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
-            },
-            input_it, output_it);
+            execute_window_loop(
+                window,
+                [&](const Coordinates &)
+                {
+                    *reinterpret_cast<float *>(output_it.ptr()) =
+                        dequantize(*reinterpret_cast<const uint8_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
+                },
+                input_it, output_it);
             break;
         case DataType::QASYMM8_SIGNED:
-            execute_window_loop(window, [&](const Coordinates &)
-            {
-                *reinterpret_cast<float *>(output_it.ptr()) = dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo);
-            },
-            input_it, output_it);
+            execute_window_loop(
+                window,
+                [&](const Coordinates &)
+                {
+                    *reinterpret_cast<float *>(output_it.ptr()) =
+                        dequantize_qasymm8_signed(*reinterpret_cast<const int8_t *>(input_it.ptr()), qinfo);
+                },
+                input_it, output_it);
             break;
         case DataType::QASYMM16:
-            execute_window_loop(window, [&](const Coordinates &)
-            {
-                *reinterpret_cast<float *>(output_it.ptr()) = dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
-            },
-            input_it, output_it);
+            execute_window_loop(
+                window,
+                [&](const Coordinates &)
+                {
+                    *reinterpret_cast<float *>(output_it.ptr()) =
+                        dequantize(*reinterpret_cast<const uint16_t *>(input_it.ptr()), qinfo.scale, qinfo.offset);
+                },
+                input_it, output_it);
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported data type");
@@ -80,28 +89,37 @@
     Iterator input_it(input, window);
     Iterator output_it(output, window);
 
-    switch(data_type)
+    switch (data_type)
     {
         case DataType::QASYMM8:
-            execute_window_loop(window, [&](const Coordinates &)
-            {
-                *reinterpret_cast<uint8_t *>(output_it.ptr()) = quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
-            },
-            input_it, output_it);
+            execute_window_loop(
+                window,
+                [&](const Coordinates &)
+                {
+                    *reinterpret_cast<uint8_t *>(output_it.ptr()) =
+                        quantize_qasymm8(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+                },
+                input_it, output_it);
             break;
         case DataType::QASYMM8_SIGNED:
-            execute_window_loop(window, [&](const Coordinates &)
-            {
-                *reinterpret_cast<int8_t *>(output_it.ptr()) = quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
-            },
-            input_it, output_it);
+            execute_window_loop(
+                window,
+                [&](const Coordinates &)
+                {
+                    *reinterpret_cast<int8_t *>(output_it.ptr()) =
+                        quantize_qasymm8_signed(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+                },
+                input_it, output_it);
             break;
         case DataType::QASYMM16:
-            execute_window_loop(window, [&](const Coordinates &)
-            {
-                *reinterpret_cast<uint16_t *>(output_it.ptr()) = quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
-            },
-            input_it, output_it);
+            execute_window_loop(
+                window,
+                [&](const Coordinates &)
+                {
+                    *reinterpret_cast<uint16_t *>(output_it.ptr()) =
+                        quantize_qasymm16(*reinterpret_cast<const float *>(input_it.ptr()), qinfo);
+                },
+                input_it, output_it);
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported data type");
@@ -132,14 +150,23 @@
 {
 }
 
-void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor *scores_in, const ITensor *boxes_in, const ITensor *batch_splits_in,
-                                                    ITensor *scores_out, ITensor *boxes_out, ITensor *classes, ITensor *batch_splits_out,
-                                                    ITensor *keeps, ITensor *keeps_size, const BoxNMSLimitInfo info)
+void CPPBoxWithNonMaximaSuppressionLimit::configure(const ITensor        *scores_in,
+                                                    const ITensor        *boxes_in,
+                                                    const ITensor        *batch_splits_in,
+                                                    ITensor              *scores_out,
+                                                    ITensor              *boxes_out,
+                                                    ITensor              *classes,
+                                                    ITensor              *batch_splits_out,
+                                                    ITensor              *keeps,
+                                                    ITensor              *keeps_size,
+                                                    const BoxNMSLimitInfo info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
-    ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);
+    ARM_COMPUTE_LOG_PARAMS(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out,
+                           keeps, keeps_size, info);
 
-    _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 || scores_in->info()->data_type() == DataType::QASYMM8_SIGNED;
+    _is_qasymm8 = scores_in->info()->data_type() == DataType::QASYMM8 ||
+                  scores_in->info()->data_type() == DataType::QASYMM8_SIGNED;
 
     _scores_in        = scores_in;
     _boxes_in         = boxes_in;
@@ -150,7 +177,7 @@
     _batch_splits_out = batch_splits_out;
     _keeps            = keeps;
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         // Manage intermediate buffers
         _memory_group.manage(&_scores_in_f32);
@@ -160,7 +187,7 @@
         _memory_group.manage(&_classes_f32);
         _scores_in_f32.allocator()->init(scores_in->info()->clone()->set_data_type(DataType::F32));
         _boxes_in_f32.allocator()->init(boxes_in->info()->clone()->set_data_type(DataType::F32));
-        if(batch_splits_in != nullptr)
+        if (batch_splits_in != nullptr)
         {
             _memory_group.manage(&_batch_splits_in_f32);
             _batch_splits_in_f32.allocator()->init(batch_splits_in->info()->clone()->set_data_type(DataType::F32));
@@ -168,58 +195,70 @@
         _scores_out_f32.allocator()->init(scores_out->info()->clone()->set_data_type(DataType::F32));
         _boxes_out_f32.allocator()->init(boxes_out->info()->clone()->set_data_type(DataType::F32));
         _classes_f32.allocator()->init(classes->info()->clone()->set_data_type(DataType::F32));
-        if(batch_splits_out != nullptr)
+        if (batch_splits_out != nullptr)
         {
             _memory_group.manage(&_batch_splits_out_f32);
             _batch_splits_out_f32.allocator()->init(batch_splits_out->info()->clone()->set_data_type(DataType::F32));
         }
-        if(keeps != nullptr)
+        if (keeps != nullptr)
         {
             _memory_group.manage(&_keeps_f32);
             _keeps_f32.allocator()->init(keeps->info()->clone()->set_data_type(DataType::F32));
         }
 
-        _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32, (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr,
+        _box_with_nms_limit_kernel.configure(&_scores_in_f32, &_boxes_in_f32,
+                                             (batch_splits_in != nullptr) ? &_batch_splits_in_f32 : nullptr,
                                              &_scores_out_f32, &_boxes_out_f32, &_classes_f32,
-                                             (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr, (keeps != nullptr) ? &_keeps_f32 : nullptr,
-                                             keeps_size, info);
+                                             (batch_splits_out != nullptr) ? &_batch_splits_out_f32 : nullptr,
+                                             (keeps != nullptr) ? &_keeps_f32 : nullptr, keeps_size, info);
     }
     else
     {
-        _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes, batch_splits_out, keeps, keeps_size, info);
+        _box_with_nms_limit_kernel.configure(scores_in, boxes_in, batch_splits_in, scores_out, boxes_out, classes,
+                                             batch_splits_out, keeps, keeps_size, info);
     }
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _scores_in_f32.allocator()->allocate();
         _boxes_in_f32.allocator()->allocate();
-        if(_batch_splits_in != nullptr)
+        if (_batch_splits_in != nullptr)
         {
             _batch_splits_in_f32.allocator()->allocate();
         }
         _scores_out_f32.allocator()->allocate();
         _boxes_out_f32.allocator()->allocate();
         _classes_f32.allocator()->allocate();
-        if(batch_splits_out != nullptr)
+        if (batch_splits_out != nullptr)
         {
             _batch_splits_out_f32.allocator()->allocate();
         }
-        if(keeps != nullptr)
+        if (keeps != nullptr)
         {
             _keeps_f32.allocator()->allocate();
         }
     }
 }
 
-Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes,
-                const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info)
+Status validate(const ITensorInfo    *scores_in,
+                const ITensorInfo    *boxes_in,
+                const ITensorInfo    *batch_splits_in,
+                const ITensorInfo    *scores_out,
+                const ITensorInfo    *boxes_out,
+                const ITensorInfo    *classes,
+                const ITensorInfo    *batch_splits_out,
+                const ITensorInfo    *keeps,
+                const ITensorInfo    *keeps_size,
+                const BoxNMSLimitInfo info)
 {
     ARM_COMPUTE_UNUSED(batch_splits_in, batch_splits_out, keeps, keeps_size, info);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores_in, boxes_in, scores_out, boxes_out, classes);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores_in, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED,
+                                                         DataType::F16, DataType::F32);
 
-    const bool is_qasymm8 = scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED;
-    if(is_qasymm8)
+    const bool is_qasymm8 =
+        scores_in->data_type() == DataType::QASYMM8 || scores_in->data_type() == DataType::QASYMM8_SIGNED;
+    if (is_qasymm8)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(boxes_in, 1, DataType::QASYMM16);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(boxes_in, boxes_out);
@@ -237,11 +276,11 @@
     // Acquire all the temporaries
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         dequantize_tensor(_scores_in, &_scores_in_f32);
         dequantize_tensor(_boxes_in, &_boxes_in_f32);
-        if(_batch_splits_in != nullptr)
+        if (_batch_splits_in != nullptr)
         {
             dequantize_tensor(_batch_splits_in, &_batch_splits_in_f32);
         }
@@ -249,16 +288,16 @@
 
     Scheduler::get().schedule(&_box_with_nms_limit_kernel, Window::DimY);
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         quantize_tensor(&_scores_out_f32, _scores_out);
         quantize_tensor(&_boxes_out_f32, _boxes_out);
         quantize_tensor(&_classes_f32, _classes);
-        if(_batch_splits_out != nullptr)
+        if (_batch_splits_out != nullptr)
         {
             quantize_tensor(&_batch_splits_out_f32, _batch_splits_out);
         }
-        if(_keeps != nullptr)
+        if (_keeps != nullptr)
         {
             quantize_tensor(&_keeps_f32, _keeps);
         }
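
The QASYMM8/QASYMM8_SIGNED path above brackets the F32 box-with-NMS-limit kernel between dequantize_tensor() and quantize_tensor() passes. Per value, the standard affine mapping such helpers rely on is real = scale * (q - offset); a standalone sketch of the round-trip, assuming that mapping rather than the library's own helpers:

// Affine quantization round-trip on one value (hypothetical helpers).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>

float dequantize(uint8_t q, float scale, int32_t offset)
{
    return scale * (static_cast<int32_t>(q) - offset);
}

uint8_t quantize(float real, float scale, int32_t offset)
{
    const int32_t q = static_cast<int32_t>(std::lround(real / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q))); // clamp to uint8 range
}

int main()
{
    const float   scale  = 0.1f;
    const int32_t offset = 128;
    const uint8_t q      = 200;
    const float   real   = dequantize(q, scale, offset); // 7.2f
    std::cout << real << " -> " << static_cast<int>(quantize(real, scale, offset)) << '\n'; // 7.2 -> 200
}

Working in F32 keeps the kernel single-sourced; the cost is the extra dequantize/quantize traversals and the intermediate F32 buffers managed by the memory group.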
diff --git a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
index 41d875e..e6291f9 100644
--- a/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionOutputLayer.cpp
@@ -26,9 +26,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <list>
 
@@ -36,25 +36,35 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+Status validate_arguments(const ITensorInfo       *input_loc,
+                          const ITensorInfo       *input_conf,
+                          const ITensorInfo       *input_priorbox,
+                          const ITensorInfo       *output,
+                          DetectionOutputLayerInfo info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_loc, 1, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, input_conf, input_priorbox);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_loc->num_dimensions() > 2, "The location input tensor should be [C1, N].");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_conf->num_dimensions() > 2, "The confidence input tensor should be [C2, N].");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3, "The priorbox input tensor should be [C3, 2, N].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_priorbox->num_dimensions() > 3,
+                                    "The priorbox input tensor should be [C3, 2, N].");
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.eta() <= 0.f || info.eta() > 1.f, "Eta should be between 0 and 1");
 
     const int num_priors = input_priorbox->tensor_shape()[0] / 4;
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) != input_loc->tensor_shape()[0], "Number of priors must match number of location predictions.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) != input_conf->tensor_shape()[0], "Number of priors must match number of confidence predictions.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_loc_classes() * 4)) !=
+                                        input_loc->tensor_shape()[0],
+                                    "Number of priors must match number of location predictions.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(static_cast<size_t>((num_priors * info.num_classes())) !=
+                                        input_conf->tensor_shape()[0],
+                                    "Number of priors must match number of confidence predictions.");
 
     // Validate configured output
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const unsigned int max_size = info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
+        const unsigned int max_size =
+            info.keep_top_k() * (input_loc->num_dimensions() > 1 ? input_loc->dimension(1) : 1);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), TensorShape(7U, max_size));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_loc, output);
     }
@@ -65,8 +75,7 @@
 /** Function used to sort pair<float, T> in descend order based on the score (first) value.
  */
 template <typename T>
-bool SortScorePairDescend(const std::pair<float, T> &pair1,
-                          const std::pair<float, T> &pair2)
+bool SortScorePairDescend(const std::pair<float, T> &pair1, const std::pair<float, T> &pair2)
 {
     return pair1.first > pair2.first;
 }
@@ -82,16 +91,19 @@
  * @param[out] all_location_predictions All the location predictions.
  *
  */
-void retrieve_all_loc_predictions(const ITensor *input_loc, const int num,
-                                  const int num_priors, const int num_loc_classes,
-                                  const bool share_location, std::vector<LabelBBox> &all_location_predictions)
+void retrieve_all_loc_predictions(const ITensor          *input_loc,
+                                  const int               num,
+                                  const int               num_priors,
+                                  const int               num_loc_classes,
+                                  const bool              share_location,
+                                  std::vector<LabelBBox> &all_location_predictions)
 {
-    for(int i = 0; i < num; ++i)
+    for (int i = 0; i < num; ++i)
     {
-        for(int c = 0; c < num_loc_classes; ++c)
+        for (int c = 0; c < num_loc_classes; ++c)
         {
             int label = share_location ? -1 : c;
-            if(all_location_predictions[i].find(label) == all_location_predictions[i].end())
+            if (all_location_predictions[i].find(label) == all_location_predictions[i].end())
             {
                 all_location_predictions[i][label].resize(num_priors);
             }
@@ -102,19 +114,23 @@
             }
         }
     }
-    for(int i = 0; i < num; ++i)
+    for (int i = 0; i < num; ++i)
     {
-        for(int p = 0; p < num_priors; ++p)
+        for (int p = 0; p < num_priors; ++p)
         {
-            for(int c = 0; c < num_loc_classes; ++c)
+            for (int c = 0; c < num_loc_classes; ++c)
             {
                 const int label    = share_location ? -1 : c;
                 const int base_ptr = i * num_priors * num_loc_classes * 4 + p * num_loc_classes * 4 + c * 4;
                 //xmin, ymin, xmax, ymax
-                all_location_predictions[i][label][p][0] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr)));
-                all_location_predictions[i][label][p][1] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1)));
-                all_location_predictions[i][label][p][2] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2)));
-                all_location_predictions[i][label][p][3] = *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3)));
+                all_location_predictions[i][label][p][0] =
+                    *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr)));
+                all_location_predictions[i][label][p][1] =
+                    *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 1)));
+                all_location_predictions[i][label][p][2] =
+                    *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 2)));
+                all_location_predictions[i][label][p][3] =
+                    *reinterpret_cast<float *>(input_loc->ptr_to_element(Coordinates(base_ptr + 3)));
             }
         }
     }
@@ -130,26 +146,28 @@
  * @param[out] all_confidence_scores    All the confidence scores.
  *
  */
-void retrieve_all_conf_scores(const ITensor *input_conf, const int num,
-                              const int num_priors, const int                 num_classes,
+void retrieve_all_conf_scores(const ITensor                                  *input_conf,
+                              const int                                       num,
+                              const int                                       num_priors,
+                              const int                                       num_classes,
                               std::vector<std::map<int, std::vector<float>>> &all_confidence_scores)
 {
     std::vector<float> tmp_buffer;
     tmp_buffer.resize(num * num_priors * num_classes);
-    for(int i = 0; i < num; ++i)
+    for (int i = 0; i < num; ++i)
     {
-        for(int c = 0; c < num_classes; ++c)
+        for (int c = 0; c < num_classes; ++c)
         {
-            for(int p = 0; p < num_priors; ++p)
+            for (int p = 0; p < num_priors; ++p)
             {
-                tmp_buffer[i * num_classes * num_priors + c * num_priors + p] =
-                    *reinterpret_cast<float *>(input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c)));
+                tmp_buffer[i * num_classes * num_priors + c * num_priors + p] = *reinterpret_cast<float *>(
+                    input_conf->ptr_to_element(Coordinates(i * num_classes * num_priors + p * num_classes + c)));
             }
         }
     }
-    for(int i = 0; i < num; ++i)
+    for (int i = 0; i < num; ++i)
     {
-        for(int c = 0; c < num_classes; ++c)
+        for (int c = 0; c < num_classes; ++c)
         {
             all_confidence_scores[i][c].resize(num_priors);
             all_confidence_scores[i][c].assign(&tmp_buffer[i * num_classes * num_priors + c * num_priors],
@@ -168,28 +186,23 @@
  * @param[out] all_prior_bboxes         All the prior bboxes.
  * @param[out] all_prior_variances      All the prior variances.
  *
  */
-void retrieve_all_priorbox(const ITensor     *input_priorbox,
-                           const int          num_priors,
-                           std::vector<BBox> &all_prior_bboxes,
+void retrieve_all_priorbox(const ITensor                     *input_priorbox,
+                           const int                          num_priors,
+                           std::vector<BBox>                 &all_prior_bboxes,
                            std::vector<std::array<float, 4>> &all_prior_variances)
 {
-    for(int i = 0; i < num_priors; ++i)
+    for (int i = 0; i < num_priors; ++i)
     {
-        all_prior_bboxes[i] =
-        {
-            {
-                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))),
-                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))),
-                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))),
-                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))
-            }
-        };
+        all_prior_bboxes[i] = {{*reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4))),
+                                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 1))),
+                                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 2))),
+                                *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates(i * 4 + 3)))}};
     }
 
-    std::array<float, 4> var({ { 0, 0, 0, 0 } });
-    for(int i = 0; i < num_priors; ++i)
+    std::array<float, 4> var({{0, 0, 0, 0}});
+    for (int i = 0; i < num_priors; ++i)
     {
-        for(int j = 0; j < 4; ++j)
+        for (int j = 0; j < 4; ++j)
         {
             var[j] = *reinterpret_cast<float *>(input_priorbox->ptr_to_element(Coordinates((num_priors + i) * 4 + j)));
         }
@@ -208,13 +221,17 @@
  * @param[out] decode_bbox                The decoded bboxes.
  *
  */
-void DecodeBBox(const BBox &prior_bbox, const std::array<float, 4> &prior_variance,
-                const DetectionOutputLayerCodeType code_type, const bool variance_encoded_in_target,
-                const bool clip_bbox, const BBox &bbox, BBox &decode_bbox)
+void DecodeBBox(const BBox                        &prior_bbox,
+                const std::array<float, 4>        &prior_variance,
+                const DetectionOutputLayerCodeType code_type,
+                const bool                         variance_encoded_in_target,
+                const bool                         clip_bbox,
+                const BBox                        &bbox,
+                BBox                              &decode_bbox)
 {
     // If the variance is encoded in the target, we simply need to add the offset predictions;
     // otherwise we need to scale the offsets accordingly.
-    switch(code_type)
+    switch (code_type)
     {
         case DetectionOutputLayerCodeType::CORNER:
         {
@@ -237,10 +254,14 @@
             const float prior_center_x = (prior_bbox[0] + prior_bbox[2]) / 2.;
             const float prior_center_y = (prior_bbox[1] + prior_bbox[3]) / 2.;
 
-            const float decode_bbox_center_x = (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x;
-            const float decode_bbox_center_y = (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y;
-            const float decode_bbox_width    = (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width;
-            const float decode_bbox_height   = (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height;
+            const float decode_bbox_center_x =
+                (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width + prior_center_x;
+            const float decode_bbox_center_y =
+                (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height + prior_center_y;
+            const float decode_bbox_width =
+                (variance_encoded_in_target ? std::exp(bbox[2]) : std::exp(prior_variance[2] * bbox[2])) * prior_width;
+            const float decode_bbox_height =
+                (variance_encoded_in_target ? std::exp(bbox[3]) : std::exp(prior_variance[3] * bbox[3])) * prior_height;
 
             decode_bbox[0] = (decode_bbox_center_x - decode_bbox_width / 2.f);
             decode_bbox[1] = (decode_bbox_center_y - decode_bbox_height / 2.f);
@@ -258,10 +279,14 @@
             ARM_COMPUTE_ERROR_ON(prior_width <= 0.f);
             ARM_COMPUTE_ERROR_ON(prior_height <= 0.f);
 
-            decode_bbox[0] = prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width;
-            decode_bbox[1] = prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height;
-            decode_bbox[2] = prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width;
-            decode_bbox[3] = prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height;
+            decode_bbox[0] =
+                prior_bbox[0] + (variance_encoded_in_target ? bbox[0] : prior_variance[0] * bbox[0]) * prior_width;
+            decode_bbox[1] =
+                prior_bbox[1] + (variance_encoded_in_target ? bbox[1] : prior_variance[1] * bbox[1]) * prior_height;
+            decode_bbox[2] =
+                prior_bbox[2] + (variance_encoded_in_target ? bbox[2] : prior_variance[2] * bbox[2]) * prior_width;
+            decode_bbox[3] =
+                prior_bbox[3] + (variance_encoded_in_target ? bbox[3] : prior_variance[3] * bbox[3]) * prior_height;
 
             break;
         }
@@ -269,9 +294,9 @@
             ARM_COMPUTE_ERROR("Unsupported Detection Output Code Type.");
     }
 
-    if(clip_bbox)
+    if (clip_bbox)
     {
-        for(auto &d_bbox : decode_bbox)
+        for (auto &d_bbox : decode_bbox)
         {
             d_bbox = utility::clamp(d_bbox, 0.f, 1.f);
         }
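
Note: for orientation (not part of the patch), the CENTER_SIZE branch above is
the standard SSD box decoding. A self-contained sketch of the same arithmetic,
assuming variance_encoded_in_target == true so the per-coordinate variance
scaling drops out; the helper name is hypothetical:

#include <array>
#include <cmath>

using Box = std::array<float, 4>; // [xmin, ymin, xmax, ymax]

Box decode_center_size(const Box &prior, const std::array<float, 4> &delta)
{
    const float prior_w  = prior[2] - prior[0];
    const float prior_h  = prior[3] - prior[1];
    const float prior_cx = (prior[0] + prior[2]) * 0.5f;
    const float prior_cy = (prior[1] + prior[3]) * 0.5f;

    // Offsets move the prior center; exp() maps log-space sizes back to scales.
    const float cx = delta[0] * prior_w + prior_cx;
    const float cy = delta[1] * prior_h + prior_cy;
    const float w  = std::exp(delta[2]) * prior_w;
    const float h  = std::exp(delta[3]) * prior_h;

    return {cx - w * 0.5f, cy - h * 0.5f, cx + w * 0.5f, cy + h * 0.5f};
}

When variance_encoded_in_target is false, each delta[i] is first multiplied by
prior_variance[i], exactly as in the branch above.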
@@ -289,10 +314,13 @@
  * @param[out] indices         The kept indices of bboxes after nms.
  *
  */
-void ApplyNMSFast(const std::vector<BBox> &bboxes,
-                  const std::vector<float> &scores, const float score_threshold,
-                  const float nms_threshold, const float eta, const int top_k,
-                  std::vector<int> &indices)
+void ApplyNMSFast(const std::vector<BBox>  &bboxes,
+                  const std::vector<float> &scores,
+                  const float               score_threshold,
+                  const float               nms_threshold,
+                  const float               eta,
+                  const int                 top_k,
+                  std::vector<int>         &indices)
 {
     ARM_COMPUTE_ERROR_ON_MSG(bboxes.size() != scores.size(), "bboxes and scores have different size.");
 
@@ -300,9 +328,9 @@
     std::list<std::pair<float, int>> score_index_vec;
 
     // Generate index score pairs.
-    for(size_t i = 0; i < scores.size(); ++i)
+    for (size_t i = 0; i < scores.size(); ++i)
     {
-        if(scores[i] > score_threshold)
+        if (scores[i] > score_threshold)
         {
             score_index_vec.emplace_back(std::make_pair(scores[i], i));
         }
@@ -313,7 +341,7 @@
 
     // Keep top_k scores if needed.
     const int score_index_vec_size = score_index_vec.size();
-    if(top_k > -1 && top_k < score_index_vec_size)
+    if (top_k > -1 && top_k < score_index_vec_size)
     {
         score_index_vec.resize(top_k);
     }
@@ -322,46 +350,45 @@
     float adaptive_threshold = nms_threshold;
     indices.clear();
 
-    while(!score_index_vec.empty())
+    while (!score_index_vec.empty())
     {
         const int idx  = score_index_vec.front().second;
         bool      keep = true;
-        for(int kept_idx : indices)
+        for (int kept_idx : indices)
         {
-            if(keep)
+            if (keep)
             {
                 // Compute the Jaccard (intersection over union, IoU) overlap between two bboxes.
-                BBox intersect_bbox = std::array<float, 4>({ 0, 0, 0, 0 });
-                if(bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] || bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1])
+                BBox intersect_bbox = std::array<float, 4>({0, 0, 0, 0});
+                if (bboxes[kept_idx][0] > bboxes[idx][2] || bboxes[kept_idx][2] < bboxes[idx][0] ||
+                    bboxes[kept_idx][1] > bboxes[idx][3] || bboxes[kept_idx][3] < bboxes[idx][1])
                 {
-                    intersect_bbox = std::array<float, 4>({ { 0, 0, 0, 0 } });
+                    intersect_bbox = std::array<float, 4>({{0, 0, 0, 0}});
                 }
                 else
                 {
-                    intersect_bbox = std::array<float, 4>({ {
-                            std::max(bboxes[idx][0], bboxes[kept_idx][0]),
-                            std::max(bboxes[idx][1], bboxes[kept_idx][1]),
-                            std::min(bboxes[idx][2], bboxes[kept_idx][2]),
-                            std::min(bboxes[idx][3], bboxes[kept_idx][3])
-                        }
-                    });
+                    intersect_bbox = std::array<float, 4>(
+                        {{std::max(bboxes[idx][0], bboxes[kept_idx][0]), std::max(bboxes[idx][1], bboxes[kept_idx][1]),
+                          std::min(bboxes[idx][2], bboxes[kept_idx][2]),
+                          std::min(bboxes[idx][3], bboxes[kept_idx][3])}});
                 }
 
                 float intersect_width  = intersect_bbox[2] - intersect_bbox[0];
                 float intersect_height = intersect_bbox[3] - intersect_bbox[1];
 
                 float overlap = 0.f;
-                if(intersect_width > 0 && intersect_height > 0)
+                if (intersect_width > 0 && intersect_height > 0)
                 {
                     float intersect_size = intersect_width * intersect_height;
-                    float bbox1_size     = (bboxes[idx][2] < bboxes[idx][0]
-                                            || bboxes[idx][3] < bboxes[idx][1]) ?
-                                           0.f :
-                                           (bboxes[idx][2] - bboxes[idx][0]) * (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]);
-                    float bbox2_size = (bboxes[kept_idx][2] < bboxes[kept_idx][0]
-                                        || bboxes[kept_idx][3] < bboxes[kept_idx][1]) ?
-                                       0.f :
-                                       (bboxes[kept_idx][2] - bboxes[kept_idx][0]) * (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]);
+                    float bbox1_size     = (bboxes[idx][2] < bboxes[idx][0] || bboxes[idx][3] < bboxes[idx][1])
+                                               ? 0.f
+                                               : (bboxes[idx][2] - bboxes[idx][0]) *
+                                                 (bboxes[idx][3] - bboxes[idx][1]); //BBoxSize(bboxes[idx]);
+                    float bbox2_size =
+                        (bboxes[kept_idx][2] < bboxes[kept_idx][0] || bboxes[kept_idx][3] < bboxes[kept_idx][1])
+                            ? 0.f
+                            : (bboxes[kept_idx][2] - bboxes[kept_idx][0]) *
+                                  (bboxes[kept_idx][3] - bboxes[kept_idx][1]); // BBoxSize(bboxes[kept_idx]);
                     overlap = intersect_size / (bbox1_size + bbox2_size - intersect_size);
                 }
                 keep = (overlap <= adaptive_threshold);
@@ -371,12 +398,12 @@
                 break;
             }
         }
-        if(keep)
+        if (keep)
         {
             indices.push_back(idx);
         }
         score_index_vec.erase(score_index_vec.begin());
-        if(keep && eta < 1.f && adaptive_threshold > 0.5f)
+        if (keep && eta < 1.f && adaptive_threshold > 0.5f)
         {
             adaptive_threshold *= eta;
         }
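
Note: the overlap test in the loop above is a plain intersection-over-union
(Jaccard) computation. In isolation it might look like the following sketch
(hypothetical helper; the kernel additionally treats degenerate boxes as having
zero area):

#include <algorithm>
#include <array>

using Box = std::array<float, 4>; // [xmin, ymin, xmax, ymax]

// IoU of two boxes; 0 when they do not intersect.
float iou(const Box &a, const Box &b)
{
    const float iw = std::min(a[2], b[2]) - std::max(a[0], b[0]);
    const float ih = std::min(a[3], b[3]) - std::max(a[1], b[1]);
    if (iw <= 0.f || ih <= 0.f)
    {
        return 0.f;
    }
    const float inter  = iw * ih;
    const float area_a = (a[2] - a[0]) * (a[3] - a[1]);
    const float area_b = (b[2] - b[0]) * (b[3] - b[1]);
    return inter / (area_a + area_b - inter);
}

A candidate is kept only while its IoU against every already-kept box stays at
or below the adaptive threshold, which decays by eta once it exceeds 0.5.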
@@ -385,13 +412,27 @@
 } // namespace
 
 CPPDetectionOutputLayer::CPPDetectionOutputLayer()
-    : _input_loc(nullptr), _input_conf(nullptr), _input_priorbox(nullptr), _output(nullptr), _info(), _num_priors(), _num(), _all_location_predictions(), _all_confidence_scores(), _all_prior_bboxes(),
-      _all_prior_variances(), _all_decode_bboxes(), _all_indices()
+    : _input_loc(nullptr),
+      _input_conf(nullptr),
+      _input_priorbox(nullptr),
+      _output(nullptr),
+      _info(),
+      _num_priors(),
+      _num(),
+      _all_location_predictions(),
+      _all_confidence_scores(),
+      _all_prior_bboxes(),
+      _all_prior_variances(),
+      _all_decode_bboxes(),
+      _all_indices()
 {
 }
 
-void CPPDetectionOutputLayer::configure(const ITensor *input_loc, const ITensor *input_conf, const ITensor *input_priorbox,
-                                        ITensor *output, DetectionOutputLayerInfo info)
+void CPPDetectionOutputLayer::configure(const ITensor           *input_loc,
+                                        const ITensor           *input_conf,
+                                        const ITensor           *input_priorbox,
+                                        ITensor                 *output,
+                                        DetectionOutputLayerInfo info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input_loc, input_conf, input_priorbox, output);
     ARM_COMPUTE_LOG_PARAMS(input_loc, input_conf, input_priorbox, output, info);
@@ -400,11 +441,13 @@
     // Since the number of bboxes to be kept is unknown before NMS, the shape is set to the maximum.
     // The maximum is keep_top_k * input_loc_size[1].
     // Each row is a 7-element std::vector, which stores [image_id, label, confidence, xmin, ymin, xmax, ymax].
-    const unsigned int max_size = info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1);
+    const unsigned int max_size =
+        info.keep_top_k() * (input_loc->info()->num_dimensions() > 1 ? input_loc->info()->dimension(1) : 1);
     auto_init_if_empty(*output->info(), input_loc->info()->clone()->set_tensor_shape(TensorShape(7U, max_size)));
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        validate_arguments(input_loc->info(), input_conf->info(), input_priorbox->info(), output->info(), info));
 
     _input_loc      = input_loc;
     _input_conf     = input_conf;
@@ -420,12 +463,12 @@
     _all_prior_variances.resize(_num_priors);
     _all_decode_bboxes.resize(_num);
 
-    for(int i = 0; i < _num; ++i)
+    for (int i = 0; i < _num; ++i)
     {
-        for(int c = 0; c < _info.num_loc_classes(); ++c)
+        for (int c = 0; c < _info.num_loc_classes(); ++c)
         {
             const int label = _info.share_location() ? -1 : c;
-            if(label == _info.background_label_id())
+            if (label == _info.background_label_id())
             {
                 // Ignore background class.
                 continue;
@@ -440,7 +483,11 @@
     output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
 }
 
-Status CPPDetectionOutputLayer::validate(const ITensorInfo *input_loc, const ITensorInfo *input_conf, const ITensorInfo *input_priorbox, const ITensorInfo *output, DetectionOutputLayerInfo info)
+Status CPPDetectionOutputLayer::validate(const ITensorInfo       *input_loc,
+                                         const ITensorInfo       *input_conf,
+                                         const ITensorInfo       *input_priorbox,
+                                         const ITensorInfo       *output,
+                                         DetectionOutputLayerInfo info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_loc, input_conf, input_priorbox, output, info));
     return Status{};
@@ -449,7 +496,8 @@
 void CPPDetectionOutputLayer::run()
 {
     // Retrieve all location predictions.
-    retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(), _all_location_predictions);
+    retrieve_all_loc_predictions(_input_loc, _num, _num_priors, _info.num_loc_classes(), _info.share_location(),
+                                 _all_location_predictions);
 
     // Retrieve all confidences.
     retrieve_all_conf_scores(_input_conf, _num, _num_priors, _info.num_classes(), _all_confidence_scores);
@@ -459,75 +507,79 @@
 
     // Decode all loc predictions to bboxes
     const bool clip_bbox = false;
-    for(int i = 0; i < _num; ++i)
+    for (int i = 0; i < _num; ++i)
     {
-        for(int c = 0; c < _info.num_loc_classes(); ++c)
+        for (int c = 0; c < _info.num_loc_classes(); ++c)
         {
             const int label = _info.share_location() ? -1 : c;
-            if(label == _info.background_label_id())
+            if (label == _info.background_label_id())
             {
                 // Ignore background class.
                 continue;
             }
-            ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(), "Could not find location predictions for label %d.", label);
+            ARM_COMPUTE_ERROR_ON_MSG_VAR(_all_location_predictions[i].find(label) == _all_location_predictions[i].end(),
+                                         "Could not find location predictions for label %d.", label);
 
             const std::vector<BBox> &label_loc_preds = _all_location_predictions[i].find(label)->second;
 
             const int num_bboxes = _all_prior_bboxes.size();
             ARM_COMPUTE_ERROR_ON(_all_prior_variances[i].size() != 4);
 
-            for(int j = 0; j < num_bboxes; ++j)
+            for (int j = 0; j < num_bboxes; ++j)
             {
-                DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(), _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j], _all_decode_bboxes[i][label][j]);
+                DecodeBBox(_all_prior_bboxes[j], _all_prior_variances[j], _info.code_type(),
+                           _info.variance_encoded_in_target(), clip_bbox, label_loc_preds[j],
+                           _all_decode_bboxes[i][label][j]);
             }
         }
     }
 
     int num_kept = 0;
 
-    for(int i = 0; i < _num; ++i)
+    for (int i = 0; i < _num; ++i)
     {
-        const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
-        const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
+        const LabelBBox                         &decode_bboxes = _all_decode_bboxes[i];
+        const std::map<int, std::vector<float>> &conf_scores   = _all_confidence_scores[i];
 
         std::map<int, std::vector<int>> indices;
-        int num_det = 0;
-        for(int c = 0; c < _info.num_classes(); ++c)
+        int                             num_det = 0;
+        for (int c = 0; c < _info.num_classes(); ++c)
         {
-            if(c == _info.background_label_id())
+            if (c == _info.background_label_id())
             {
                 // Ignore background class
                 continue;
             }
             const int label = _info.share_location() ? -1 : c;
-            if(conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end())
+            if (conf_scores.find(c) == conf_scores.end() || decode_bboxes.find(label) == decode_bboxes.end())
             {
                 ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label);
             }
             const std::vector<float> &scores = conf_scores.find(c)->second;
-            const std::vector<BBox> &bboxes = decode_bboxes.find(label)->second;
+            const std::vector<BBox>  &bboxes = decode_bboxes.find(label)->second;
 
-            ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(), _info.top_k(), indices[c]);
+            ApplyNMSFast(bboxes, scores, _info.confidence_threshold(), _info.nms_threshold(), _info.eta(),
+                         _info.top_k(), indices[c]);
 
             num_det += indices[c].size();
         }
 
         int num_to_add = 0;
-        if(_info.keep_top_k() > -1 && num_det > _info.keep_top_k())
+        if (_info.keep_top_k() > -1 && num_det > _info.keep_top_k())
         {
             std::vector<std::pair<float, std::pair<int, int>>> score_index_pairs;
-            for(auto const &it : indices)
+            for (auto const &it : indices)
             {
                 const int               label         = it.first;
                 const std::vector<int> &label_indices = it.second;
 
-                if(conf_scores.find(label) == conf_scores.end())
+                if (conf_scores.find(label) == conf_scores.end())
                 {
                     ARM_COMPUTE_ERROR_VAR("Could not find predictions for label %d.", label);
                 }
 
                 const std::vector<float> &scores = conf_scores.find(label)->second;
-                for(auto idx : label_indices)
+                for (auto idx : label_indices)
                 {
                     ARM_COMPUTE_ERROR_ON(idx > static_cast<int>(scores.size()));
                     score_index_pairs.emplace_back(std::make_pair(scores[idx], std::make_pair(label, idx)));
@@ -541,7 +593,7 @@
             // Store the new indices.
 
             std::map<int, std::vector<int>> new_indices;
-            for(auto score_index_pair : score_index_pairs)
+            for (auto score_index_pair : score_index_pairs)
             {
                 int label = score_index_pair.second.first;
                 int idx   = score_index_pair.second.second;
@@ -562,25 +614,25 @@
     _output->info()->set_valid_region(ValidRegion(Coordinates(0, 0), TensorShape(7, num_kept)));
 
     int count = 0;
-    for(int i = 0; i < _num; ++i)
+    for (int i = 0; i < _num; ++i)
     {
-        const std::map<int, std::vector<float>> &conf_scores = _all_confidence_scores[i];
-        const LabelBBox &decode_bboxes = _all_decode_bboxes[i];
-        for(auto &it : _all_indices[i])
+        const std::map<int, std::vector<float>> &conf_scores   = _all_confidence_scores[i];
+        const LabelBBox                         &decode_bboxes = _all_decode_bboxes[i];
+        for (auto &it : _all_indices[i])
         {
             const int                 label     = it.first;
             const std::vector<float> &scores    = conf_scores.find(label)->second;
             const int                 loc_label = _info.share_location() ? -1 : label;
-            if(conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end())
+            if (conf_scores.find(label) == conf_scores.end() || decode_bboxes.find(loc_label) == decode_bboxes.end())
             {
                 // Either there are no confidence predictions
                 // or no location predictions for the current label.
                 ARM_COMPUTE_ERROR_VAR("Could not find predictions for the label %d.", label);
             }
             const std::vector<BBox> &bboxes  = decode_bboxes.find(loc_label)->second;
-            const std::vector<int> &indices = it.second;
+            const std::vector<int>  &indices = it.second;
 
-            for(auto idx : indices)
+            for (auto idx : indices)
             {
                 *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7))))     = i;
                 *(reinterpret_cast<float *>(_output->ptr_to_element(Coordinates(count * 7 + 1)))) = label;
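
Note: for orientation while reading the hunk above (not part of the patch),
each kept detection is written out as a row of seven floats:
[image_id, label, confidence, xmin, ymin, xmax, ymax]. A hypothetical helper
that reads one such row back from a flat float buffer:

#include <array>
#include <cstddef>

struct Detection
{
    float                image_id;
    float                label;
    float                confidence;
    std::array<float, 4> box; // xmin, ymin, xmax, ymax
};

// Row i of the output occupies elements [7 * i, 7 * i + 7).
Detection read_detection(const float *output, std::size_t row)
{
    const float *p = output + row * 7;
    return {p[0], p[1], p[2], {p[3], p[4], p[5], p[6]}};
}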
diff --git a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
index ecbc49b..2861d6c 100644
--- a/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
+++ b/src/runtime/CPP/functions/CPPDetectionPostProcessLayer.cpp
@@ -26,9 +26,9 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
 
 #include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
 
 #include <cstddef>
 #include <ios>
@@ -38,53 +38,76 @@
 {
 namespace
 {
-Status validate_arguments(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
-                          ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection,
-                          DetectionPostProcessLayerInfo info, const unsigned int kBatchSize, const unsigned int kNumCoordBox)
+Status validate_arguments(const ITensorInfo            *input_box_encoding,
+                          const ITensorInfo            *input_class_score,
+                          const ITensorInfo            *input_anchors,
+                          ITensorInfo                  *output_boxes,
+                          ITensorInfo                  *output_classes,
+                          ITensorInfo                  *output_scores,
+                          ITensorInfo                  *num_detection,
+                          DetectionPostProcessLayerInfo info,
+                          const unsigned int            kBatchSize,
+                          const unsigned int            kNumCoordBox)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input_box_encoding, input_class_score, input_anchors);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_box_encoding, 1, DataType::F32, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_box_encoding, input_anchors);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3, "The location input tensor shape should be [4, N, kBatchSize].");
-    if(input_box_encoding->num_dimensions() > 2)
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_box_encoding->num_dimensions() > 3,
+                                    "The location input tensor shape should be [4, N, kBatchSize].");
+    if (input_box_encoding->num_dimensions() > 2)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(2) != kBatchSize, "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(
+            input_box_encoding->dimension(2) != kBatchSize,
+            "The third dimension of the input box_encoding tensor should be equal to %d.", kBatchSize);
     }
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox, "The first dimension of the input box_encoding tensor should be equal to %d.", kNumCoordBox);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_class_score->dimension(0) != (info.num_classes() + 1),
-                                    "The first dimension of the input class_prediction should be equal to the number of classes plus one.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_box_encoding->dimension(0) != kNumCoordBox,
+                                        "The first dimension of the input box_encoding tensor should be equal to %d.",
+                                        kNumCoordBox);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(
+        input_class_score->dimension(0) != (info.num_classes() + 1),
+        "The first dimension of the input class_prediction should be equal to the number of classes plus one.");
 
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3, "The anchors input tensor shape should be [4, N, kBatchSize].");
-    if(input_anchors->num_dimensions() > 2)
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input_anchors->num_dimensions() > 3,
+                                    "The anchors input tensor shape should be [4, N, kBatchSize].");
+    if (input_anchors->num_dimensions() > 2)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox, "The first dimension of the input anchors tensor should be equal to %d.", kNumCoordBox);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(input_anchors->dimension(0) != kNumCoordBox,
+                                            "The first dimension of the input anchors tensor should be equal to %d.",
+                                            kNumCoordBox);
     }
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1))
-                                    || (input_box_encoding->dimension(1) != input_anchors->dimension(1)),
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((input_box_encoding->dimension(1) != input_class_score->dimension(1)) ||
+                                        (input_box_encoding->dimension(1) != input_anchors->dimension(1)),
                                     "The second dimension of the inputs should be the same.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1, "The num_detection output tensor shape should be [M].");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f), "The intersection over union should be positive and less than 1.");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0, "The number of max classes per detection should be positive.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_detection->num_dimensions() > 1,
+                                    "The num_detection output tensor shape should be [M].");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((info.iou_threshold() <= 0.0f) || (info.iou_threshold() > 1.0f),
+                                    "The intersection over union should be positive and less than 1.");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.max_classes_per_detection() <= 0,
+                                    "The number of max classes per detection should be positive.");
 
     const unsigned int num_detected_boxes = info.max_detections() * info.max_classes_per_detection();
 
     // Validate configured outputs
-    if(output_boxes->total_size() != 0)
+    if (output_boxes->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(), TensorShape(4U, num_detected_boxes, 1U));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_boxes->tensor_shape(),
+                                                           TensorShape(4U, num_detected_boxes, 1U));
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_boxes, 1, DataType::F32);
     }
-    if(output_classes->total_size() != 0)
+    if (output_classes->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(), TensorShape(num_detected_boxes, 1U));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_classes->tensor_shape(),
+                                                           TensorShape(num_detected_boxes, 1U));
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_classes, 1, DataType::F32);
     }
-    if(output_scores->total_size() != 0)
+    if (output_scores->total_size() != 0)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(), TensorShape(num_detected_boxes, 1U));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output_scores->tensor_shape(),
+                                                           TensorShape(num_detected_boxes, 1U));
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_scores, 1, DataType::F32);
     }
-    if(num_detection->total_size() != 0)
+    if (num_detection->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(num_detection->tensor_shape(), TensorShape(1U));
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_detection, 1, DataType::F32);
@@ -93,15 +116,18 @@
     return Status{};
 }
 
-inline void DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
+inline void
+DecodeBoxCorner(BBox &box_centersize, BBox &anchor, Iterator &decoded_it, DetectionPostProcessLayerInfo info)
 {
     const float half_factor = 0.5f;
 
     // BBox is equivalent to CenterSizeEncoding [y,x,h,w]
     const float y_center = box_centersize[0] / info.scale_value_y() * anchor[2] + anchor[0];
     const float x_center = box_centersize[1] / info.scale_value_x() * anchor[3] + anchor[1];
-    const float half_h   = half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
-    const float half_w   = half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
+    const float half_h =
+        half_factor * static_cast<float>(std::exp(box_centersize[2] / info.scale_value_h())) * anchor[2];
+    const float half_w =
+        half_factor * static_cast<float>(std::exp(box_centersize[3] / info.scale_value_w())) * anchor[3];
 
     // Decoded boxes are saved in box-corner encoding: [xmin, ymin, xmax, ymax]
     auto decoded_ptr   = reinterpret_cast<float *>(decoded_it.ptr());
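
Note: DecodeBoxCorner above is center-size decoding with per-coordinate scale
factors folded in. The same arithmetic as a free function (hypothetical name,
scales passed as plain parameters; anchors are [y_center, x_center, h, w] and
the result is [xmin, ymin, xmax, ymax]):

#include <array>
#include <cmath>

std::array<float, 4> decode_box_corner(const std::array<float, 4> &box,
                                       const std::array<float, 4> &anchor,
                                       float scale_y, float scale_x,
                                       float scale_h, float scale_w)
{
    const float y_center = box[0] / scale_y * anchor[2] + anchor[0];
    const float x_center = box[1] / scale_x * anchor[3] + anchor[1];
    const float half_h   = 0.5f * std::exp(box[2] / scale_h) * anchor[2];
    const float half_w   = 0.5f * std::exp(box[3] / scale_w) * anchor[3];
    return {x_center - half_w, y_center - half_h, x_center + half_w, y_center + half_h};
}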
@@ -118,12 +144,15 @@
  * @param[in]  info               The detection information.
  * @param[out] decoded_boxes      The decoded bboxes.
  */
-void DecodeCenterSizeBoxes(const ITensor *input_box_encoding, const ITensor *input_anchors, DetectionPostProcessLayerInfo info, Tensor *decoded_boxes)
+void DecodeCenterSizeBoxes(const ITensor                *input_box_encoding,
+                           const ITensor                *input_anchors,
+                           DetectionPostProcessLayerInfo info,
+                           Tensor                       *decoded_boxes)
 {
     const QuantizationInfo &qi_box     = input_box_encoding->info()->quantization_info();
     const QuantizationInfo &qi_anchors = input_anchors->info()->quantization_info();
-    BBox                    box_centersize{ {} };
-    BBox                    anchor{ {} };
+    BBox                    box_centersize{{}};
+    BBox                    anchor{{}};
 
     Window win;
     win.use_tensor_dimensions(input_box_encoding->info()->tensor_shape());
@@ -133,107 +162,155 @@
     Iterator anchor_it(input_anchors, win);
     Iterator decoded_it(decoded_boxes, win);
 
-    if(input_box_encoding->info()->data_type() == DataType::QASYMM8)
+    if (input_box_encoding->info()->data_type() == DataType::QASYMM8)
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto box_ptr    = reinterpret_cast<const qasymm8_t *>(box_it.ptr());
-            const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr());
-            box_centersize        = BBox({ dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box),
-                                           dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box)
-                                         });
-            anchor = BBox({ dequantize_qasymm8(*anchor_ptr, qi_anchors), dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors),
-                            dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)
-                          });
-            DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
-        },
-        box_it, anchor_it, decoded_it);
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
+            {
+                const auto box_ptr    = reinterpret_cast<const qasymm8_t *>(box_it.ptr());
+                const auto anchor_ptr = reinterpret_cast<const qasymm8_t *>(anchor_it.ptr());
+                box_centersize =
+                    BBox({dequantize_qasymm8(*box_ptr, qi_box), dequantize_qasymm8(*(box_ptr + 1), qi_box),
+                          dequantize_qasymm8(*(2 + box_ptr), qi_box), dequantize_qasymm8(*(3 + box_ptr), qi_box)});
+                anchor = BBox({dequantize_qasymm8(*anchor_ptr, qi_anchors),
+                               dequantize_qasymm8(*(anchor_ptr + 1), qi_anchors),
+                               dequantize_qasymm8(*(2 + anchor_ptr), qi_anchors),
+                               dequantize_qasymm8(*(3 + anchor_ptr), qi_anchors)});
+                DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+            },
+            box_it, anchor_it, decoded_it);
     }
-    else if(input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
+    else if (input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto box_ptr    = reinterpret_cast<const qasymm8_signed_t *>(box_it.ptr());
-            const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr());
-            box_centersize        = BBox({ dequantize_qasymm8_signed(*box_ptr, qi_box), dequantize_qasymm8_signed(*(box_ptr + 1), qi_box),
-                                           dequantize_qasymm8_signed(*(2 + box_ptr), qi_box), dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)
-                                         });
-            anchor = BBox({ dequantize_qasymm8_signed(*anchor_ptr, qi_anchors), dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors),
-                            dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors), dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)
-                          });
-            DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
-        },
-        box_it, anchor_it, decoded_it);
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
+            {
+                const auto box_ptr    = reinterpret_cast<const qasymm8_signed_t *>(box_it.ptr());
+                const auto anchor_ptr = reinterpret_cast<const qasymm8_signed_t *>(anchor_it.ptr());
+                box_centersize        = BBox({dequantize_qasymm8_signed(*box_ptr, qi_box),
+                                              dequantize_qasymm8_signed(*(box_ptr + 1), qi_box),
+                                              dequantize_qasymm8_signed(*(2 + box_ptr), qi_box),
+                                              dequantize_qasymm8_signed(*(3 + box_ptr), qi_box)});
+                anchor                = BBox({dequantize_qasymm8_signed(*anchor_ptr, qi_anchors),
+                                              dequantize_qasymm8_signed(*(anchor_ptr + 1), qi_anchors),
+                                              dequantize_qasymm8_signed(*(2 + anchor_ptr), qi_anchors),
+                                              dequantize_qasymm8_signed(*(3 + anchor_ptr), qi_anchors)});
+                DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+            },
+            box_it, anchor_it, decoded_it);
     }
     else
     {
-        execute_window_loop(win, [&](const Coordinates &)
-        {
-            const auto box_ptr    = reinterpret_cast<const float *>(box_it.ptr());
-            const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr());
-            box_centersize        = BBox({ *box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr) });
-            anchor                = BBox({ *anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr) });
-            DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
-        },
-        box_it, anchor_it, decoded_it);
+        execute_window_loop(
+            win,
+            [&](const Coordinates &)
+            {
+                const auto box_ptr    = reinterpret_cast<const float *>(box_it.ptr());
+                const auto anchor_ptr = reinterpret_cast<const float *>(anchor_it.ptr());
+                box_centersize        = BBox({*box_ptr, *(box_ptr + 1), *(2 + box_ptr), *(3 + box_ptr)});
+                anchor                = BBox({*anchor_ptr, *(anchor_ptr + 1), *(2 + anchor_ptr), *(3 + anchor_ptr)});
+                DecodeBoxCorner(box_centersize, anchor, decoded_it, info);
+            },
+            box_it, anchor_it, decoded_it);
     }
 }
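
Note: the dequantize_qasymm8 / dequantize_qasymm8_signed calls above apply the
usual affine mapping real = scale * (quantized - offset). A hand-rolled sketch
of the unsigned case (not the library helper):

#include <cstdint>

// Affine dequantization for a QASYMM8 value under a uniform quantization
// (scale, offset) pair.
float dequantize_u8(uint8_t q, float scale, int32_t offset)
{
    return scale * (static_cast<int32_t>(q) - offset);
}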
 
-void SaveOutputs(const Tensor *decoded_boxes, const std::vector<int> &result_idx_boxes_after_nms, const std::vector<float> &result_scores_after_nms, const std::vector<int> &result_classes_after_nms,
-                 std::vector<unsigned int> &sorted_indices, const unsigned int num_output, const unsigned int max_detections, ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores,
-                 ITensor *num_detection)
+void SaveOutputs(const Tensor              *decoded_boxes,
+                 const std::vector<int>    &result_idx_boxes_after_nms,
+                 const std::vector<float>  &result_scores_after_nms,
+                 const std::vector<int>    &result_classes_after_nms,
+                 std::vector<unsigned int> &sorted_indices,
+                 const unsigned int         num_output,
+                 const unsigned int         max_detections,
+                 ITensor                   *output_boxes,
+                 ITensor                   *output_classes,
+                 ITensor                   *output_scores,
+                 ITensor                   *num_detection)
 {
     // xmin,ymin,xmax,ymax -> ymin,xmin,ymax,xmax
     unsigned int i = 0;
-    for(; i < num_output; ++i)
+    for (; i < num_output; ++i)
     {
         const unsigned int box_in_idx = result_idx_boxes_after_nms[sorted_indices[i]];
-        *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx))));
-        *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx))));
-        *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx))));
-        *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx))));
-        *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = static_cast<float>(result_classes_after_nms[sorted_indices[i]]);
-        *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i))))  = result_scores_after_nms[sorted_indices[i]];
+        *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) =
+            *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(1, box_in_idx))));
+        *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) =
+            *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(0, box_in_idx))));
+        *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) =
+            *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(3, box_in_idx))));
+        *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) =
+            *(reinterpret_cast<float *>(decoded_boxes->ptr_to_element(Coordinates(2, box_in_idx))));
+        *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) =
+            static_cast<float>(result_classes_after_nms[sorted_indices[i]]);
+        *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i)))) =
+            result_scores_after_nms[sorted_indices[i]];
     }
-    for(; i < max_detections; ++i)
+    for (; i < max_detections; ++i)
     {
         *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(1, i)))) = 0.0f;
         *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(0, i)))) = 0.0f;
         *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(3, i)))) = 0.0f;
         *(reinterpret_cast<float *>(output_boxes->ptr_to_element(Coordinates(2, i)))) = 0.0f;
-        *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i)))) = 0.0f;
-        *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i))))  = 0.0f;
+        *(reinterpret_cast<float *>(output_classes->ptr_to_element(Coordinates(i))))  = 0.0f;
+        *(reinterpret_cast<float *>(output_scores->ptr_to_element(Coordinates(i))))   = 0.0f;
     }
     *(reinterpret_cast<float *>(num_detection->ptr_to_element(Coordinates(0)))) = num_output;
 }
 } // namespace
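
Note: SaveOutputs above also swaps the corner order, as its first comment says:
boxes are decoded as [xmin, ymin, xmax, ymax] but written out as
[ymin, xmin, ymax, xmax]. The reordering in isolation (hypothetical helper):

#include <array>

std::array<float, 4> to_yxyx(const std::array<float, 4> &xyxy)
{
    return {xyxy[1], xyxy[0], xyxy[3], xyxy[2]};
}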
 
 CPPDetectionPostProcessLayer::CPPDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _nms(), _input_box_encoding(nullptr), _input_scores(nullptr), _input_anchors(nullptr), _output_boxes(nullptr), _output_classes(nullptr),
-      _output_scores(nullptr), _num_detection(nullptr), _info(), _num_boxes(), _num_classes_with_background(), _num_max_detected_boxes(), _dequantize_scores(false), _decoded_boxes(), _decoded_scores(),
-      _selected_indices(), _class_scores(), _input_scores_to_use(nullptr)
+    : _memory_group(std::move(memory_manager)),
+      _nms(),
+      _input_box_encoding(nullptr),
+      _input_scores(nullptr),
+      _input_anchors(nullptr),
+      _output_boxes(nullptr),
+      _output_classes(nullptr),
+      _output_scores(nullptr),
+      _num_detection(nullptr),
+      _info(),
+      _num_boxes(),
+      _num_classes_with_background(),
+      _num_max_detected_boxes(),
+      _dequantize_scores(false),
+      _decoded_boxes(),
+      _decoded_scores(),
+      _selected_indices(),
+      _class_scores(),
+      _input_scores_to_use(nullptr)
 {
 }
 
-void CPPDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores,
-                                             const ITensor *input_anchors, ITensor *output_boxes, ITensor *output_classes,
-                                             ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info)
+void CPPDetectionPostProcessLayer::configure(const ITensor                *input_box_encoding,
+                                             const ITensor                *input_scores,
+                                             const ITensor                *input_anchors,
+                                             ITensor                      *output_boxes,
+                                             ITensor                      *output_classes,
+                                             ITensor                      *output_scores,
+                                             ITensor                      *num_detection,
+                                             DetectionPostProcessLayerInfo info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes,
+                                 output_scores);
     ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores,
                            num_detection, info);
 
     _num_max_detected_boxes = info.max_detections() * info.max_classes_per_detection();
 
-    auto_init_if_empty(*output_boxes->info(), TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
-    auto_init_if_empty(*output_classes->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
-    auto_init_if_empty(*output_scores->info(), TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
+    auto_init_if_empty(*output_boxes->info(),
+                       TensorInfo(TensorShape(_kNumCoordBox, _num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
+    auto_init_if_empty(*output_classes->info(),
+                       TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
+    auto_init_if_empty(*output_scores->info(),
+                       TensorInfo(TensorShape(_num_max_detected_boxes, _kBatchSize), 1, DataType::F32));
     auto_init_if_empty(*num_detection->info(), TensorInfo(TensorShape(1U), 1, DataType::F32));
 
     // Perform validation step
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(), output_scores->info(),
-                                                  num_detection->info(),
-                                                  info, _kBatchSize, _kNumCoordBox));
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(
+        input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(),
+        output_classes->info(), output_scores->info(), num_detection->info(), info, _kBatchSize, _kNumCoordBox));
 
     _input_box_encoding          = input_box_encoding;
     _input_scores                = input_scores;
@@ -245,13 +322,24 @@
     _info                        = info;
     _num_boxes                   = input_box_encoding->info()->dimension(1);
     _num_classes_with_background = _input_scores->info()->dimension(0);
-    _dequantize_scores           = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type()));
+    _dequantize_scores = (info.dequantize_scores() && is_data_type_quantized(input_box_encoding->info()->data_type()));
 
-    auto_init_if_empty(*_decoded_boxes.info(), TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1, DataType::F32));
-    auto_init_if_empty(*_decoded_scores.info(), TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize), 1, DataType::F32));
-    auto_init_if_empty(*_selected_indices.info(), TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1, DataType::S32));
+    auto_init_if_empty(*_decoded_boxes.info(),
+                       TensorInfo(TensorShape(_kNumCoordBox, _input_box_encoding->info()->dimension(1), _kBatchSize), 1,
+                                  DataType::F32));
+    auto_init_if_empty(
+        *_decoded_scores.info(),
+        TensorInfo(TensorShape(_input_scores->info()->dimension(0), _input_scores->info()->dimension(1), _kBatchSize),
+                   1, DataType::F32));
+    auto_init_if_empty(
+        *_selected_indices.info(),
+        TensorInfo(TensorShape(info.use_regular_nms() ? info.detection_per_class() : info.max_detections()), 1,
+                   DataType::S32));
     const unsigned int num_classes_per_box = std::min(info.max_classes_per_detection(), info.num_classes());
-    auto_init_if_empty(*_class_scores.info(), TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1, DataType::F32));
+    auto_init_if_empty(
+        *_class_scores.info(),
+        TensorInfo(info.use_regular_nms() ? TensorShape(_num_boxes) : TensorShape(_num_boxes * num_classes_per_box), 1,
+                   DataType::F32));
 
     _input_scores_to_use = _dequantize_scores ? &_decoded_scores : _input_scores;
 
@@ -260,7 +348,9 @@
     _memory_group.manage(&_decoded_scores);
     _memory_group.manage(&_selected_indices);
     _memory_group.manage(&_class_scores);
-    _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices, info.use_regular_nms() ? info.detection_per_class() : info.max_detections(), info.nms_score_threshold(), info.iou_threshold());
+    _nms.configure(&_decoded_boxes, &_class_scores, &_selected_indices,
+                   info.use_regular_nms() ? info.detection_per_class() : info.max_detections(),
+                   info.nms_score_threshold(), info.iou_threshold());
 
     // Allocate and reserve intermediate tensors and vectors
     _decoded_boxes.allocator()->allocate();
@@ -269,18 +359,28 @@
     _class_scores.allocator()->allocate();
 }
 
-Status CPPDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_class_score, const ITensorInfo *input_anchors,
-                                              ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info)
+Status CPPDetectionPostProcessLayer::validate(const ITensorInfo            *input_box_encoding,
+                                              const ITensorInfo            *input_class_score,
+                                              const ITensorInfo            *input_anchors,
+                                              ITensorInfo                  *output_boxes,
+                                              ITensorInfo                  *output_classes,
+                                              ITensorInfo                  *output_scores,
+                                              ITensorInfo                  *num_detection,
+                                              DetectionPostProcessLayerInfo info)
 {
-    constexpr unsigned int kBatchSize             = 1;
-    constexpr unsigned int kNumCoordBox           = 4;
-    const TensorInfo       _decoded_boxes_info    = TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32);
-    const TensorInfo       _decoded_scores_info   = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32);
-    const TensorInfo       _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32);
+    constexpr unsigned int kBatchSize   = 1;
+    constexpr unsigned int kNumCoordBox = 4;
+    const TensorInfo       _decoded_boxes_info =
+        TensorInfo(TensorShape(kNumCoordBox, input_box_encoding->dimension(1)), 1, DataType::F32);
+    const TensorInfo _decoded_scores_info = TensorInfo(TensorShape(input_box_encoding->dimension(1)), 1, DataType::F32);
+    const TensorInfo _selected_indices_info = TensorInfo(TensorShape(info.max_detections()), 1, DataType::S32);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info, &_selected_indices_info, info.max_detections(), info.nms_score_threshold(),
-                                                                   info.iou_threshold()));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes, output_classes, output_scores, num_detection, info, kBatchSize, kNumCoordBox));
+    ARM_COMPUTE_RETURN_ON_ERROR(CPPNonMaximumSuppression::validate(&_decoded_boxes_info, &_decoded_scores_info,
+                                                                   &_selected_indices_info, info.max_detections(),
+                                                                   info.nms_score_threshold(), info.iou_threshold()));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input_box_encoding, input_class_score, input_anchors, output_boxes,
+                                                   output_classes, output_scores, num_detection, info, kBatchSize,
+                                                   kNumCoordBox));
 
     return Status{};
 }
@@ -293,62 +393,69 @@
     DecodeCenterSizeBoxes(_input_box_encoding, _input_anchors, _info, &_decoded_boxes);
 
     // Decode scores if necessary
-    if(_dequantize_scores)
+    if (_dequantize_scores)
     {
-        if(_input_box_encoding->info()->data_type() == DataType::QASYMM8)
+        if (_input_box_encoding->info()->data_type() == DataType::QASYMM8)
         {
-            for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
+            for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
             {
-                for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
+                for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
                 {
                     *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
-                        dequantize_qasymm8(*(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
+                        dequantize_qasymm8(
+                            *(reinterpret_cast<qasymm8_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))),
+                            _input_scores->info()->quantization_info());
                 }
             }
         }
-        else if(_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
+        else if (_input_box_encoding->info()->data_type() == DataType::QASYMM8_SIGNED)
         {
-            for(unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
+            for (unsigned int idx_c = 0; idx_c < _num_classes_with_background; ++idx_c)
             {
-                for(unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
+                for (unsigned int idx_b = 0; idx_b < _num_boxes; ++idx_b)
                 {
                     *(reinterpret_cast<float *>(_decoded_scores.ptr_to_element(Coordinates(idx_c, idx_b)))) =
-                        dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>(_input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))), _input_scores->info()->quantization_info());
+                        dequantize_qasymm8_signed(*(reinterpret_cast<qasymm8_signed_t *>(
+                                                      _input_scores->ptr_to_element(Coordinates(idx_c, idx_b)))),
+                                                  _input_scores->info()->quantization_info());
                 }
             }
         }
     }
 
     // Regular NMS
-    if(_info.use_regular_nms())
+    if (_info.use_regular_nms())
     {
         std::vector<int>          result_idx_boxes_after_nms;
         std::vector<int>          result_classes_after_nms;
         std::vector<float>        result_scores_after_nms;
         std::vector<unsigned int> sorted_indices;
 
-        for(unsigned int c = 0; c < num_classes; ++c)
+        for (unsigned int c = 0; c < num_classes; ++c)
         {
             // For each box, get its score for class c
-            for(unsigned int i = 0; i < _num_boxes; ++i)
+            for (unsigned int i = 0; i < _num_boxes; ++i)
             {
                 *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(i)))) =
-                    *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1
+                    *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(
+                        Coordinates(c + 1, i)))); // i * _num_classes_with_background + c + 1
             }
 
             // Run Non-maxima Suppression
             _nms.run();
 
-            for(unsigned int i = 0; i < _info.detection_per_class(); ++i)
+            for (unsigned int i = 0; i < _info.detection_per_class(); ++i)
             {
-                const auto selected_index = *(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i))));
-                if(selected_index == -1)
+                const auto selected_index =
+                    *(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i))));
+                if (selected_index == -1)
                 {
                     // NMS returns -1 for all trailing entries that are not valid
                     break;
                 }
                 result_idx_boxes_after_nms.emplace_back(selected_index);
-                result_scores_after_nms.emplace_back((reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]);
+                result_scores_after_nms.emplace_back(
+                    (reinterpret_cast<float *>(_class_scores.buffer()))[selected_index]);
                 result_classes_after_nms.emplace_back(c);
             }
         }
@@ -360,49 +467,46 @@
         // Sort selected indices based on result scores
         sorted_indices.resize(num_selected);
         std::iota(sorted_indices.begin(), sorted_indices.end(), 0);
-        std::partial_sort(sorted_indices.data(),
-                          sorted_indices.data() + num_output,
+        std::partial_sort(sorted_indices.data(), sorted_indices.data() + num_output,
                           sorted_indices.data() + num_selected,
                           [&](unsigned int first, unsigned int second)
-        {
+                          { return result_scores_after_nms[first] > result_scores_after_nms[second]; });
 
-            return result_scores_after_nms[first] > result_scores_after_nms[second];
-        });
-
-        SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms, sorted_indices,
-                    num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
+        SaveOutputs(&_decoded_boxes, result_idx_boxes_after_nms, result_scores_after_nms, result_classes_after_nms,
+                    sorted_indices, num_output, max_detections, _output_boxes, _output_classes, _output_scores,
+                    _num_detection);
     }
     // Fast NMS
     else
     {
-        const unsigned int num_classes_per_box = std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes());
+        const unsigned int num_classes_per_box =
+            std::min<unsigned int>(_info.max_classes_per_detection(), _info.num_classes());
         std::vector<float> max_scores;
         std::vector<int>   box_indices;
         std::vector<int>   max_score_classes;
 
-        for(unsigned int b = 0; b < _num_boxes; ++b)
+        for (unsigned int b = 0; b < _num_boxes; ++b)
         {
             std::vector<float> box_scores;
-            for(unsigned int c = 0; c < num_classes; ++c)
+            for (unsigned int c = 0; c < num_classes; ++c)
             {
-                box_scores.emplace_back(*(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b)))));
+                box_scores.emplace_back(
+                    *(reinterpret_cast<float *>(_input_scores_to_use->ptr_to_element(Coordinates(c + 1, b)))));
             }
 
             std::vector<unsigned int> max_score_indices;
             max_score_indices.resize(_info.num_classes());
             std::iota(max_score_indices.data(), max_score_indices.data() + _info.num_classes(), 0);
-            std::partial_sort(max_score_indices.data(),
-                              max_score_indices.data() + num_classes_per_box,
+            std::partial_sort(max_score_indices.data(), max_score_indices.data() + num_classes_per_box,
                               max_score_indices.data() + num_classes,
                               [&](unsigned int first, unsigned int second)
-            {
-                return box_scores[first] > box_scores[second];
-            });
+                              { return box_scores[first] > box_scores[second]; });
 
-            for(unsigned int i = 0; i < num_classes_per_box; ++i)
+            for (unsigned int i = 0; i < num_classes_per_box; ++i)
             {
-                const float score_to_add                                                                             = box_scores[max_score_indices[i]];
-                *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) = score_to_add;
+                const float score_to_add = box_scores[max_score_indices[i]];
+                *(reinterpret_cast<float *>(_class_scores.ptr_to_element(Coordinates(b * num_classes_per_box + i)))) =
+                    score_to_add;
                 max_scores.emplace_back(score_to_add);
                 box_indices.emplace_back(b);
                 max_score_classes.emplace_back(max_score_indices[i]);
@@ -412,10 +516,10 @@
         // Run Non-maxima Suppression
         _nms.run();
         std::vector<unsigned int> selected_indices;
-        for(unsigned int i = 0; i < max_detections; ++i)
+        for (unsigned int i = 0; i < max_detections; ++i)
         {
             // NMS returns M valid indices; the invalid tail is filled with -1
-            if(*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))) == -1)
+            if (*(reinterpret_cast<int *>(_selected_indices.ptr_to_element(Coordinates(i)))) == -1)
             {
                 // NMS returns -1 for all trailing entries that are not valid
                 break;
@@ -425,8 +529,8 @@
         // Select at most max_detections of the highest-scoring boxes across all classes
         const auto num_output = std::min<unsigned int>(_info.max_detections(), selected_indices.size());
 
-        SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices,
-                    num_output, max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
+        SaveOutputs(&_decoded_boxes, box_indices, max_scores, max_score_classes, selected_indices, num_output,
+                    max_detections, _output_boxes, _output_classes, _output_scores, _num_detection);
     }
 }
 } // namespace arm_compute
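
Note on the wrapping above: the revised .clang-format file is not part of this delivery, so the exact settings cannot be quoted here. As a plausible sketch only, the rewrapped calls in CPPDetectionPostProcessLayer.cpp (long argument lists broken at a wider limit, continuation lines aligned with the opening parenthesis, a space after control-flow keywords) are consistent with clang-format 14 options such as:

    # Sketch only; assumed values, not the delivered configuration.
    ColumnLimit: 120
    AlignAfterOpenBracket: Align
    BinPackArguments: true
    SpaceBeforeParens: ControlStatements
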
diff --git a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
index 6d01b12..3217742 100644
--- a/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
+++ b/src/runtime/CPP/functions/CPPNonMaximumSuppression.cpp
@@ -29,9 +29,12 @@
 
 namespace arm_compute
 {
-void CPPNonMaximumSuppression::configure(
-    const ITensor *bboxes, const ITensor *scores, ITensor *indices, unsigned int max_output_size,
-    const float score_threshold, const float nms_threshold)
+void CPPNonMaximumSuppression::configure(const ITensor *bboxes,
+                                         const ITensor *scores,
+                                         ITensor       *indices,
+                                         unsigned int   max_output_size,
+                                         const float    score_threshold,
+                                         const float    nms_threshold)
 {
     ARM_COMPUTE_LOG_PARAMS(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold);
 
@@ -40,10 +43,14 @@
     _kernel = std::move(k);
 }
 
-Status CPPNonMaximumSuppression::validate(
-    const ITensorInfo *bboxes, const ITensorInfo *scores, const ITensorInfo *indices, unsigned int max_output_size,
-    const float score_threshold, const float nms_threshold)
+Status CPPNonMaximumSuppression::validate(const ITensorInfo *bboxes,
+                                          const ITensorInfo *scores,
+                                          const ITensorInfo *indices,
+                                          unsigned int       max_output_size,
+                                          const float        score_threshold,
+                                          const float        nms_threshold)
 {
-    return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold, nms_threshold);
+    return CPPNonMaximumSuppressionKernel::validate(bboxes, scores, indices, max_output_size, score_threshold,
+                                                    nms_threshold);
 }
 } // namespace arm_compute
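
Note on parameter lists: declarations that no longer fit on one line, such as CPPNonMaximumSuppression::configure/validate above, are broken with one parameter per line, the parameter names aligned into a column and the * or & attached to the name. A hedged sketch of matching clang-format 14 options (assumed, not quoted from the delivery):

    # Sketch only; one parameter per line with aligned names when wrapping occurs.
    BinPackParameters: false
    AlignConsecutiveDeclarations: Consecutive
    PointerAlignment: Right
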
diff --git a/src/runtime/CPP/functions/CPPTopKV.cpp b/src/runtime/CPP/functions/CPPTopKV.cpp
index 62a7473..3d64def 100644
--- a/src/runtime/CPP/functions/CPPTopKV.cpp
+++ b/src/runtime/CPP/functions/CPPTopKV.cpp
@@ -38,7 +38,10 @@
     _kernel = std::move(kernel);
 }
 
-Status CPPTopKV::validate(const ITensorInfo *predictions, const ITensorInfo *targets, ITensorInfo *output, const unsigned int k)
+Status CPPTopKV::validate(const ITensorInfo *predictions,
+                          const ITensorInfo *targets,
+                          ITensorInfo       *output,
+                          const unsigned int k)
 {
     return CPPTopKVKernel::validate(predictions, targets, output, k);
 }
diff --git a/src/runtime/IScheduler.cpp b/src/runtime/IScheduler.cpp
index 436fd9c..ecf84ab 100644
--- a/src/runtime/IScheduler.cpp
+++ b/src/runtime/IScheduler.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Log.h"
 #include "arm_compute/core/Window.h"
+
 #include "src/common/cpuinfo/CpuInfo.h"
 #include "src/runtime/SchedulerUtils.h"
 
@@ -59,7 +60,7 @@
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
 #ifndef BARE_METAL
     const Window &max_window = window;
-    if(hints.split_dimension() == IScheduler::split_dimensions_all)
+    if (hints.split_dimension() == IScheduler::split_dimensions_all)
     {
         /*
          * if the split dim is size_t max then this signals we should parallelise over
@@ -73,27 +74,27 @@
         std::tie(m_threads, n_threads) = scheduler_utils::split_2d(this->num_threads(), m, n);
 
         std::vector<IScheduler::Workload> workloads;
-        for(unsigned int ni = 0; ni != n_threads; ++ni)
+        for (unsigned int ni = 0; ni != n_threads; ++ni)
         {
-            for(unsigned int mi = 0; mi != m_threads; ++mi)
+            for (unsigned int mi = 0; mi != m_threads; ++mi)
             {
                 workloads.push_back(
-                    [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo & info)
-                {
-                    //narrow the window to our mi-ni workload
-                    Window win = max_window.split_window(Window::DimX, mi, m_threads)
-                                 .split_window(Window::DimY, ni, n_threads);
+                    [ni, mi, m_threads, n_threads, &max_window, &kernel](const ThreadInfo &info)
+                    {
+                        //narrow the window to our mi-ni workload
+                        Window win = max_window.split_window(Window::DimX, mi, m_threads)
+                                         .split_window(Window::DimY, ni, n_threads);
 
-                    win.validate();
+                        win.validate();
 
-                    Window thread_locator;
-                    thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
-                    thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
+                        Window thread_locator;
+                        thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
+                        thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
 
-                    thread_locator.validate();
+                        thread_locator.validate();
 
-                    kernel->run_nd(win, info, thread_locator);
-                });
+                        kernel->run_nd(win, info, thread_locator);
+                    });
             }
         }
         run_workloads(workloads);
@@ -103,16 +104,16 @@
         const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
         const unsigned int num_threads    = std::min(num_iterations, this->num_threads());
 
-        if(num_iterations == 0)
+        if (num_iterations == 0)
         {
             return;
         }
 
-        if(!kernel->is_parallelisable() || num_threads == 1)
+        if (!kernel->is_parallelisable() || num_threads == 1)
         {
             ThreadInfo info;
             info.cpu_info = &cpu_info();
-            if(tensors.empty())
+            if (tensors.empty())
             {
                 kernel->run(max_window, info);
             }
@@ -124,14 +125,15 @@
         else
         {
             unsigned int num_windows = 0;
-            switch(hints.strategy())
+            switch (hints.strategy())
             {
                 case StrategyHint::STATIC:
                     num_windows = num_threads;
                     break;
                 case StrategyHint::DYNAMIC:
                 {
-                    const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
+                    const unsigned int granule_threshold =
+                        (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
                     // Make sure we don't use windows that are too small, as this might create contention on the ThreadFeeder
                     num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
                     break;
@@ -143,15 +145,15 @@
             num_windows = adjust_num_of_windows(max_window, hints.split_dimension(), num_windows, *kernel, cpu_info());
 
             std::vector<IScheduler::Workload> workloads(num_windows);
-            for(unsigned int t = 0; t < num_windows; ++t)
+            for (unsigned int t = 0; t < num_windows; ++t)
             {
                 //Capture 't' by copy, all the other variables by reference:
-                workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info)
+                workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info)
                 {
                     Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
                     win.validate();
 
-                    if(tensors.empty())
+                    if (tensors.empty())
                     {
                         kernel->run(win, info);
                     }
@@ -175,36 +177,43 @@
     run_workloads(workloads);
 }
 
-std::size_t IScheduler::adjust_num_of_windows(const Window &window, std::size_t split_dimension, std::size_t init_num_windows, const ICPPKernel &kernel, const CPUInfo &cpu_info)
+std::size_t IScheduler::adjust_num_of_windows(const Window     &window,
+                                              std::size_t       split_dimension,
+                                              std::size_t       init_num_windows,
+                                              const ICPPKernel &kernel,
+                                              const CPUInfo    &cpu_info)
 {
     // Mitigation of the narrow split issue, which occurs when the split dimension is too small to split (hence "narrow").
-    if(window.num_iterations(split_dimension) < init_num_windows)
+    if (window.num_iterations(split_dimension) < init_num_windows)
     {
         auto recommended_split_dim = Window::DimX;
-        for(std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims)
+        for (std::size_t dims = Window::DimY; dims <= Window::DimW; ++dims)
         {
-            if(window.num_iterations(recommended_split_dim) < window.num_iterations(dims))
+            if (window.num_iterations(recommended_split_dim) < window.num_iterations(dims))
             {
                 recommended_split_dim = dims;
             }
         }
-        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("%zu dimension is not a suitable dimension to split the workload. Recommended: %zu recommended_split_dim", split_dimension,
-                                                  recommended_split_dim);
+        ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE(
+            "%zu dimension is not a suitable dimension to split the workload. Recommended: %zu recommended_split_dim",
+            split_dimension, recommended_split_dim);
     }
 
-    for(auto t = init_num_windows; t > 0; --t) // Trying the highest number of windows, init_num_windows, first
+    for (auto t = init_num_windows; t > 0; --t) // Trying the highest number of windows, init_num_windows, first
     {
         // Try splitting the workload into t, subject to each subworkload size <= mws.
-        if((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t)
+        if ((window.num_iterations(split_dimension) / kernel.get_mws(cpu_info, t)) >= t)
         {
-            if(t != init_num_windows)
+            if (t != init_num_windows)
             {
-                ARM_COMPUTE_LOG_INFO_MSG_CORE("The scheduler is using a different thread count than the one assigned by the user.");
+                ARM_COMPUTE_LOG_INFO_MSG_CORE(
+                    "The scheduler is using a different thread count than the one assigned by the user.");
             }
             return t;
         }
     }
-    ARM_COMPUTE_LOG_INFO_MSG_CORE("The scheduler is using single thread instead of the thread count assigned by the user.");
+    ARM_COMPUTE_LOG_INFO_MSG_CORE(
+        "The scheduler is using single thread instead of the thread count assigned by the user.");
     return 1; //  If the workload is so small that it can't be split, we should run a single thread
 }
 
diff --git a/src/runtime/ISimpleLifetimeManager.cpp b/src/runtime/ISimpleLifetimeManager.cpp
index a6bc950..8e5b62a 100644
--- a/src/runtime/ISimpleLifetimeManager.cpp
+++ b/src/runtime/ISimpleLifetimeManager.cpp
@@ -43,7 +43,7 @@
 
 void ISimpleLifetimeManager::register_group(IMemoryGroup *group)
 {
-    if(_active_group == nullptr)
+    if (_active_group == nullptr)
     {
         ARM_COMPUTE_ERROR_ON(group == nullptr);
         _active_group = group;
@@ -52,12 +52,12 @@
 
 bool ISimpleLifetimeManager::release_group(IMemoryGroup *group)
 {
-    if(group == nullptr)
+    if (group == nullptr)
     {
         return false;
     }
     const bool status = bool(_finalized_groups.erase(group));
-    if(status)
+    if (status)
     {
         group->mappings().clear();
     }
@@ -67,12 +67,13 @@
 void ISimpleLifetimeManager::start_lifetime(void *obj)
 {
     ARM_COMPUTE_ERROR_ON(obj == nullptr);
-    ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements), "Memory object is already registered!");
+    ARM_COMPUTE_ERROR_ON_MSG(_active_elements.find(obj) != std::end(_active_elements),
+                             "Memory object is already registered!");
 
     // Check if there is a free blob
-    if(_free_blobs.empty())
+    if (_free_blobs.empty())
     {
-        _occupied_blobs.emplace_front(Blob{ obj, 0, 0, { obj } });
+        _occupied_blobs.emplace_front(Blob{obj, 0, 0, {obj}});
     }
     else
     {
@@ -100,10 +101,8 @@
     el.status    = true;
 
     // Find object in the occupied lists
-    auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs), [&obj](const Blob & b)
-    {
-        return obj == b.id;
-    });
+    auto occupied_blob_it = std::find_if(std::begin(_occupied_blobs), std::end(_occupied_blobs),
+                                         [&obj](const Blob &b) { return obj == b.id; });
     ARM_COMPUTE_ERROR_ON(occupied_blob_it == std::end(_occupied_blobs));
 
     // Update occupied blob and return as free
@@ -114,7 +113,7 @@
     _free_blobs.splice(std::begin(_free_blobs), _occupied_blobs, occupied_blob_it);
 
     // Check if all objects are finalized and reset active group
-    if(are_all_finalized())
+    if (are_all_finalized())
     {
         ARM_COMPUTE_ERROR_ON(!_occupied_blobs.empty());
 
@@ -133,9 +132,7 @@
 
 bool ISimpleLifetimeManager::are_all_finalized() const
 {
-    return !std::any_of(std::begin(_active_elements), std::end(_active_elements), [](const std::pair<void *, Element> &e)
-    {
-        return !e.second.status;
-    });
+    return !std::any_of(std::begin(_active_elements), std::end(_active_elements),
+                        [](const std::pair<void *, Element> &e) { return !e.second.status; });
 }
 } // namespace arm_compute
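
Note on lambdas: the IScheduler.cpp and ISimpleLifetimeManager.cpp hunks show the new lambda treatment: short bodies may stay on a single line, while multi-line bodies get the opening brace on its own line and are indented relative to the lambda signature. A sketch of clang-format 14 options consistent with this (the BraceWrapping block in particular is an assumption):

    # Sketch only; lambda formatting as seen in the hunks above.
    AllowShortLambdasOnASingleLine: All
    LambdaBodyIndentation: Signature
    BreakBeforeBraces: Custom
    BraceWrapping:
      AfterFunction: true
      AfterControlStatement: Always
      BeforeLambdaBody: true
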
diff --git a/src/runtime/IWeightsManager.cpp b/src/runtime/IWeightsManager.cpp
index 373c50c..96287dc 100644
--- a/src/runtime/IWeightsManager.cpp
+++ b/src/runtime/IWeightsManager.cpp
@@ -25,14 +25,13 @@
 
 namespace arm_compute
 {
-IWeightsManager::IWeightsManager()
-    : _managed_weights(), _managed_counter(), _managed_weights_parents()
+IWeightsManager::IWeightsManager() : _managed_weights(), _managed_counter(), _managed_weights_parents()
 {
 }
 
 void IWeightsManager::manage(const ITensor *weights, ITransformWeights *parent)
 {
-    if(!are_weights_managed(weights))
+    if (!are_weights_managed(weights))
     {
         _managed_weights[weights];
         _managed_counter[weights];
@@ -44,9 +43,9 @@
 
     // In case the weights are an output of a previous reshape function
     // store the parent's link
-    if(parent != nullptr)
+    if (parent != nullptr)
     {
-        if(_managed_weights_parents.find(weights) == _managed_weights_parents.end())
+        if (_managed_weights_parents.find(weights) == _managed_weights_parents.end())
         {
             _managed_weights_parents[weights] = parent;
         }
@@ -59,13 +58,13 @@
 
     // Find out if I already have the same weights with a transform applied. If I do, don't run the reshape
     auto     item = _managed_weights.find(weights);
-    bool     perform_run{ true };
-    ITensor *weights_tensor{ nullptr };
+    bool     perform_run{true};
+    ITensor *weights_tensor{nullptr};
 
     // Check if I already have the requested transform and I have run the reshape function
-    for(auto it : item->second)
+    for (auto it : item->second)
     {
-        if(it->is_reshape_run() && (it->uid() == weights_transform->uid()))
+        if (it->is_reshape_run() && (it->uid() == weights_transform->uid()))
         {
             weights_tensor = it->get_weights();
             perform_run    = false;
@@ -73,7 +72,7 @@
         }
     }
 
-    if(perform_run)
+    if (perform_run)
     {
         weights_transform->run();
         weights_tensor = weights_transform->get_weights();
@@ -81,10 +80,10 @@
 
     // Check if we can release memory from parent
     auto parent_item = _managed_weights_parents.find(weights);
-    if(parent_item != _managed_weights_parents.end())
+    if (parent_item != _managed_weights_parents.end())
     {
         int32_t refcount = parent_item->second->decrease_refcount();
-        if(refcount == 0)
+        if (refcount == 0)
         {
             parent_item->second->release();
         }
@@ -92,20 +91,20 @@
 
     // Check top level weights. If all the transformations are done
     // mark the weights as unused
-    if(_managed_weights_parents.find(weights) == _managed_weights_parents.end())
+    if (_managed_weights_parents.find(weights) == _managed_weights_parents.end())
     {
         auto item           = _managed_weights.find(weights);
         bool mark_as_unused = true;
-        for(auto it : item->second)
+        for (auto it : item->second)
         {
-            if(!it->is_reshape_run())
+            if (!it->is_reshape_run())
             {
                 mark_as_unused = false;
                 break;
             }
         }
 
-        if(mark_as_unused)
+        if (mark_as_unused)
         {
             weights->mark_as_unused();
         }
@@ -123,15 +122,15 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(!are_weights_managed(weights), "Cannot acquire weights. Weights are not managed");
 
-    ITensor *transformed_weights{ nullptr };
+    ITensor *transformed_weights{nullptr};
     auto     item = _managed_weights.find(weights);
 
     // Check if I already have the requested transform. If I do,
     // increase the refcount of the transformed weights object and
     // reuse the tensor
-    for(auto it : item->second)
+    for (auto it : item->second)
     {
-        if(it->uid() == weights_transform->uid())
+        if (it->uid() == weights_transform->uid())
         {
             transformed_weights = it->get_weights();
             it->increase_refcount();
@@ -139,7 +138,7 @@
         }
     }
 
-    if(transformed_weights == nullptr)
+    if (transformed_weights == nullptr)
     {
         transformed_weights = weights_transform->get_weights();
         weights_transform->increase_refcount();
@@ -154,13 +153,13 @@
 
 void IWeightsManager::release(const ITensor *weights)
 {
-    if(weights == nullptr || !are_weights_managed(weights))
+    if (weights == nullptr || !are_weights_managed(weights))
     {
         return;
     }
 
     _managed_counter[weights].counter--;
-    if(_managed_counter[weights].counter == 0 && _managed_counter[weights].is_unused)
+    if (_managed_counter[weights].counter == 0 && _managed_counter[weights].is_unused)
     {
         weights->mark_as_unused();
     }
@@ -168,7 +167,7 @@
 
 void IWeightsManager::pre_mark_as_unused(const ITensor *weights)
 {
-    if(weights == nullptr || !are_weights_managed(weights))
+    if (weights == nullptr || !are_weights_managed(weights))
     {
         return;
     }
diff --git a/src/runtime/Memory.cpp b/src/runtime/Memory.cpp
index ac0a325..90fd025 100644
--- a/src/runtime/Memory.cpp
+++ b/src/runtime/Memory.cpp
@@ -27,20 +27,17 @@
 
 namespace arm_compute
 {
-Memory::Memory()
-    : _region(nullptr), _region_owned(nullptr)
+Memory::Memory() : _region(nullptr), _region_owned(nullptr)
 {
 }
 
-Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory)
-    : _region(nullptr), _region_owned(memory)
+Memory::Memory(const std::shared_ptr<IMemoryRegion> &memory) : _region(nullptr), _region_owned(memory)
 {
     _region_owned = memory;
     _region       = _region_owned.get();
 }
 
-Memory::Memory(IMemoryRegion *memory)
-    : _region(memory), _region_owned(nullptr)
+Memory::Memory(IMemoryRegion *memory) : _region(memory), _region_owned(nullptr)
 {
     _region = memory;
 }
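
Note on constructors: member-initializer lists that fit within the column limit are joined onto the constructor line, as in the Memory constructors above. In clang-format 14 this behaviour is controlled by PackConstructorInitializers; the value used here is an assumption:

    # Sketch only; keep short initializer lists on the constructor line.
    PackConstructorInitializers: NextLine
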
diff --git a/src/runtime/MemoryManagerOnDemand.cpp b/src/runtime/MemoryManagerOnDemand.cpp
index 2e418ae..5fa9ea4 100644
--- a/src/runtime/MemoryManagerOnDemand.cpp
+++ b/src/runtime/MemoryManagerOnDemand.cpp
@@ -31,7 +31,8 @@
 
 namespace arm_compute
 {
-MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager, std::shared_ptr<IPoolManager> pool_manager)
+MemoryManagerOnDemand::MemoryManagerOnDemand(std::shared_ptr<ILifetimeManager> lifetime_manager,
+                                             std::shared_ptr<IPoolManager>     pool_manager)
     : _lifetime_mgr(std::move(lifetime_manager)), _pool_mgr(std::move(pool_manager))
 {
     ARM_COMPUTE_ERROR_ON_MSG(!_lifetime_mgr, "Lifetime manager not specified correctly!");
@@ -57,7 +58,7 @@
 
     // Create pools
     auto pool_template = _lifetime_mgr->create_pool(&allocator);
-    for(int i = num_pools; i > 1; --i)
+    for (int i = num_pools; i > 1; --i)
     {
         auto pool = pool_template->duplicate();
         _pool_mgr->register_pool(std::move(pool));
diff --git a/src/runtime/NEON/INEOperator.cpp b/src/runtime/NEON/INEOperator.cpp
index a5fc0a2..fcfd325 100644
--- a/src/runtime/NEON/INEOperator.cpp
+++ b/src/runtime/NEON/INEOperator.cpp
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/NEON/INEOperator.h"
+
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/core/NEON/INEKernel.h"
 
 namespace arm_compute
@@ -32,14 +34,13 @@
 {
 INEOperator::~INEOperator() = default;
 
-INEOperator::INEOperator(IRuntimeContext *ctx)
-    : _kernel(), _ctx(ctx), _workspace()
+INEOperator::INEOperator(IRuntimeContext *ctx) : _kernel(), _ctx(ctx), _workspace()
 {
 }
 
 void INEOperator::run(ITensorPack &tensors)
 {
-    if(tensors.empty())
+    if (tensors.empty())
     {
         ARM_COMPUTE_ERROR("No inputs provided");
     }
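
Note on includes: headers are regrouped into blocks separated by blank lines (the file's own header, then arm_compute/ headers, then src/ headers, then system headers) and sorted within each block, as in the INEOperator.cpp hunk above. The real category regexes are not part of this delivery; a typical clang-format 14 expression of such grouping would be:

    # Sketch only; priorities and regexes are assumptions.
    SortIncludes: CaseSensitive
    IncludeBlocks: Regroup
    IncludeCategories:
      - Regex: '^"arm_compute/'
        Priority: 1
      - Regex: '^"src/'
        Priority: 2
      - Regex: '^<'
        Priority: 3
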
diff --git a/src/runtime/NEON/INESimpleFunction.cpp b/src/runtime/NEON/INESimpleFunction.cpp
index 5438bce..b697722 100644
--- a/src/runtime/NEON/INESimpleFunction.cpp
+++ b/src/runtime/NEON/INESimpleFunction.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/CPP/ICPPKernel.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/core/NEON/kernels/NEFillBorderKernel.h"
 
 namespace arm_compute
@@ -33,8 +34,7 @@
 INESimpleFunction::~INESimpleFunction() = default;
 
 INESimpleFunction::INESimpleFunction() // NOLINT
-    : _kernel(),
-      _border_handler()
+    : _kernel(), _border_handler()
 {
 }
 
diff --git a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
index 21dd58e..04bff9f 100644
--- a/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
+++ b/src/runtime/NEON/INESimpleFunctionNoBorder.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/core/NEON/INEKernel.h"
 #include "src/runtime/Utils.h"
 
@@ -32,9 +33,7 @@
 {
 INESimpleFunctionNoBorder::~INESimpleFunctionNoBorder() = default;
 
-INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx)
-    : _kernel(),
-      _ctx(ctx)
+INESimpleFunctionNoBorder::INESimpleFunctionNoBorder(IRuntimeContext *ctx) : _kernel(), _ctx(ctx)
 {
 }
 
diff --git a/src/runtime/NEON/functions/NEActivationLayer.cpp b/src/runtime/NEON/functions/NEActivationLayer.cpp
index e48aede..5919945 100644
--- a/src/runtime/NEON/functions/NEActivationLayer.cpp
+++ b/src/runtime/NEON/functions/NEActivationLayer.cpp
@@ -24,24 +24,24 @@
 #include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
 
 #include "arm_compute/core/Validate.h"
+
 #include "src/cpu/operators/CpuActivation.h"
 
 namespace arm_compute
 {
 struct NEActivationLayer::Impl
 {
-    const ITensor                      *src{ nullptr };
-    ITensor                            *dst{ nullptr };
-    IRuntimeContext                    *ctx{ nullptr };
-    std::unique_ptr<cpu::CpuActivation> op{ nullptr };
+    const ITensor                      *src{nullptr};
+    ITensor                            *dst{nullptr};
+    IRuntimeContext                    *ctx{nullptr};
+    std::unique_ptr<cpu::CpuActivation> op{nullptr};
 };
 
-NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx)
-    : _impl(std::make_unique<Impl>())
+NEActivationLayer::NEActivationLayer(IRuntimeContext *ctx) : _impl(std::make_unique<Impl>())
 {
     _impl->ctx = ctx;
 }
-NEActivationLayer::NEActivationLayer(NEActivationLayer &&) = default;
+NEActivationLayer::NEActivationLayer(NEActivationLayer &&)            = default;
 NEActivationLayer &NEActivationLayer::operator=(NEActivationLayer &&) = default;
 NEActivationLayer::~NEActivationLayer()                               = default;
 
@@ -56,7 +56,8 @@
     _impl->op->configure(_impl->src->info(), _impl->dst->info(), activation_info);
 }
 
-Status NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status
+NEActivationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
 {
     return cpu::CpuActivation::validate(input, output, act_info);
 }
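
Note on alignment: the NEActivationLayer.cpp hunk also shows consecutive assignments (the defaulted move operations) having their '=' aligned into a column, and braced initializers losing their inner padding ({ nullptr } becoming {nullptr}). A sketch of matching clang-format 14 options, again assumed rather than quoted:

    # Sketch only; aligned '=' and tight braced lists.
    AlignConsecutiveAssignments: Consecutive
    Cpp11BracedListStyle: true
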
diff --git a/src/runtime/NEON/functions/NEAddMulAdd.cpp b/src/runtime/NEON/functions/NEAddMulAdd.cpp
index cfeaefc..a723647 100644
--- a/src/runtime/NEON/functions/NEAddMulAdd.cpp
+++ b/src/runtime/NEON/functions/NEAddMulAdd.cpp
@@ -25,6 +25,7 @@
 #include "arm_compute/runtime/NEON/functions/NEAddMulAdd.h"
 
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/operators/CpuAddMulAdd.h"
@@ -33,45 +34,50 @@
 {
 struct NEAddMulAdd::Impl
 {
-    std::unique_ptr<cpu::CpuAddMulAdd> op{ nullptr };
+    std::unique_ptr<cpu::CpuAddMulAdd> op{nullptr};
     WorkspaceData<Tensor>              workspace_tensors{};
     ITensorPack                        run_pack{};
     MemoryGroup                        memory_group{};
 };
 
-NEAddMulAdd::NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager)
-    : _impl(std::make_unique<Impl>())
+NEAddMulAdd::NEAddMulAdd(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
 {
     _impl->memory_group = MemoryGroup(std::move(memory_manager));
 }
 
 NEAddMulAdd::~NEAddMulAdd() = default;
 
-void NEAddMulAdd::configure(ITensor *input1, ITensor *input2, ITensor *bn_mul, ITensor *bn_add, ITensor *add_output,
-                            ITensor *final_output, const ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void NEAddMulAdd::configure(ITensor                   *input1,
+                            ITensor                   *input2,
+                            ITensor                   *bn_mul,
+                            ITensor                   *bn_add,
+                            ITensor                   *add_output,
+                            ITensor                   *final_output,
+                            const ConvertPolicy        policy,
+                            const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
 
-    _impl->op           = std::make_unique<cpu::CpuAddMulAdd>();
-    _impl->op->configure(input1->info(), input2->info(), bn_mul->info(),
-                         bn_add->info(), add_output != nullptr ? add_output->info() : nullptr, final_output->info(), policy, act_info);
+    _impl->op = std::make_unique<cpu::CpuAddMulAdd>();
+    _impl->op->configure(input1->info(), input2->info(), bn_mul->info(), bn_add->info(),
+                         add_output != nullptr ? add_output->info() : nullptr, final_output->info(), policy, act_info);
 
-    _impl->run_pack =
-    {
-        { TensorType::ACL_SRC_0, input1 },
-        { TensorType::ACL_SRC_1, input2 },
-        { TensorType::ACL_SRC_2, bn_mul },
-        { TensorType::ACL_SRC_3, bn_add },
-        { TensorType::ACL_DST_0, add_output },
-        { TensorType::ACL_DST_1, final_output },
+    _impl->run_pack = {
+        {TensorType::ACL_SRC_0, input1}, {TensorType::ACL_SRC_1, input2},     {TensorType::ACL_SRC_2, bn_mul},
+        {TensorType::ACL_SRC_3, bn_add}, {TensorType::ACL_DST_0, add_output}, {TensorType::ACL_DST_1, final_output},
     };
 
     _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
 }
 
-Status NEAddMulAdd::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *bn_mul,
-                             const ITensorInfo *bn_add, const ITensorInfo *add_output, const ITensorInfo *final_output,
-                             ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status NEAddMulAdd::validate(const ITensorInfo         *input1,
+                             const ITensorInfo         *input2,
+                             const ITensorInfo         *bn_mul,
+                             const ITensorInfo         *bn_add,
+                             const ITensorInfo         *add_output,
+                             const ITensorInfo         *final_output,
+                             ConvertPolicy              policy,
+                             const ActivationLayerInfo &act_info)
 {
     return cpu::CpuAddMulAdd::validate(input1, input2, bn_mul, bn_add, add_output, final_output, policy, act_info);
 }
diff --git a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
index 3ac127b..fbaf1a9 100644
--- a/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
+++ b/src/runtime/NEON/functions/NEArgMinMaxLayer.cpp
@@ -32,6 +32,7 @@
 #include "arm_compute/runtime/NEON/functions/NECast.h"
 #include "arm_compute/runtime/NEON/functions/NEReductionOperation.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEReductionOperationKernel.h"
 
@@ -48,8 +49,7 @@
 
 NEArgMinMaxLayer::~NEArgMinMaxLayer() = default;
 
-NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _impl(std::make_unique<Impl>())
+NEArgMinMaxLayer::NEArgMinMaxLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
 {
     _impl->memory_manager = std::move(memory_manager);
 }
@@ -58,7 +58,8 @@
 {
     ARM_COMPUTE_LOG_PARAMS(input, axis, output, op);
     _impl->reduction_function = std::make_unique<NEReductionOperation>();
-    if(output->info() && (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64))
+    if (output->info() &&
+        (output->info()->data_type() == DataType::S64 || output->info()->data_type() == DataType::U64))
     {
         _impl->memory_group         = MemoryGroup(std::move(_impl->memory_manager));
         _impl->cast_function        = std::make_unique<NECast>();
@@ -74,9 +75,11 @@
     }
 }
 
-Status NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
+Status
+NEArgMinMaxLayer::validate(const ITensorInfo *input, int axis, const ITensorInfo *output, const ReductionOperation &op)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN, "Invalid operation");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(op != ReductionOperation::ARG_IDX_MAX && op != ReductionOperation::ARG_IDX_MIN,
+                                    "Invalid operation");
     return NEReductionOperation::validate(input, output, axis, op, false);
 }
 
@@ -84,7 +87,7 @@
 {
     MemoryGroupResourceScope scope_mg(_impl->memory_group);
     _impl->reduction_function->run();
-    if(_impl->tmp_reduction_result != nullptr)
+    if (_impl->tmp_reduction_result != nullptr)
     {
         _impl->cast_function->run();
     }
diff --git a/src/runtime/NEON/functions/NEArithmeticAddition.cpp b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
index a7581ca..aff16ae 100644
--- a/src/runtime/NEON/functions/NEArithmeticAddition.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticAddition.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEArithmeticAddition.h"
 
 #include "arm_compute/core/Validate.h"
+
 #include "src/cpu/operators/CpuAdd.h"
 
 #include <utility>
@@ -32,26 +33,33 @@
 {
 struct NEArithmeticAddition::Impl
 {
-    const ITensor               *src_0{ nullptr };
-    const ITensor               *src_1{ nullptr };
-    ITensor                     *dst{ nullptr };
-    std::unique_ptr<cpu::CpuAdd> op{ nullptr };
+    const ITensor               *src_0{nullptr};
+    const ITensor               *src_1{nullptr};
+    ITensor                     *dst{nullptr};
+    std::unique_ptr<cpu::CpuAdd> op{nullptr};
 };
 
-NEArithmeticAddition::NEArithmeticAddition()
-    : _impl(std::make_unique<Impl>())
+NEArithmeticAddition::NEArithmeticAddition() : _impl(std::make_unique<Impl>())
 {
 }
-NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&) = default;
+NEArithmeticAddition::NEArithmeticAddition(NEArithmeticAddition &&)            = default;
 NEArithmeticAddition &NEArithmeticAddition::operator=(NEArithmeticAddition &&) = default;
 NEArithmeticAddition::~NEArithmeticAddition()                                  = default;
 
-Status NEArithmeticAddition::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status NEArithmeticAddition::validate(const ITensorInfo         *input1,
+                                      const ITensorInfo         *input2,
+                                      const ITensorInfo         *output,
+                                      ConvertPolicy              policy,
+                                      const ActivationLayerInfo &act_info)
 {
     return cpu::CpuAdd::validate(input1, input2, output, policy, act_info);
 }
 
-void NEArithmeticAddition::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void NEArithmeticAddition::configure(const ITensor             *input1,
+                                     const ITensor             *input2,
+                                     ITensor                   *output,
+                                     ConvertPolicy              policy,
+                                     const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
diff --git a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
index 6fdd426..097525c 100644
--- a/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
+++ b/src/runtime/NEON/functions/NEArithmeticSubtraction.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEArithmeticSubtraction.h"
 
 #include "arm_compute/core/ITensor.h"
+
 #include "src/cpu/operators/CpuSub.h"
 
 #include <utility>
@@ -32,26 +33,33 @@
 {
 struct NEArithmeticSubtraction::Impl
 {
-    const ITensor               *src_0{ nullptr };
-    const ITensor               *src_1{ nullptr };
-    ITensor                     *dst{ nullptr };
-    std::unique_ptr<cpu::CpuSub> op{ nullptr };
+    const ITensor               *src_0{nullptr};
+    const ITensor               *src_1{nullptr};
+    ITensor                     *dst{nullptr};
+    std::unique_ptr<cpu::CpuSub> op{nullptr};
 };
 
-NEArithmeticSubtraction::NEArithmeticSubtraction()
-    : _impl(std::make_unique<Impl>())
+NEArithmeticSubtraction::NEArithmeticSubtraction() : _impl(std::make_unique<Impl>())
 {
 }
-NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&) = default;
+NEArithmeticSubtraction::NEArithmeticSubtraction(NEArithmeticSubtraction &&)            = default;
 NEArithmeticSubtraction &NEArithmeticSubtraction::operator=(NEArithmeticSubtraction &&) = default;
 NEArithmeticSubtraction::~NEArithmeticSubtraction()                                     = default;
 
-Status NEArithmeticSubtraction::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+Status NEArithmeticSubtraction::validate(const ITensorInfo         *input1,
+                                         const ITensorInfo         *input2,
+                                         const ITensorInfo         *output,
+                                         ConvertPolicy              policy,
+                                         const ActivationLayerInfo &act_info)
 {
     return cpu::CpuSub::validate(input1, input2, output, policy, act_info);
 }
 
-void NEArithmeticSubtraction::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info)
+void NEArithmeticSubtraction::configure(const ITensor             *input1,
+                                        const ITensor             *input2,
+                                        ITensor                   *output,
+                                        ConvertPolicy              policy,
+                                        const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
diff --git a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
index db49f4c..d491f0a 100644
--- a/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchNormalizationLayer.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"
 
@@ -36,12 +37,17 @@
 {
 NEBatchNormalizationLayer::~NEBatchNormalizationLayer() = default;
 
-NEBatchNormalizationLayer::NEBatchNormalizationLayer()
-    : _norm_kernel()
+NEBatchNormalizationLayer::NEBatchNormalizationLayer() : _norm_kernel()
 {
 }
 
-void NEBatchNormalizationLayer::configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon,
+void NEBatchNormalizationLayer::configure(ITensor            *input,
+                                          ITensor            *output,
+                                          const ITensor      *mean,
+                                          const ITensor      *var,
+                                          const ITensor      *beta,
+                                          const ITensor      *gamma,
+                                          float               epsilon,
                                           ActivationLayerInfo act_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, mean, var, beta, gamma, epsilon, act_info);
@@ -50,10 +56,17 @@
     _norm_kernel->configure(input, output, mean, var, beta, gamma, epsilon, act_info);
 }
 
-Status NEBatchNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta, const ITensorInfo *gamma,
-                                           float epsilon, ActivationLayerInfo act_info)
+Status NEBatchNormalizationLayer::validate(const ITensorInfo  *input,
+                                           const ITensorInfo  *output,
+                                           const ITensorInfo  *mean,
+                                           const ITensorInfo  *var,
+                                           const ITensorInfo  *beta,
+                                           const ITensorInfo  *gamma,
+                                           float               epsilon,
+                                           ActivationLayerInfo act_info)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEBatchNormalizationLayerKernel::validate(input, output, mean, var, beta, gamma, epsilon, act_info));
     return Status{};
 }
 
diff --git a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
index e258028..5d711c5 100644
--- a/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEBatchToSpaceLayer.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEBatchToSpaceLayerKernel.h"
 
@@ -41,19 +42,25 @@
     _kernel = std::move(k);
 }
 
-void NEBatchToSpaceLayer::configure(const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
+void NEBatchToSpaceLayer::configure(
+    const ITensor *input, int32_t block_shape_x, int32_t block_shape_y, ITensor *output, const CropInfo &crop_info)
 {
     auto k = std::make_unique<NEBatchToSpaceLayerKernel>();
     k->configure(input, block_shape_x, block_shape_y, output, crop_info);
     _kernel = std::move(k);
 }
 
-Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
+Status
+NEBatchToSpaceLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *output)
 {
     return NEBatchToSpaceLayerKernel::validate(input, block_shape, output);
 }
 
-Status NEBatchToSpaceLayer::validate(const ITensorInfo *input, int32_t block_shape_x, int32_t block_shape_y, const ITensorInfo *output, const CropInfo &crop_info)
+Status NEBatchToSpaceLayer::validate(const ITensorInfo *input,
+                                     int32_t            block_shape_x,
+                                     int32_t            block_shape_y,
+                                     const ITensorInfo *output,
+                                     const CropInfo    &crop_info)
 {
     return NEBatchToSpaceLayerKernel::validate(input, block_shape_x, block_shape_y, output, crop_info);
 }
diff --git a/src/runtime/NEON/functions/NEBitwiseAnd.cpp b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
index 90eb727..89ce208 100644
--- a/src/runtime/NEON/functions/NEBitwiseAnd.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseAnd.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBitwiseAnd.h"
 
-#include "src/core/NEON/kernels/NEBitwiseAndKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEBitwiseAndKernel.h"
 
 #include <utility>
 
diff --git a/src/runtime/NEON/functions/NEBitwiseNot.cpp b/src/runtime/NEON/functions/NEBitwiseNot.cpp
index 69e5288..eda59cd 100644
--- a/src/runtime/NEON/functions/NEBitwiseNot.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseNot.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBitwiseNot.h"
 
-#include "src/core/NEON/kernels/NEBitwiseNotKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEBitwiseNotKernel.h"
 
 #include <utility>
 
diff --git a/src/runtime/NEON/functions/NEBitwiseOr.cpp b/src/runtime/NEON/functions/NEBitwiseOr.cpp
index 0b19e91..3d6f30b 100644
--- a/src/runtime/NEON/functions/NEBitwiseOr.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseOr.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBitwiseOr.h"
 
-#include "src/core/NEON/kernels/NEBitwiseOrKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEBitwiseOrKernel.h"
 
 #include <utility>
 
diff --git a/src/runtime/NEON/functions/NEBitwiseXor.cpp b/src/runtime/NEON/functions/NEBitwiseXor.cpp
index cc9df9f..f0cf3d3 100644
--- a/src/runtime/NEON/functions/NEBitwiseXor.cpp
+++ b/src/runtime/NEON/functions/NEBitwiseXor.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEBitwiseXor.h"
 
-#include "src/core/NEON/kernels/NEBitwiseXorKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEBitwiseXorKernel.h"
 
 #include <utility>
 
diff --git a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
index af00171..adf891e 100644
--- a/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
+++ b/src/runtime/NEON/functions/NEBoundingBoxTransform.cpp
@@ -22,12 +22,16 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/NEON/functions/NEBoundingBoxTransform.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEBoundingBoxTransformKernel.h"
 
 namespace arm_compute
 {
-void NEBoundingBoxTransform::configure(const ITensor *boxes, ITensor *pred_boxes, const ITensor *deltas, const BoundingBoxTransformInfo &info)
+void NEBoundingBoxTransform::configure(const ITensor                  *boxes,
+                                       ITensor                        *pred_boxes,
+                                       const ITensor                  *deltas,
+                                       const BoundingBoxTransformInfo &info)
 {
     ARM_COMPUTE_LOG_PARAMS(boxes, pred_boxes, deltas, info);
     // Configure Bounding Box kernel
@@ -36,7 +40,10 @@
     _kernel = std::move(k);
 }
 
-Status NEBoundingBoxTransform::validate(const ITensorInfo *boxes, const ITensorInfo *pred_boxes, const ITensorInfo *deltas, const BoundingBoxTransformInfo &info)
+Status NEBoundingBoxTransform::validate(const ITensorInfo              *boxes,
+                                        const ITensorInfo              *pred_boxes,
+                                        const ITensorInfo              *deltas,
+                                        const BoundingBoxTransformInfo &info)
 {
     return NEBoundingBoxTransformKernel::validate(boxes, pred_boxes, deltas, info);
 }
diff --git a/src/runtime/NEON/functions/NECast.cpp b/src/runtime/NEON/functions/NECast.cpp
index f93a6ea..1fd172a 100644
--- a/src/runtime/NEON/functions/NECast.cpp
+++ b/src/runtime/NEON/functions/NECast.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NECast.h"
 
 #include "arm_compute/core/Validate.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/operators/CpuCast.h"
 
@@ -31,16 +32,15 @@
 {
 struct NECast::Impl
 {
-    const ITensor                *src{ nullptr };
-    ITensor                      *dst{ nullptr };
-    std::unique_ptr<cpu::CpuCast> op{ nullptr };
+    const ITensor                *src{nullptr};
+    ITensor                      *dst{nullptr};
+    std::unique_ptr<cpu::CpuCast> op{nullptr};
 };
 
-NECast::NECast()
-    : _impl(std::make_unique<Impl>())
+NECast::NECast() : _impl(std::make_unique<Impl>())
 {
 }
-NECast::NECast(NECast &&) = default;
+NECast::NECast(NECast &&)            = default;
 NECast &NECast::operator=(NECast &&) = default;
 NECast::~NECast()                    = default;
 
@@ -62,7 +62,7 @@
 
 void NECast::run()
 {
-    ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+    ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
     _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
index 8b96fad..86bee4d 100644
--- a/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
+++ b/src/runtime/NEON/functions/NEChannelShuffleLayer.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEChannelShuffleLayer.h"
 
 #include "arm_compute/core/Types.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEChannelShuffleLayerKernel.h"
 
diff --git a/src/runtime/NEON/functions/NEConcatenateLayer.cpp b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
index ceb697a..59a0892 100644
--- a/src/runtime/NEON/functions/NEConcatenateLayer.cpp
+++ b/src/runtime/NEON/functions/NEConcatenateLayer.cpp
@@ -23,33 +23,31 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEConcatenateLayer.h"
 
-#include "src/cpu/operators/CpuConcatenate.h"
-
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
+#include "src/cpu/operators/CpuConcatenate.h"
 
 namespace arm_compute
 {
 struct NEConcatenateLayer::Impl
 {
     std::vector<const ITensor *>         srcs{};
-    ITensor                             *dst{ nullptr };
-    unsigned int                         num_inputs{ 0 };
-    unsigned int                         axis{ 0 };
-    std::unique_ptr<cpu::CpuConcatenate> op{ nullptr };
+    ITensor                             *dst{nullptr};
+    unsigned int                         num_inputs{0};
+    unsigned int                         axis{0};
+    std::unique_ptr<cpu::CpuConcatenate> op{nullptr};
 };
 
-NEConcatenateLayer::NEConcatenateLayer()
-    : _impl(std::make_unique<Impl>())
+NEConcatenateLayer::NEConcatenateLayer() : _impl(std::make_unique<Impl>())
 {
 }
-NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&) = default;
+NEConcatenateLayer::NEConcatenateLayer(NEConcatenateLayer &&)            = default;
 NEConcatenateLayer &NEConcatenateLayer::operator=(NEConcatenateLayer &&) = default;
 NEConcatenateLayer::~NEConcatenateLayer()                                = default;
 
@@ -64,7 +62,7 @@
     _impl->op         = std::make_unique<cpu::CpuConcatenate>();
 
     std::vector<const ITensorInfo *> inputs_vector_info;
-    for(unsigned int i = 0; i < inputs_vector.size(); ++i)
+    for (unsigned int i = 0; i < inputs_vector.size(); ++i)
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR(inputs_vector.at(i));
         inputs_vector_info.emplace_back(inputs_vector.at(i)->info());
@@ -72,7 +70,9 @@
     _impl->op->configure(inputs_vector_info, _impl->dst->info(), axis);
 }
 
-Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector, const ITensorInfo *output, size_t axis)
+Status NEConcatenateLayer::validate(const std::vector<const ITensorInfo *> &inputs_vector,
+                                    const ITensorInfo                      *output,
+                                    size_t                                  axis)
 {
     return cpu::CpuConcatenate::validate(inputs_vector, output, axis);
 }
@@ -80,7 +80,7 @@
 void NEConcatenateLayer::run()
 {
     ITensorPack pack;
-    for(unsigned i = 0; i < _impl->num_inputs; ++i)
+    for (unsigned i = 0; i < _impl->num_inputs; ++i)
     {
         pack.add_tensor(TensorType::ACL_SRC_VEC + i, _impl->srcs.at(i));
     }
diff --git a/src/runtime/NEON/functions/NEConv3D.cpp b/src/runtime/NEON/functions/NEConv3D.cpp
index 3bb66c4..8f41151 100644
--- a/src/runtime/NEON/functions/NEConv3D.cpp
+++ b/src/runtime/NEON/functions/NEConv3D.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/operators/CpuDirectConv3d.h"
 
@@ -35,35 +36,41 @@
 
 struct NEConv3D::Impl
 {
-    std::unique_ptr<cpu::ICpuOperator> op{ nullptr };
+    std::unique_ptr<cpu::ICpuOperator> op{nullptr};
     ITensorPack                        run_pack{};
 };
 
-NEConv3D::NEConv3D()
-    : _impl(std::make_unique<Impl>())
+NEConv3D::NEConv3D() : _impl(std::make_unique<Impl>())
 {
 }
 
 NEConv3D::~NEConv3D() = default;
 
-void NEConv3D::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info)
+void NEConv3D::configure(
+    ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv3dInfo &conv_info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info));
+    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuDirectConv3d::validate(
+        input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info));
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info);
 
     auto f = std::make_unique<cpu::CpuDirectConv3d>();
-    f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info);
+    f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(),
+                 conv_info);
     _impl->op = std::move(f);
 
-    if(_impl->op != nullptr)
+    if (_impl->op != nullptr)
     {
-        _impl->run_pack = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
+        _impl->run_pack = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
     }
 }
 
-Status NEConv3D::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv3dInfo &conv_info)
+Status NEConv3D::validate(const ITensorInfo *input,
+                          const ITensorInfo *weights,
+                          const ITensorInfo *biases,
+                          const ITensorInfo *output,
+                          const Conv3dInfo  &conv_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuDirectConv3d::validate(input, weights, biases, output, conv_info));
 
@@ -72,7 +79,7 @@
 
 void NEConv3D::run()
 {
-    if(_impl->op != nullptr)
+    if (_impl->op != nullptr)
     {
         _impl->op->run(_impl->run_pack);
     }
diff --git a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
index 535ac99..84e8565 100644
--- a/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
+++ b/src/runtime/NEON/functions/NEConvertFullyConnectedWeights.cpp
@@ -24,24 +24,26 @@
 #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
 
 #include "arm_compute/core/Validate.h"
+
 #include "src/cpu/operators/CpuConvertFullyConnectedWeights.h"
 
 namespace arm_compute
 {
 struct NEConvertFullyConnectedWeights::Impl
 {
-    const ITensor                                        *src{ nullptr };
-    ITensor                                              *dst{ nullptr };
-    std::unique_ptr<cpu::CpuConvertFullyConnectedWeights> op{ nullptr };
+    const ITensor                                        *src{nullptr};
+    ITensor                                              *dst{nullptr};
+    std::unique_ptr<cpu::CpuConvertFullyConnectedWeights> op{nullptr};
 };
-NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights()
-    : _impl(std::make_unique<Impl>())
+NEConvertFullyConnectedWeights::NEConvertFullyConnectedWeights() : _impl(std::make_unique<Impl>())
 {
 }
 NEConvertFullyConnectedWeights::~NEConvertFullyConnectedWeights() = default;
 
-void NEConvertFullyConnectedWeights::configure(const ITensor *input, ITensor *output, const TensorShape &original_input_shape,
-                                               DataLayout data_layout)
+void NEConvertFullyConnectedWeights::configure(const ITensor     *input,
+                                               ITensor           *output,
+                                               const TensorShape &original_input_shape,
+                                               DataLayout         data_layout)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
@@ -51,8 +53,10 @@
     _impl->op->configure(_impl->src->info(), _impl->dst->info(), original_input_shape, data_layout);
 }
 
-Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input, const ITensorInfo *output, const TensorShape &original_input_shape,
-                                                DataLayout data_layout)
+Status NEConvertFullyConnectedWeights::validate(const ITensorInfo *input,
+                                                const ITensorInfo *output,
+                                                const TensorShape &original_input_shape,
+                                                DataLayout         data_layout)
 {
     return cpu::CpuConvertFullyConnectedWeights::validate(input, output, original_input_shape, data_layout);
 }
@@ -64,4 +68,4 @@
     pack.add_tensor(TensorType::ACL_DST, _impl->dst);
     _impl->op->run(pack);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 89e0e49..37958fc 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/functions/NEFFTConvolutionLayer.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/operators/CpuConv2d.h"
@@ -43,34 +44,44 @@
 {
     MemoryGroup                        memory_group{};
     std::shared_ptr<IMemoryManager>    memory_manager{};
-    std::unique_ptr<cpu::ICpuOperator> op{ nullptr };
+    std::unique_ptr<cpu::ICpuOperator> op{nullptr};
     ITensorPack                        run_pack{};
     ITensorPack                        prep_pack{};
     WorkspaceData<Tensor>              workspace{};
     experimental::MemoryRequirements   aux_mem_req{};
-    std::unique_ptr<IFunction>         func{ nullptr };
+    std::unique_ptr<IFunction>         func{nullptr};
 };
 
-NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _impl(std::make_unique<Impl>())
+NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
 {
     _impl->memory_manager = std::move(memory_manager);
 }
 
 NEConvolutionLayer::~NEConvolutionLayer() = default;
 
-void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                   const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void NEConvolutionLayer::configure(ITensor                   *input,
+                                   const ITensor             *weights,
+                                   const ITensor             *biases,
+                                   ITensor                   *output,
+                                   const PadStrideInfo       &conv_info,
+                                   const WeightsInfo         &weights_info,
+                                   const Size2D              &dilation,
+                                   const ActivationLayerInfo &act_info,
+                                   bool                       enable_fast_math,
+                                   unsigned int               num_groups)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_UNUSED(num_groups);
-    ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
-                                                            enable_fast_math, num_groups));
-    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+    ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(
+        input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info,
+        weights_info, dilation, act_info, enable_fast_math, num_groups));
+    ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+                           enable_fast_math, num_groups);
 
     const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
-    switch(cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math))
+    switch (cpu::CpuConv2d::get_convolution_method(input->info(), weights->info(), output->info(), conv_info,
+                                                   weights_info, dilation, act_info, enable_fast_math))
     {
         case ConvolutionMethod::WINOGRAD:
         case ConvolutionMethod::GEMM:
@@ -78,7 +89,8 @@
         case ConvolutionMethod::DIRECT:
         {
             auto f = std::make_unique<cpu::CpuConv2d>();
-            f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+            f->configure(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr),
+                         output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
             _impl->op = std::move(f);
             break;
         }
@@ -94,33 +106,46 @@
             break;
     }
 
-    if(_impl->op)
+    if (_impl->op)
     {
         _impl->memory_group = MemoryGroup(std::move(_impl->memory_manager));
         _impl->aux_mem_req  = _impl->op->workspace();
-        _impl->run_pack     = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
-        _impl->prep_pack    = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
-        _impl->workspace    = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+        _impl->run_pack     = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+        _impl->prep_pack    = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+        _impl->workspace =
+            manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
     }
 }
 
-Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                    const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status NEConvolutionLayer::validate(const ITensorInfo         *input,
+                                    const ITensorInfo         *weights,
+                                    const ITensorInfo         *biases,
+                                    const ITensorInfo         *output,
+                                    const PadStrideInfo       &conv_info,
+                                    const WeightsInfo         &weights_info,
+                                    const Size2D              &dilation,
+                                    const ActivationLayerInfo &act_info,
+                                    bool                       enable_fast_math,
+                                    unsigned int               num_groups)
 {
     const Conv2dInfo info(conv_info, dilation, act_info, enable_fast_math, num_groups);
 
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(!weights->are_values_constant(), "Dynamic weights are not supported");
 
-    switch(cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math))
+    switch (cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+                                                   enable_fast_math))
     {
         case ConvolutionMethod::WINOGRAD:
         case ConvolutionMethod::GEMM:
         case ConvolutionMethod::GEMM_CONV2D:
         case ConvolutionMethod::DIRECT:
-            ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups));
+            ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuConv2d::validate(input, weights, biases, output, conv_info,
+                                                                 weights_info, dilation, act_info, enable_fast_math,
+                                                                 num_groups));
             break;
         case ConvolutionMethod::FFT:
-            ARM_COMPUTE_RETURN_ON_ERROR(NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                NEFFTConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info));
             break;
         default:
             ARM_COMPUTE_ERROR("Not supported.");
@@ -129,12 +154,17 @@
     return Status{};
 }
 
-ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
-                                                             const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                                             const WeightsInfo &weights_info, const Size2D &dilation,
-                                                             const ActivationLayerInfo &act_info, bool enable_fast_math)
+ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo         *input,
+                                                             const ITensorInfo         *weights,
+                                                             const ITensorInfo         *output,
+                                                             const PadStrideInfo       &conv_info,
+                                                             const WeightsInfo         &weights_info,
+                                                             const Size2D              &dilation,
+                                                             const ActivationLayerInfo &act_info,
+                                                             bool                       enable_fast_math)
 {
-    return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info, enable_fast_math);
+    return cpu::CpuConv2d::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info,
+                                                  enable_fast_math);
 }
 
 void NEConvolutionLayer::run()
@@ -143,7 +173,7 @@
 
     MemoryGroupResourceScope scope_mg(_impl->memory_group);
 
-    if(_impl->func)
+    if (_impl->func)
     {
         _impl->func->run();
     }
@@ -155,7 +185,7 @@
 
 void NEConvolutionLayer::prepare()
 {
-    if(_impl->func)
+    if (_impl->func)
     {
         _impl->func->prepare();
     }
diff --git a/src/runtime/NEON/functions/NECopy.cpp b/src/runtime/NEON/functions/NECopy.cpp
index c2059e8..c975d3a 100644
--- a/src/runtime/NEON/functions/NECopy.cpp
+++ b/src/runtime/NEON/functions/NECopy.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NECopy.h"
 
 #include "arm_compute/core/Validate.h"
+
 #include "src/cpu/operators/CpuCopy.h"
 
 #include <utility>
@@ -32,16 +33,15 @@
 {
 struct NECopy::Impl
 {
-    const ITensor                *src{ nullptr };
-    ITensor                      *dst{ nullptr };
-    std::unique_ptr<cpu::CpuCopy> op{ nullptr };
+    const ITensor                *src{nullptr};
+    ITensor                      *dst{nullptr};
+    std::unique_ptr<cpu::CpuCopy> op{nullptr};
 };
 
-NECopy::NECopy()
-    : _impl(std::make_unique<Impl>())
+NECopy::NECopy() : _impl(std::make_unique<Impl>())
 {
 }
-NECopy::NECopy(NECopy &&) = default;
+NECopy::NECopy(NECopy &&)            = default;
 NECopy &NECopy::operator=(NECopy &&) = default;
 NECopy::~NECopy()                    = default;
 
diff --git a/src/runtime/NEON/functions/NECropResize.cpp b/src/runtime/NEON/functions/NECropResize.cpp
index cca8b40..a94b088 100644
--- a/src/runtime/NEON/functions/NECropResize.cpp
+++ b/src/runtime/NEON/functions/NECropResize.cpp
@@ -21,10 +21,11 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
 #include "arm_compute/runtime/NEON/functions/NECropResize.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NECropKernel.h"
 
@@ -35,18 +36,32 @@
 NECropResize::~NECropResize() = default;
 
 NECropResize::NECropResize()
-    : _output(nullptr), _num_boxes(0), _method(), _extrapolation_value(0), _crop(), _scale(), _crop_results(), _scaled_results()
+    : _output(nullptr),
+      _num_boxes(0),
+      _method(),
+      _extrapolation_value(0),
+      _crop(),
+      _scale(),
+      _crop_results(),
+      _scaled_results()
 {
 }
 
-Status NECropResize::validate(const ITensorInfo *input, const ITensorInfo *boxes, const ITensorInfo *box_ind, const ITensorInfo *output,
-                              Coordinates2D crop_size, InterpolationPolicy method, float extrapolation_value)
+Status NECropResize::validate(const ITensorInfo  *input,
+                              const ITensorInfo  *boxes,
+                              const ITensorInfo  *box_ind,
+                              const ITensorInfo  *output,
+                              Coordinates2D       crop_size,
+                              InterpolationPolicy method,
+                              float               extrapolation_value)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(crop_size.x <= 0 || crop_size.y <= 0);
     ARM_COMPUTE_RETURN_ERROR_ON(method == InterpolationPolicy::AREA);
     TensorInfo temp_info;
-    ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(), box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1, extrapolation_value));
-    if(output->total_size() > 0)
+    ARM_COMPUTE_RETURN_ON_ERROR(NECropKernel::validate(input->clone().get(), boxes->clone().get(),
+                                                       box_ind->clone().get(), &temp_info, boxes->tensor_shape()[1] - 1,
+                                                       extrapolation_value));
+    if (output->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
@@ -56,11 +71,17 @@
     return Status{};
 }
 
-void NECropResize::configure(const ITensor *input, const ITensor *boxes, const ITensor *box_ind, ITensor *output, Coordinates2D crop_size,
-                             InterpolationPolicy method, float extrapolation_value)
+void NECropResize::configure(const ITensor      *input,
+                             const ITensor      *boxes,
+                             const ITensor      *box_ind,
+                             ITensor            *output,
+                             Coordinates2D       crop_size,
+                             InterpolationPolicy method,
+                             float               extrapolation_value)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(), crop_size, method, extrapolation_value));
+    ARM_COMPUTE_ERROR_THROW_ON(NECropResize::validate(input->info(), boxes->info(), box_ind->info(), output->info(),
+                                                      crop_size, method, extrapolation_value));
     ARM_COMPUTE_LOG_PARAMS(input, boxes, box_ind, output, crop_size, method, extrapolation_value);
 
     _num_boxes = boxes->info()->tensor_shape()[1];
@@ -81,7 +102,7 @@
     _scaled_results.reserve(_num_boxes);
     _scale.reserve(_num_boxes);
 
-    for(unsigned int i = 0; i < _num_boxes; ++i)
+    for (unsigned int i = 0; i < _num_boxes; ++i)
     {
         auto       crop_tensor = std::make_unique<Tensor>();
         TensorInfo crop_result_info(1, DataType::F32);
@@ -108,7 +129,7 @@
 {
     ARM_COMPUTE_ERROR_ON_MSG(_output == nullptr, "Unconfigured function");
 
-    for(unsigned int i = 0; i < _num_boxes; ++i)
+    for (unsigned int i = 0; i < _num_boxes; ++i)
     {
         // Size of the crop box in _boxes and thus the shape of _crop_results[i]
         // may not be known until run-time and so the kernels cannot be configured until then.
@@ -117,12 +138,15 @@
         NEScheduler::get().schedule(_crop[i].get(), Window::DimZ);
 
         // Scale the cropped image.
-        _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(), ScaleKernelInfo{ _method, BorderMode::CONSTANT, PixelValue(_extrapolation_value), SamplingPolicy::TOP_LEFT, false });
+        _scale[i]->configure(_crop_results[i].get(), _scaled_results[i].get(),
+                             ScaleKernelInfo{_method, BorderMode::CONSTANT, PixelValue(_extrapolation_value),
+                                             SamplingPolicy::TOP_LEFT, false});
         _scaled_results[i]->allocator()->allocate();
         _scale[i]->run();
 
         // Copy scaled image into output.
-        std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(), _output->ptr_to_element(Coordinates(0, 0, 0, i)));
+        std::copy_n(_scaled_results[i]->buffer(), _scaled_results[i]->info()->total_size(),
+                    _output->ptr_to_element(Coordinates(0, 0, 0, i)));
     }
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 439aff0..3987370 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -25,9 +25,10 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
@@ -61,7 +62,8 @@
     deconv_pad_top += deconv_pad_y / 2;
     deconv_pad_bottom += deconv_pad_y / 2;
 
-    return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom, DimensionRoundingType::FLOOR);
+    return PadStrideInfo(stride_x, stride_y, deconv_pad_left, deconv_pad_right, deconv_pad_top, deconv_pad_bottom,
+                         DimensionRoundingType::FLOOR);
 }
 
 } // namespace
@@ -82,17 +84,24 @@
 {
 }
 
-Status NEDeconvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &info,
-                                      bool enable_fast_math, const WeightsInfo &weights_info)
+Status NEDeconvolutionLayer::validate(const ITensorInfo   *input,
+                                      const ITensorInfo   *weights,
+                                      const ITensorInfo   *bias,
+                                      const ITensorInfo   *output,
+                                      const PadStrideInfo &info,
+                                      bool                 enable_fast_math,
+                                      const WeightsInfo   &weights_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
-    const unsigned int width_idx  = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
-    const unsigned int height_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16, DataType::QASYMM8,
+                                                         DataType::QASYMM8_SIGNED);
+    const unsigned int width_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::WIDTH);
+    const unsigned int height_idx =
+        get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::HEIGHT);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(height_idx) < 1);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(weights, input);
-    if(is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type()))
+    if (is_data_type_quantized_per_channel(weights->data_type()) && is_data_type_quantized(input->data_type()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QSYMM8_PER_CHANNEL);
     }
@@ -101,11 +110,13 @@
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     }
 
-    auto out_dims = deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx), weights->dimension(width_idx), weights->dimension(height_idx), info);
+    auto out_dims =
+        deconvolution_output_dimensions(input->dimension(width_idx), input->dimension(height_idx),
+                                        weights->dimension(width_idx), weights->dimension(height_idx), info);
 
-    if(bias != nullptr)
+    if (bias != nullptr)
     {
-        if(is_data_type_quantized_asymmetric(input->data_type()))
+        if (is_data_type_quantized_asymmetric(input->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32);
         }
@@ -115,15 +126,18 @@
         }
     }
 
-    if(output->tensor_shape().total_size() > 0)
+    if (output->tensor_shape().total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
 
         const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input, *weights);
 
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(), "Output's width is invalid.");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(), "Output's height is invalid.");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(), "Output's depth is invalid.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimX) != output_shape.x(),
+                                        "Output's width is invalid.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimY) != output_shape.y(),
+                                        "Output's height is invalid.");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(Window::DimZ) != output_shape.z(),
+                                        "Output's depth is invalid.");
     }
 
     uint32_t           deconv_pad_x = 0;
@@ -141,44 +155,61 @@
     ARM_COMPUTE_RETURN_ERROR_ON((out_x - weights->dimension(idx_w) + 1) > out_dims.first);
     ARM_COMPUTE_RETURN_ERROR_ON((out_y - weights->dimension(idx_h) + 1) > out_dims.second);
 
-    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
-    TensorInfo          scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(*input, *weights, stride_x, stride_y,
+                                                                              out_dims, deconv_pad_x, deconv_pad_y);
+    TensorInfo scale_out_info(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(scale_out_shape));
     const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
 
     // Do not perform upsampling when the operation uses unit stride in all dimensions
     const bool do_upsampling = stride_x != 1 || stride_y != 1;
 
-    const unsigned int batches_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
-    const unsigned int channel_idx = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
+    const unsigned int batches_idx =
+        get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
+    const unsigned int channel_idx =
+        get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::CHANNEL);
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(batches_idx) != scale_out_info.dimension(batches_idx));
     ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(channel_idx) != scale_out_info.dimension(channel_idx));
 
-    if(do_upsampling)
+    if (do_upsampling)
     {
         const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(&scale_out_info, weights, bias, output, conv_info,
+                                                                 weights_info, Size2D(1U, 1U), ActivationLayerInfo(),
+                                                                 enable_fast_math));
     }
     else
     {
-        const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
-        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math));
+        const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(),
+                                      upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
+        ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayer::validate(input, weights, bias, output, conv_info, weights_info,
+                                                                 Size2D(1U, 1U), ActivationLayerInfo(),
+                                                                 enable_fast_math));
     }
 
     return Status{};
 }
 
-void NEDeconvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &info, bool enable_fast_math, const WeightsInfo &weights_info)
+void NEDeconvolutionLayer::configure(ITensor             *input,
+                                     const ITensor       *weights,
+                                     const ITensor       *bias,
+                                     ITensor             *output,
+                                     const PadStrideInfo &info,
+                                     bool                 enable_fast_math,
+                                     const WeightsInfo   &weights_info)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info, enable_fast_math, weights_info));
+    ARM_COMPUTE_ERROR_THROW_ON(NEDeconvolutionLayer::validate(input->info(), weights->info(),
+                                                              (bias == nullptr) ? nullptr : bias->info(),
+                                                              output->info(), info, enable_fast_math, weights_info));
     ARM_COMPUTE_LOG_PARAMS(input, weights, bias, output, info, enable_fast_math, weights_info);
 
     const DataLayout   data_layout = input->info()->data_layout();
     const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    auto               out_dims    = deconvolution_output_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx),
-                                                                     weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info);
+    auto               out_dims    = deconvolution_output_dimensions(
+                         input->info()->dimension(width_idx), input->info()->dimension(height_idx),
+                         weights->info()->dimension(width_idx), weights->info()->dimension(height_idx), info);
 
     const TensorShape output_shape = compute_deconvolution_output_shape(out_dims, *input->info(), *weights->info());
 
@@ -191,7 +222,8 @@
     const unsigned int stride_y = info.stride().second;
 
     // Output auto initialization if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(),
+                       input->info()->quantization_info());
 
     _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
 
@@ -199,12 +231,11 @@
     _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
 
     // setup the function to convolve the upscaled output
-    uint32_t            deconv_pad_x    = 0;
-    uint32_t            deconv_pad_y    = 0;
-    const TensorShape   scale_out_shape = compute_deconvolution_upsampled_shape(*input->info(), *weights->info(),
-                                                                                stride_x, stride_y,
-                                                                                out_dims, deconv_pad_x, deconv_pad_y);
-    const PadStrideInfo upsample_info   = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
+    uint32_t          deconv_pad_x    = 0;
+    uint32_t          deconv_pad_y    = 0;
+    const TensorShape scale_out_shape = compute_deconvolution_upsampled_shape(
+        *input->info(), *weights->info(), stride_x, stride_y, out_dims, deconv_pad_x, deconv_pad_y);
+    const PadStrideInfo upsample_info = compute_upsample_info(info, deconv_pad_x, deconv_pad_y);
 
     // Do not perform upsampling when the operation uses unit stride in all dimensions
     _do_upsampling = stride_x != 1 || stride_y != 1;
@@ -216,12 +247,12 @@
     axis_data[1]   = static_cast<uint32_t>(height_idx);
 
     // Setup convolution and upsampling, if needed
-    if(_do_upsampling)
+    if (_do_upsampling)
     {
         _memory_group.manage(&_scaled_output);
 
         const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
-        TensorInfo          scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
+        TensorInfo scale_out_info(scale_out_shape, 1, input->info()->data_type(), input->info()->quantization_info());
         scale_out_info.set_data_layout(data_layout);
         _scaled_output.allocator()->init(scale_out_info);
 
@@ -229,14 +260,17 @@
         // The padding amount can be given as input to the convolution layer.
         _upsample_f.configure(input, &_scaled_output, upsample_info);
 
-        _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math);
+        _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U),
+                          ActivationLayerInfo(), enable_fast_math);
 
         _scaled_output.allocator()->allocate();
     }
     else
     {
-        const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(), upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
-        _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U), ActivationLayerInfo(), enable_fast_math);
+        const PadStrideInfo conv_info(1, 1, upsample_info.pad_left(), upsample_info.pad_right(),
+                                      upsample_info.pad_top(), upsample_info.pad_bottom(), DimensionRoundingType::CEIL);
+        _conv_f.configure(input, &_weights_flipped, bias, output, conv_info, weights_info, Size2D(1U, 1U),
+                          ActivationLayerInfo(), enable_fast_math);
     }
 }
 
@@ -246,7 +280,7 @@
 
     MemoryGroupResourceScope scope_mg(_memory_group);
 
-    if(_do_upsampling)
+    if (_do_upsampling)
     {
         _upsample_f.run();
     }
@@ -255,7 +289,7 @@
 
 void NEDeconvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
 
diff --git a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
index 1ec3207..766635d 100644
--- a/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthConvertLayer.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEDepthConvertLayer.h"
 
 #include "arm_compute/core/Validate.h"
+
 #include "src/cpu/operators/CpuCast.h"
 
 #include <utility>
@@ -32,16 +33,15 @@
 {
 struct NEDepthConvertLayer::Impl
 {
-    const ITensor                *src{ nullptr };
-    ITensor                      *dst{ nullptr };
-    std::unique_ptr<cpu::CpuCast> op{ nullptr };
+    const ITensor                *src{nullptr};
+    ITensor                      *dst{nullptr};
+    std::unique_ptr<cpu::CpuCast> op{nullptr};
 };
 
-NEDepthConvertLayer::NEDepthConvertLayer()
-    : _impl(std::make_unique<Impl>())
+NEDepthConvertLayer::NEDepthConvertLayer() : _impl(std::make_unique<Impl>())
 {
 }
-NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&) = default;
+NEDepthConvertLayer::NEDepthConvertLayer(NEDepthConvertLayer &&)            = default;
 NEDepthConvertLayer &NEDepthConvertLayer::operator=(NEDepthConvertLayer &&) = default;
 NEDepthConvertLayer::~NEDepthConvertLayer()                                 = default;
 
@@ -59,7 +59,8 @@
     _impl->op->configure(_impl->src->info(), _impl->dst->info(), policy);
 }
 
-Status NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
+Status
+NEDepthConvertLayer::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(shift != 0);
     return cpu::CpuCast::validate(input, output, policy);
@@ -67,7 +68,7 @@
 
 void NEDepthConvertLayer::run()
 {
-    ITensorPack pack = { { ACL_SRC, _impl->src }, { ACL_DST, _impl->dst } };
+    ITensorPack pack = {{ACL_SRC, _impl->src}, {ACL_DST, _impl->dst}};
     _impl->op->run(pack);
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
index f4a8a17..4756405 100644
--- a/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthToSpaceLayer.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEDepthToSpaceLayerKernel.h"
 
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 4dabef3..6c08564 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/operators/CpuDepthwiseConv2d.h"
 
@@ -39,38 +40,35 @@
 
 struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::Impl
 {
-    ITensor       *src{ nullptr }; // SRC_0
-    ITensor       *dst{ nullptr }; // DST_0
-    const ITensor *weights
-    {
-        nullptr
-    }; // SRC_1
-    const ITensor *biases
-    {
-        nullptr
-    };                                                           // SRC_2
+    ITensor                                 *src{nullptr};       // SRC_0
+    ITensor                                 *dst{nullptr};       // DST_0
+    const ITensor                           *weights{nullptr};   // SRC_1
+    const ITensor                           *biases{nullptr};    // SRC_2
     Tensor                                   permuted_input{};   // INT_0
     Tensor                                   permuted_weights{}; // INT_1
     Tensor                                   permuted_output{};  // INT_2
     Tensor                                   workspace{};        // INT_3
     Tensor                                   packed_weights{};   // INT_4
-    std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
-    bool                                     is_prepared{ false };
-    bool                                     permute{ false };
+    std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr};
+    bool                                     is_prepared{false};
+    bool                                     permute{false};
 };
 
-NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(std::shared_ptr<IMemoryManager> memory_manager)
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::NEDepthwiseConvolutionLayerOptimizedInternal(
+    std::shared_ptr<IMemoryManager> memory_manager)
     : _memory_group(memory_manager), _impl(std::make_unique<Impl>())
 {
 }
 
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(ITensor       *input,
-                                                                                          const ITensor *weights,
-                                                                                          const ITensor *biases,
-                                                                                          ITensor *output, const PadStrideInfo &conv_info,
-                                                                                          unsigned int               depth_multiplier,
-                                                                                          const ActivationLayerInfo &act_info,
-                                                                                          const Size2D              &dilation)
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::configure(
+    ITensor                   *input,
+    const ITensor             *weights,
+    const ITensor             *biases,
+    ITensor                   *output,
+    const PadStrideInfo       &conv_info,
+    unsigned int               depth_multiplier,
+    const ActivationLayerInfo &act_info,
+    const Size2D              &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -82,9 +80,9 @@
     _impl->permute = is_nhwc;
 
     _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
-    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
-    _impl->op->configure(_impl->src->info(), _impl->weights->info(), _impl->biases == nullptr ? nullptr : _impl->biases->info(),
-                         _impl->dst->info(), info);
+    ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
+    _impl->op->configure(_impl->src->info(), _impl->weights->info(),
+                         _impl->biases == nullptr ? nullptr : _impl->biases->info(), _impl->dst->info(), info);
 
     // Configure pipeline
     ActivationLayerInfo act_info_to_use            = ActivationLayerInfo();
@@ -92,15 +90,15 @@
     const bool          is_relu6                   = arm_compute::utils::info_helpers::is_relu6(act_info);
     bool                is_activationlayer_enabled = act_info.enabled() && !(is_relu || is_relu6);
 
-    if(!is_activationlayer_enabled)
+    if (!is_activationlayer_enabled)
     {
         act_info_to_use = act_info;
     }
-    info = ConvolutionInfo{ conv_info, depth_multiplier, act_info_to_use, dilation };
+    info = ConvolutionInfo{conv_info, depth_multiplier, act_info_to_use, dilation};
 
     auto dwc_optimized_func = std::make_unique<cpu::CpuDepthwiseConv2dAssemblyDispatch>();
 
-    if(is_nhwc)
+    if (is_nhwc)
     {
         auto permute_input   = std::make_unique<cpu::CpuPermute>();
         auto permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -122,7 +120,9 @@
         _impl->permuted_output.info()->set_quantization_info(output->info()->quantization_info());
 
         // Configure optimized depthwise
-        dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(), biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(), info);
+        dwc_optimized_func->configure(_impl->permuted_input.info(), _impl->permuted_weights.info(),
+                                      biases == nullptr ? nullptr : biases->info(), _impl->permuted_output.info(),
+                                      info);
 
         // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
         _impl->permuted_output.info()->set_data_layout(DataLayout::NHWC);
@@ -133,29 +133,33 @@
     }
     else
     {
-        dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(), biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
+        dwc_optimized_func->configure(_impl->src->info(), _impl->weights->info(),
+                                      biases == nullptr ? nullptr : biases->info(), _impl->dst->info(), info);
     }
 
     // Allocate memory based on the internal memory requirements
     experimental::MemoryRequirements mem_req = dwc_optimized_func->workspace();
-    _impl->workspace.allocator()->init(TensorInfo(TensorShape{ mem_req[0].size + mem_req[0].alignment }, 1, DataType::S8), mem_req[0].alignment);
-    _impl->packed_weights.allocator()->init(TensorInfo(TensorShape{ mem_req[1].size + mem_req[1].alignment }, 1, DataType::S8), mem_req[1].alignment);
+    _impl->workspace.allocator()->init(TensorInfo(TensorShape{mem_req[0].size + mem_req[0].alignment}, 1, DataType::S8),
+                                       mem_req[0].alignment);
+    _impl->packed_weights.allocator()->init(
+        TensorInfo(TensorShape{mem_req[1].size + mem_req[1].alignment}, 1, DataType::S8), mem_req[1].alignment);
     _memory_group.manage(&_impl->workspace);
     _memory_group.manage(&_impl->packed_weights);
     _impl->workspace.allocator()->allocate();
     _impl->packed_weights.allocator()->allocate();
 }
 
-Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo         *input,
-                                                                                           const ITensorInfo         *weights,
-                                                                                           const ITensorInfo         *biases,
-                                                                                           const ITensorInfo         *output,
-                                                                                           const PadStrideInfo       &conv_info,
-                                                                                           unsigned int               depth_multiplier,
-                                                                                           const ActivationLayerInfo &act_info,
-                                                                                           const Size2D              &dilation)
+Status
+NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::validate(const ITensorInfo   *input,
+                                                                                    const ITensorInfo   *weights,
+                                                                                    const ITensorInfo   *biases,
+                                                                                    const ITensorInfo   *output,
+                                                                                    const PadStrideInfo &conv_info,
+                                                                                    unsigned int depth_multiplier,
+                                                                                    const ActivationLayerInfo &act_info,
+                                                                                    const Size2D              &dilation)
 {
-    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+    ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
     return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
 }
 
@@ -180,15 +184,15 @@
 
 void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerOptimizedInternal::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         // Permute weights
-        if(_impl->permute)
+        if (_impl->permute)
         {
             _impl->permuted_weights.allocator()->allocate();
         }
 
-        if(!_impl->permuted_weights.is_used())
+        if (!_impl->permuted_weights.is_used())
         {
             _impl->permuted_weights.allocator()->free();
         }
@@ -202,14 +206,14 @@
     Tensor                                   permuted_input{};
     Tensor                                   permuted_weights{};
     Tensor                                   permuted_output{};
-    bool                                     is_prepared{ false };
-    bool                                     is_nchw{ false };
-    bool                                     is_activationlayer_enabled{ false };
-    const ITensor                           *weights{ nullptr };
-    const ITensor                           *biases{ nullptr };
-    const ITensor                           *src{ nullptr };
-    ITensor                                 *dst{ nullptr };
-    std::shared_ptr<cpu::CpuDepthwiseConv2d> op{ nullptr };
+    bool                                     is_prepared{false};
+    bool                                     is_nchw{false};
+    bool                                     is_activationlayer_enabled{false};
+    const ITensor                           *weights{nullptr};
+    const ITensor                           *biases{nullptr};
+    const ITensor                           *src{nullptr};
+    ITensor                                 *dst{nullptr};
+    std::shared_ptr<cpu::CpuDepthwiseConv2d> op{nullptr};
 };
 
 NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::NEDepthwiseConvolutionLayerGeneric()
@@ -217,14 +221,21 @@
 {
 }
 
-void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                                                                                unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::configure(ITensor             *input,
+                                                                                const ITensor       *weights,
+                                                                                const ITensor       *biases,
+                                                                                ITensor             *output,
+                                                                                const PadStrideInfo &conv_info,
+                                                                                unsigned int         depth_multiplier,
+                                                                                const ActivationLayerInfo &act_info,
+                                                                                const Size2D              &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
-    const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+    const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
     _impl->op = std::make_unique<cpu::CpuDepthwiseConv2d>();
-    _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(), info);
+    _impl->op->configure(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output->info(),
+                         info);
 
     _impl->src         = input;
     _impl->dst         = output;
@@ -236,7 +247,7 @@
     ITensor       *input_to_use   = input;
     const ITensor *weights_to_use = weights;
     ITensor       *output_to_use  = output;
-    if(_impl->is_nchw)
+    if (_impl->is_nchw)
     {
         auto permute_input   = std::make_unique<cpu::CpuPermute>();
         auto permute_weights = std::make_unique<cpu::CpuPermute>();
@@ -249,14 +260,16 @@
         _impl->permuted_weights.info()->set_data_layout(DataLayout::NHWC);
         weights_to_use = &_impl->permuted_weights;
 
-        _impl->permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
+        _impl->permuted_output.allocator()->init(
+            output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(TensorShape()));
         output_to_use = &_impl->permuted_output;
     }
 
     auto depthwise_conv_kernel = std::make_unique<cpu::kernels::CpuDepthwiseConv2dNativeKernel>();
-    depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(), biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
+    depthwise_conv_kernel->configure(input_to_use->info(), weights_to_use->info(),
+                                     biases == nullptr ? nullptr : biases->info(), output_to_use->info(), info);
 
-    if(_impl->is_nchw)
+    if (_impl->is_nchw)
     {
         auto permute_output = std::make_unique<cpu::CpuPermute>();
         permute_output->configure(_impl->permuted_output.info(), output->info(), PermutationVector(1U, 2U, 0U));
@@ -268,11 +281,16 @@
     }
 }
 
-Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
+Status NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayerGeneric::validate(const ITensorInfo   *input,
+                                                                                 const ITensorInfo   *weights,
+                                                                                 const ITensorInfo   *biases,
+                                                                                 const ITensorInfo   *output,
                                                                                  const PadStrideInfo &conv_info,
-                                                                                 unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+                                                                                 unsigned int         depth_multiplier,
+                                                                                 const ActivationLayerInfo &act_info,
+                                                                                 const Size2D              &dilation)
 {
-    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+    ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
     return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
 }
 
@@ -298,49 +316,64 @@
 #ifndef DOXYGEN_SKIP_THIS
 struct NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer::Impl
 {
-    DepthwiseConvolutionFunction                 depth_conv_func{ DepthwiseConvolutionFunction::OPTIMIZED };
-    NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{ nullptr };
+    DepthwiseConvolutionFunction                 depth_conv_func{DepthwiseConvolutionFunction::OPTIMIZED};
+    NEDepthwiseConvolutionLayerOptimizedInternal func_optimized{nullptr};
     NEDepthwiseConvolutionLayerGeneric           func_generic{};
-    std::shared_ptr<cpu::CpuDepthwiseConv2d>     op{ nullptr };
+    std::shared_ptr<cpu::CpuDepthwiseConv2d>     op{nullptr};
 };
 #endif // DOXYGEN_SKIP_THIS
 
-void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier,
-                                            const ActivationLayerInfo &act_info, const Size2D &dilation)
+void NEDepthwiseConvolutionLayer::configure(ITensor                   *input,
+                                            const ITensor             *weights,
+                                            const ITensor             *biases,
+                                            ITensor                   *output,
+                                            const PadStrideInfo       &conv_info,
+                                            unsigned int               depth_multiplier,
+                                            const ActivationLayerInfo &act_info,
+                                            const Size2D              &dilation)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
     ARM_COMPUTE_LOG_PARAMS(input, weights, output, conv_info, depth_multiplier, biases, act_info, dilation);
-    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(),
-                                                                     output->info(), conv_info, depth_multiplier, act_info, dilation));
+    ARM_COMPUTE_ERROR_THROW_ON(NEDepthwiseConvolutionLayer::validate(
+        input->info(), weights->info(), (biases == nullptr) ? nullptr : biases->info(), output->info(), conv_info,
+        depth_multiplier, act_info, dilation));
 
-    const ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+    const ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
     _impl->op              = std::make_shared<cpu::CpuDepthwiseConv2d>();
-    _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
-                                                                          info);
-    switch(_impl->depth_conv_func)
+    _impl->depth_conv_func = _impl->op->get_depthwiseconvolution_function(
+        input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), info);
+    switch (_impl->depth_conv_func)
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
-            _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+            _impl->func_optimized.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info,
+                                            dilation);
             break;
         case DepthwiseConvolutionFunction::GENERIC:
-            _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info, dilation);
+            _impl->func_generic.configure(input, weights, biases, output, conv_info, depth_multiplier, act_info,
+                                          dilation);
             break;
         default:
             ARM_COMPUTE_ERROR("Unsupported DepthwiseConvolutionFunction");
     }
 }
 
-Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                             unsigned int depth_multiplier, const ActivationLayerInfo &act_info, const Size2D &dilation)
+Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo         *input,
+                                             const ITensorInfo         *weights,
+                                             const ITensorInfo         *biases,
+                                             const ITensorInfo         *output,
+                                             const PadStrideInfo       &conv_info,
+                                             unsigned int               depth_multiplier,
+                                             const ActivationLayerInfo &act_info,
+                                             const Size2D              &dilation)
 {
-    ConvolutionInfo info{ conv_info, depth_multiplier, act_info, dilation };
+    ConvolutionInfo info{conv_info, depth_multiplier, act_info, dilation};
     return cpu::CpuDepthwiseConv2d::validate(input, weights, biases, output, info);
 }
 
 void NEDepthwiseConvolutionLayer::run()
 {
-    switch(_impl->depth_conv_func)
+    switch (_impl->depth_conv_func)
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
             _impl->func_optimized.run();
@@ -355,7 +388,7 @@
 
 void NEDepthwiseConvolutionLayer::prepare()
 {
-    switch(_impl->depth_conv_func)
+    switch (_impl->depth_conv_func)
     {
         case DepthwiseConvolutionFunction::OPTIMIZED:
             _impl->func_optimized.prepare();
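
Note: the dominant change in the hunks above is the one-parameter-per-line break for signatures that exceed the column limit, with '*' and '&' packed against the parameter name and the name column aligned. A minimal compilable sketch with stub types (hypothetical names, not from this file):

    struct Status {};
    struct ITensorInfo {};
    struct PadStrideInfo {};

    // One parameter per line; declarators right-aligned against the names.
    Status validate_example(const ITensorInfo   *input,
                            const ITensorInfo   *weights,
                            const PadStrideInfo &conv_info,
                            unsigned int         depth_multiplier);
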
diff --git a/src/runtime/NEON/functions/NEDequantizationLayer.cpp b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
index 83e0131..28d19d2 100644
--- a/src/runtime/NEON/functions/NEDequantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEDequantizationLayer.cpp
@@ -26,19 +26,19 @@
 
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/cpu/operators/CpuDequantize.h"
 
 namespace arm_compute
 {
 struct NEDequantizationLayer::Impl
 {
-    const ITensor                      *src{ nullptr };
-    ITensor                            *dst{ nullptr };
-    std::unique_ptr<cpu::CpuDequantize> op{ nullptr };
+    const ITensor                      *src{nullptr};
+    ITensor                            *dst{nullptr};
+    std::unique_ptr<cpu::CpuDequantize> op{nullptr};
 };
 
-NEDequantizationLayer::NEDequantizationLayer()
-    : _impl(std::make_unique<Impl>())
+NEDequantizationLayer::NEDequantizationLayer() : _impl(std::make_unique<Impl>())
 {
 }
 NEDequantizationLayer::~NEDequantizationLayer() = default;
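
Note: two smaller rules also land in this file: default member initializers lose the inner brace spaces ({nullptr} rather than { nullptr }), and a short constructor initializer list is packed onto the signature line. A compilable sketch (hypothetical Impl/Wrapper names):

    #include <memory>

    struct Impl
    {
        int value{0}; // was: int value{ 0 };
    };

    struct Wrapper
    {
        Wrapper() : _impl(std::make_unique<Impl>()) // was broken onto the next line
        {
        }
        std::unique_ptr<Impl> _impl{nullptr};
    };
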
diff --git a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
index 1da8b01..b347390 100644
--- a/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
+++ b/src/runtime/NEON/functions/NEDetectionPostProcessLayer.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/common/utils/Log.h"
 
 #include <cstddef>
@@ -35,24 +36,36 @@
 namespace arm_compute
 {
 NEDetectionPostProcessLayer::NEDetectionPostProcessLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _dequantize(), _detection_post_process(), _decoded_scores(), _run_dequantize(false)
+    : _memory_group(std::move(memory_manager)),
+      _dequantize(),
+      _detection_post_process(),
+      _decoded_scores(),
+      _run_dequantize(false)
 {
 }
 
-void NEDetectionPostProcessLayer::configure(const ITensor *input_box_encoding, const ITensor *input_scores, const ITensor *input_anchors,
-                                            ITensor *output_boxes, ITensor *output_classes, ITensor *output_scores, ITensor *num_detection, DetectionPostProcessLayerInfo info)
+void NEDetectionPostProcessLayer::configure(const ITensor                *input_box_encoding,
+                                            const ITensor                *input_scores,
+                                            const ITensor                *input_anchors,
+                                            ITensor                      *output_boxes,
+                                            ITensor                      *output_classes,
+                                            ITensor                      *output_scores,
+                                            ITensor                      *num_detection,
+                                            DetectionPostProcessLayerInfo info)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores);
-    ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(), output_classes->info(),
-                                                                     output_scores->info(),
-                                                                     num_detection->info(), info));
-    ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes,
+                                 output_scores);
+    ARM_COMPUTE_ERROR_THROW_ON(NEDetectionPostProcessLayer::validate(
+        input_box_encoding->info(), input_scores->info(), input_anchors->info(), output_boxes->info(),
+        output_classes->info(), output_scores->info(), num_detection->info(), info));
+    ARM_COMPUTE_LOG_PARAMS(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores,
+                           num_detection, info);
 
     const ITensor                *input_scores_to_use = input_scores;
     DetectionPostProcessLayerInfo info_to_use         = info;
     _run_dequantize                                   = is_data_type_quantized(input_box_encoding->info()->data_type());
 
-    if(_run_dequantize)
+    if (_run_dequantize)
     {
         _memory_group.manage(&_decoded_scores);
 
@@ -61,26 +74,37 @@
         input_scores_to_use = &_decoded_scores;
 
         // Create a new info struct to avoid dequantizing in the CPP layer
-        std::array<float, 4> scales_values{ info.scale_value_y(), info.scale_value_x(), info.scale_value_h(), info.scale_value_w() };
-        DetectionPostProcessLayerInfo info_quantized(info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(), info.num_classes(),
-                                                     scales_values, info.use_regular_nms(), info.detection_per_class(), false);
+        std::array<float, 4>          scales_values{info.scale_value_y(), info.scale_value_x(), info.scale_value_h(),
+                                           info.scale_value_w()};
+        DetectionPostProcessLayerInfo info_quantized(
+            info.max_detections(), info.max_classes_per_detection(), info.nms_score_threshold(), info.iou_threshold(),
+            info.num_classes(), scales_values, info.use_regular_nms(), info.detection_per_class(), false);
         info_to_use = info_quantized;
     }
 
-    _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes, output_classes, output_scores, num_detection, info_to_use);
+    _detection_post_process.configure(input_box_encoding, input_scores_to_use, input_anchors, output_boxes,
+                                      output_classes, output_scores, num_detection, info_to_use);
     _decoded_scores.allocator()->allocate();
 }
 
-Status NEDetectionPostProcessLayer::validate(const ITensorInfo *input_box_encoding, const ITensorInfo *input_scores, const ITensorInfo *input_anchors,
-                                             ITensorInfo *output_boxes, ITensorInfo *output_classes, ITensorInfo *output_scores, ITensorInfo *num_detection, DetectionPostProcessLayerInfo info)
+Status NEDetectionPostProcessLayer::validate(const ITensorInfo            *input_box_encoding,
+                                             const ITensorInfo            *input_scores,
+                                             const ITensorInfo            *input_anchors,
+                                             ITensorInfo                  *output_boxes,
+                                             ITensorInfo                  *output_classes,
+                                             ITensorInfo                  *output_scores,
+                                             ITensorInfo                  *num_detection,
+                                             DetectionPostProcessLayerInfo info)
 {
     bool run_dequantize = is_data_type_quantized(input_box_encoding->data_type());
-    if(run_dequantize)
+    if (run_dequantize)
     {
         TensorInfo decoded_classes_info = input_scores->clone()->set_is_resizable(true).set_data_type(DataType::F32);
         ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(input_scores, &decoded_classes_info));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors, output_boxes, output_classes, output_scores, num_detection, info));
+    ARM_COMPUTE_RETURN_ON_ERROR(CPPDetectionPostProcessLayer::validate(input_box_encoding, input_scores, input_anchors,
+                                                                       output_boxes, output_classes, output_scores,
+                                                                       num_detection, info));
 
     return Status{};
 }
@@ -90,7 +114,7 @@
     MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Decode scores if necessary
-    if(_run_dequantize)
+    if (_run_dequantize)
     {
         _dequantize.run();
     }
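
Note: the lone blank line added near the top of several files here is clang-format regrouping includes: public arm_compute headers form one block and project-internal src/ headers another, separated by a blank line. Schematically, using headers that appear in this patch:

    #include "arm_compute/core/Error.h"
    #include "arm_compute/core/Validate.h"

    #include "src/common/utils/Log.h"
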
diff --git a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
index ef3d3d6..f1c2cf9 100644
--- a/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDirectConvolutionLayer.cpp
@@ -27,17 +27,18 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/cpu/operators/CpuDirectConv2d.h"
 
 namespace arm_compute
 {
 struct NEDirectConvolutionLayer::Impl
 {
-    ITensor                              *src{ nullptr };
-    const ITensor                        *weights{ nullptr };
-    const ITensor                        *bias{ nullptr };
-    ITensor                              *dst{ nullptr };
-    std::unique_ptr<cpu::CpuDirectConv2d> op{ nullptr };
+    ITensor                              *src{nullptr};
+    const ITensor                        *weights{nullptr};
+    const ITensor                        *bias{nullptr};
+    ITensor                              *dst{nullptr};
+    std::unique_ptr<cpu::CpuDirectConv2d> op{nullptr};
 };
 
 NEDirectConvolutionLayer::NEDirectConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -46,17 +47,27 @@
 }
 NEDirectConvolutionLayer::~NEDirectConvolutionLayer() = default;
 
-void NEDirectConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *bias, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void NEDirectConvolutionLayer::configure(ITensor                   *input,
+                                         const ITensor             *weights,
+                                         const ITensor             *bias,
+                                         ITensor                   *output,
+                                         const PadStrideInfo       &conv_info,
+                                         const ActivationLayerInfo &act_info)
 {
     _impl->src     = input;
     _impl->weights = weights;
     _impl->bias    = bias;
     _impl->dst     = output;
     _impl->op      = std::make_unique<cpu::CpuDirectConv2d>(_memory_manager);
-    _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(), conv_info, act_info);
+    _impl->op->configure(input->info(), weights->info(), (bias != nullptr ? bias->info() : nullptr), output->info(),
+                         conv_info, act_info);
 }
 
-Status NEDirectConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status NEDirectConvolutionLayer::validate(const ITensorInfo         *input,
+                                          const ITensorInfo         *weights,
+                                          const ITensorInfo         *bias,
+                                          const ITensorInfo         *output,
+                                          const PadStrideInfo       &conv_info,
                                           const ActivationLayerInfo &act_info)
 {
     return cpu::CpuDirectConv2d::validate(input, weights, bias, output, conv_info, act_info);
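
Note: every hunk in this patch also inserts a space between a control-flow keyword and its opening parenthesis. In isolation, as a compilable sketch:

    void control_flow_spacing(int x)
    {
        if (x != 0) // was: if(x != 0)
        {
            --x;
        }
        for (int i = 0; i < x; ++i) // was: for(...)
        {
        }
        switch (x) // was: switch(x)
        {
            default:
                break;
        }
        while (x > 0) // was: while(x > 0)
        {
            --x;
        }
    }
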
diff --git a/src/runtime/NEON/functions/NEElementwiseOperations.cpp b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
index c958adf..685ef2d 100644
--- a/src/runtime/NEON/functions/NEElementwiseOperations.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseOperations.cpp
@@ -22,10 +22,11 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/NEON/functions/NEElementwiseOperations.h"
-#include "arm_compute/core/Validate.h"
-#include "src/cpu/operators/CpuElementwise.h"
 
 #include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/cpu/operators/CpuElementwise.h"
 
 #include <utility>
 
@@ -33,17 +34,16 @@
 {
 struct NEElementwiseMax::Impl
 {
-    const ITensor                          *src_0{ nullptr };
-    const ITensor                          *src_1{ nullptr };
-    ITensor                                *dst{ nullptr };
-    std::unique_ptr<cpu::CpuElementwiseMax> op{ nullptr };
+    const ITensor                          *src_0{nullptr};
+    const ITensor                          *src_1{nullptr};
+    ITensor                                *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseMax> op{nullptr};
 };
 
-NEElementwiseMax::NEElementwiseMax()
-    : _impl(std::make_unique<Impl>())
+NEElementwiseMax::NEElementwiseMax() : _impl(std::make_unique<Impl>())
 {
 }
-NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&) = default;
+NEElementwiseMax::NEElementwiseMax(NEElementwiseMax &&)            = default;
 NEElementwiseMax &NEElementwiseMax::operator=(NEElementwiseMax &&) = default;
 NEElementwiseMax::~NEElementwiseMax()                              = default;
 
@@ -57,7 +57,10 @@
     _impl->op->configure(input1->info(), input2->info(), output->info());
 }
 
-Status NEElementwiseMax::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseMax::validate(const ITensorInfo         *input1,
+                                  const ITensorInfo         *input2,
+                                  const ITensorInfo         *output,
+                                  const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return cpu::CpuElementwiseMax::validate(input1, input2, output);
@@ -74,17 +77,16 @@
 
 struct NEElementwiseMin::Impl
 {
-    const ITensor                          *src_0{ nullptr };
-    const ITensor                          *src_1{ nullptr };
-    ITensor                                *dst{ nullptr };
-    std::unique_ptr<cpu::CpuElementwiseMin> op{ nullptr };
+    const ITensor                          *src_0{nullptr};
+    const ITensor                          *src_1{nullptr};
+    ITensor                                *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseMin> op{nullptr};
 };
 
-NEElementwiseMin::NEElementwiseMin()
-    : _impl(std::make_unique<Impl>())
+NEElementwiseMin::NEElementwiseMin() : _impl(std::make_unique<Impl>())
 {
 }
-NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&) = default;
+NEElementwiseMin::NEElementwiseMin(NEElementwiseMin &&)            = default;
 NEElementwiseMin &NEElementwiseMin::operator=(NEElementwiseMin &&) = default;
 NEElementwiseMin::~NEElementwiseMin()                              = default;
 
@@ -98,7 +100,10 @@
     _impl->op->configure(input1->info(), input2->info(), output->info());
 }
 
-Status NEElementwiseMin::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseMin::validate(const ITensorInfo         *input1,
+                                  const ITensorInfo         *input2,
+                                  const ITensorInfo         *output,
+                                  const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return cpu::CpuElementwiseMin::validate(input1, input2, output);
@@ -115,21 +120,23 @@
 
 struct NEElementwiseSquaredDiff::Impl
 {
-    const ITensor                                  *src_0{ nullptr };
-    const ITensor                                  *src_1{ nullptr };
-    ITensor                                        *dst{ nullptr };
-    std::unique_ptr<cpu::CpuElementwiseSquaredDiff> op{ nullptr };
+    const ITensor                                  *src_0{nullptr};
+    const ITensor                                  *src_1{nullptr};
+    ITensor                                        *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseSquaredDiff> op{nullptr};
 };
 
-NEElementwiseSquaredDiff::NEElementwiseSquaredDiff()
-    : _impl(std::make_unique<Impl>())
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff() : _impl(std::make_unique<Impl>())
 {
 }
-NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&) = default;
+NEElementwiseSquaredDiff::NEElementwiseSquaredDiff(NEElementwiseSquaredDiff &&)            = default;
 NEElementwiseSquaredDiff &NEElementwiseSquaredDiff::operator=(NEElementwiseSquaredDiff &&) = default;
 NEElementwiseSquaredDiff::~NEElementwiseSquaredDiff()                                      = default;
 
-void NEElementwiseSquaredDiff::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwiseSquaredDiff::configure(ITensor                   *input1,
+                                         ITensor                   *input2,
+                                         ITensor                   *output,
+                                         const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     _impl->src_0 = input1;
@@ -139,7 +146,10 @@
     _impl->op->configure(input1->info(), input2->info(), output->info());
 }
 
-Status NEElementwiseSquaredDiff::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseSquaredDiff::validate(const ITensorInfo         *input1,
+                                          const ITensorInfo         *input2,
+                                          const ITensorInfo         *output,
+                                          const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return cpu::CpuElementwiseSquaredDiff::validate(input1, input2, output);
@@ -156,21 +166,23 @@
 
 struct NEElementwiseDivision::Impl
 {
-    const ITensor                               *src_0{ nullptr };
-    const ITensor                               *src_1{ nullptr };
-    ITensor                                     *dst{ nullptr };
-    std::unique_ptr<cpu::CpuElementwiseDivision> op{ nullptr };
+    const ITensor                               *src_0{nullptr};
+    const ITensor                               *src_1{nullptr};
+    ITensor                                     *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseDivision> op{nullptr};
 };
 
-NEElementwiseDivision::NEElementwiseDivision()
-    : _impl(std::make_unique<Impl>())
+NEElementwiseDivision::NEElementwiseDivision() : _impl(std::make_unique<Impl>())
 {
 }
-NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&) = default;
+NEElementwiseDivision::NEElementwiseDivision(NEElementwiseDivision &&)            = default;
 NEElementwiseDivision &NEElementwiseDivision::operator=(NEElementwiseDivision &&) = default;
 NEElementwiseDivision::~NEElementwiseDivision()                                   = default;
 
-void NEElementwiseDivision::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwiseDivision::configure(ITensor                   *input1,
+                                      ITensor                   *input2,
+                                      ITensor                   *output,
+                                      const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     _impl->src_0 = input1;
@@ -180,7 +192,10 @@
     _impl->op->configure(input1->info(), input2->info(), output->info());
 }
 
-Status NEElementwiseDivision::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwiseDivision::validate(const ITensorInfo         *input1,
+                                       const ITensorInfo         *input2,
+                                       const ITensorInfo         *output,
+                                       const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return cpu::CpuElementwiseDivision::validate(input1, input2, output);
@@ -197,21 +212,23 @@
 
 struct NEElementwisePower::Impl
 {
-    const ITensor                            *src_0{ nullptr };
-    const ITensor                            *src_1{ nullptr };
-    ITensor                                  *dst{ nullptr };
-    std::unique_ptr<cpu::CpuElementwisePower> op{ nullptr };
+    const ITensor                            *src_0{nullptr};
+    const ITensor                            *src_1{nullptr};
+    ITensor                                  *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwisePower> op{nullptr};
 };
 
-NEElementwisePower::NEElementwisePower()
-    : _impl(std::make_unique<Impl>())
+NEElementwisePower::NEElementwisePower() : _impl(std::make_unique<Impl>())
 {
 }
-NEElementwisePower::NEElementwisePower(NEElementwisePower &&) = default;
+NEElementwisePower::NEElementwisePower(NEElementwisePower &&)            = default;
 NEElementwisePower &NEElementwisePower::operator=(NEElementwisePower &&) = default;
 NEElementwisePower::~NEElementwisePower()                                = default;
 
-void NEElementwisePower::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEElementwisePower::configure(ITensor                   *input1,
+                                   ITensor                   *input2,
+                                   ITensor                   *output,
+                                   const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_UNUSED(act_info);
     _impl->src_0 = input1;
@@ -221,7 +238,10 @@
     _impl->op->configure(input1->info(), input2->info(), output->info());
 }
 
-Status NEElementwisePower::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEElementwisePower::validate(const ITensorInfo         *input1,
+                                    const ITensorInfo         *input2,
+                                    const ITensorInfo         *output,
+                                    const ActivationLayerInfo &act_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled());
     return cpu::CpuElementwisePower::validate(input1, input2, output);
@@ -239,22 +259,22 @@
 template <ComparisonOperation COP>
 struct NEElementwiseComparisonStatic<COP>::Impl
 {
-    const ITensor                                            *src_0{ nullptr };
-    const ITensor                                            *src_1{ nullptr };
-    ITensor                                                  *dst{ nullptr };
-    std::unique_ptr<cpu::CpuElementwiseComparisonStatic<COP>> op{ nullptr };
+    const ITensor                                            *src_0{nullptr};
+    const ITensor                                            *src_1{nullptr};
+    ITensor                                                  *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseComparisonStatic<COP>> op{nullptr};
 };
 
 template <ComparisonOperation COP>
-NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic()
-    : _impl(std::make_unique<Impl>())
+NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic() : _impl(std::make_unique<Impl>())
 {
 }
 template <ComparisonOperation COP>
 NEElementwiseComparisonStatic<COP>::NEElementwiseComparisonStatic(NEElementwiseComparisonStatic &&) = default;
-template <ComparisonOperation       COP>
-NEElementwiseComparisonStatic<COP> &NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default;
-template <ComparisonOperation       COP>
+template <ComparisonOperation COP>
+NEElementwiseComparisonStatic<COP> &
+NEElementwiseComparisonStatic<COP>::operator=(NEElementwiseComparisonStatic &&) = default;
+template <ComparisonOperation COP>
 NEElementwiseComparisonStatic<COP>::~NEElementwiseComparisonStatic() = default;
 
 template <ComparisonOperation COP>
@@ -268,13 +288,15 @@
 }
 
 template <ComparisonOperation COP>
-Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output)
+Status NEElementwiseComparisonStatic<COP>::validate(const ITensorInfo *input1,
+                                                    const ITensorInfo *input2,
+                                                    const ITensorInfo *output)
 {
     return cpu::CpuElementwiseComparisonStatic<COP>::validate(input1, input2, output);
 }
 
 template <ComparisonOperation COP>
-void                          NEElementwiseComparisonStatic<COP>::run()
+void NEElementwiseComparisonStatic<COP>::run()
 {
     ITensorPack pack;
     pack.add_tensor(TensorType::ACL_SRC_0, _impl->src_0);
@@ -285,17 +307,16 @@
 
 struct NEElementwiseComparison::Impl
 {
-    const ITensor                                 *src_0{ nullptr };
-    const ITensor                                 *src_1{ nullptr };
-    ITensor                                       *dst{ nullptr };
-    std::unique_ptr<cpu::CpuElementwiseComparison> op{ nullptr };
+    const ITensor                                 *src_0{nullptr};
+    const ITensor                                 *src_1{nullptr};
+    ITensor                                       *dst{nullptr};
+    std::unique_ptr<cpu::CpuElementwiseComparison> op{nullptr};
 };
 
-NEElementwiseComparison::NEElementwiseComparison()
-    : _impl(std::make_unique<Impl>())
+NEElementwiseComparison::NEElementwiseComparison() : _impl(std::make_unique<Impl>())
 {
 }
-NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&) = default;
+NEElementwiseComparison::NEElementwiseComparison(NEElementwiseComparison &&)            = default;
 NEElementwiseComparison &NEElementwiseComparison::operator=(NEElementwiseComparison &&) = default;
 NEElementwiseComparison::~NEElementwiseComparison()                                     = default;
 
@@ -308,7 +329,10 @@
     _impl->op->configure(input1->info(), input2->info(), output->info(), op);
 }
 
-Status NEElementwiseComparison::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, ComparisonOperation op)
+Status NEElementwiseComparison::validate(const ITensorInfo  *input1,
+                                         const ITensorInfo  *input2,
+                                         const ITensorInfo  *output,
+                                         ComparisonOperation op)
 {
     return cpu::CpuElementwiseComparison::validate(input1, input2, output, op);
 }
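
Note: the '= default' churn in this file is alignment only: consecutive defaulted special members share a single '=' column, so reflowing one line moves its neighbours too. A sketch with a hypothetical class:

    class MovableOnly
    {
    public:
        MovableOnly()                          = default;
        MovableOnly(MovableOnly &&)            = default;
        MovableOnly &operator=(MovableOnly &&) = default;
        ~MovableOnly()                         = default;
    };
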
diff --git a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
index a0674ec..23a092c 100644
--- a/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
+++ b/src/runtime/NEON/functions/NEElementwiseUnaryLayer.cpp
@@ -22,7 +22,9 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/NEON/functions/NEElementwiseUnaryLayer.h"
+
 #include "src/cpu/operators/CpuElementwiseUnary.h"
+
 #include <utility>
 
 namespace arm_compute
@@ -32,21 +34,20 @@
 template <ElementWiseUnary op>
 struct NEElementwiseUnaryLayer<op>::Impl
 {
-    const ITensor                *src{ nullptr };
-    ITensor                      *dst{ nullptr };
-    std::unique_ptr<OperatorType> cpu_op{ nullptr };
+    const ITensor                *src{nullptr};
+    ITensor                      *dst{nullptr};
+    std::unique_ptr<OperatorType> cpu_op{nullptr};
 };
 
 template <ElementWiseUnary op>
-NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer()
-    : _impl(std::make_unique<Impl>())
+NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer() : _impl(std::make_unique<Impl>())
 {
 }
 template <ElementWiseUnary op>
 NEElementwiseUnaryLayer<op>::~NEElementwiseUnaryLayer() = default;
 template <ElementWiseUnary op>
 NEElementwiseUnaryLayer<op>::NEElementwiseUnaryLayer(NEElementwiseUnaryLayer &&) = default;
-template <ElementWiseUnary   op>
+template <ElementWiseUnary op>
 NEElementwiseUnaryLayer<op> &NEElementwiseUnaryLayer<op>::operator=(NEElementwiseUnaryLayer &&) = default;
 
 template <ElementWiseUnary op>
@@ -65,7 +66,7 @@
 }
 
 template <ElementWiseUnary op>
-void                       NEElementwiseUnaryLayer<op>::run()
+void NEElementwiseUnaryLayer<op>::run()
 {
     ITensorPack pack;
     pack.add_tensor(TensorType::ACL_SRC, _impl->src);
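
Note: the template hunks fix two artefacts of the previous alignment pass: stray padding inside the template header itself (template <ElementWiseUnary   op>) and padding between a return type and the function name. With the new settings the header stays plain, and a declarator that no longer fits breaks after the return type instead. A compilable sketch (hypothetical template; the break is forced here for illustration):

    template <typename T>
    struct Comparison
    {
        Comparison &operator=(Comparison &&) = default;
    };

    // An over-long qualified name now breaks after the return type:
    template <typename T>
    Comparison<T> &
    assign_default(Comparison<T> &lhs, Comparison<T> &&rhs)
    {
        lhs = static_cast<Comparison<T> &&>(rhs);
        return lhs;
    }
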
diff --git a/src/runtime/NEON/functions/NEFFT1D.cpp b/src/runtime/NEON/functions/NEFFT1D.cpp
index 343b817..fb75f9d 100644
--- a/src/runtime/NEON/functions/NEFFT1D.cpp
+++ b/src/runtime/NEON/functions/NEFFT1D.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
 #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
@@ -37,7 +38,15 @@
 NEFFT1D::~NEFFT1D() = default;
 
 NEFFT1D::NEFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _digit_reverse_kernel(), _fft_kernels(), _scale_kernel(), _digit_reversed_input(), _digit_reverse_indices(), _num_ffts(0), _axis(0), _run_scale(false)
+    : _memory_group(std::move(memory_manager)),
+      _digit_reverse_kernel(),
+      _fft_kernels(),
+      _scale_kernel(),
+      _digit_reversed_input(),
+      _digit_reverse_indices(),
+      _num_ffts(0),
+      _axis(0),
+      _run_scale(false)
 {
 }
 
@@ -74,7 +83,7 @@
     _fft_kernels.resize(_num_ffts);
     _axis = config.axis;
 
-    for(unsigned int i = 0; i < _num_ffts; ++i)
+    for (unsigned int i = 0; i < _num_ffts; ++i)
     {
         const unsigned int radix_for_stage = decomposed_vector.at(i);
 
@@ -84,19 +93,21 @@
         fft_kernel_info.Nx             = Nx;
         fft_kernel_info.is_first_stage = (i == 0);
         _fft_kernels[i]                = std::make_unique<NEFFTRadixStageKernel>();
-        _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr, fft_kernel_info);
+        _fft_kernels[i]->configure(&_digit_reversed_input, ((i == (_num_ffts - 1)) && !is_c2r) ? output : nullptr,
+                                   fft_kernel_info);
 
         Nx *= radix_for_stage;
     }
 
     // Configure scale kernel
-    if(_run_scale)
+    if (_run_scale)
     {
         FFTScaleKernelInfo scale_config;
         scale_config.scale     = static_cast<float>(N);
         scale_config.conjugate = config.direction == FFTDirection::Inverse;
         _scale_kernel          = std::make_unique<NEFFTScaleKernel>();
-        is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config) : _scale_kernel->configure(output, nullptr, scale_config);
+        is_c2r ? _scale_kernel->configure(&_digit_reversed_input, output, scale_config)
+               : _scale_kernel->configure(output, nullptr, scale_config);
     }
 
     // Allocate tensors
@@ -113,7 +124,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() != DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_channels() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({ 0, 1 }).count(config.axis) == 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(std::set<unsigned int>({0, 1}).count(config.axis) == 0);
 
     // Check if FFT is decomposable
     const auto         supported_radix   = NEFFTRadixStageKernel::supported_radix();
@@ -122,7 +133,7 @@
     ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         // All combinations are supported except real input with real output (i.e., both input channels set to 1)
         ARM_COMPUTE_RETURN_ERROR_ON(output->num_channels() == 1 && input->num_channels() == 1);
@@ -140,13 +151,13 @@
 
     NEScheduler::get().schedule(_digit_reverse_kernel.get(), (_axis == 0 ? Window::DimY : Window::DimZ));
 
-    for(unsigned int i = 0; i < _num_ffts; ++i)
+    for (unsigned int i = 0; i < _num_ffts; ++i)
     {
         NEScheduler::get().schedule(_fft_kernels[i].get(), (_axis == 0 ? Window::DimY : Window::DimX));
     }
 
     // Run output scaling
-    if(_run_scale)
+    if (_run_scale)
     {
         NEScheduler::get().schedule(_scale_kernel.get(), Window::DimY);
     }
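
Note: constructors whose initializer lists outgrow the column limit are now broken with one member per line, as in the constructor hunk at the top of this file. A compilable mirror of the pattern (hypothetical names throughout):

    #include <memory>
    #include <utility>
    #include <vector>

    struct MemoryManagerStub
    {
    };

    class Fft1dLike
    {
    public:
        explicit Fft1dLike(std::shared_ptr<MemoryManagerStub> memory_manager)
            : _memory_group(std::move(memory_manager)),
              _fft_kernels(),
              _num_ffts(0),
              _axis(0),
              _run_scale(false)
        {
        }

    private:
        std::shared_ptr<MemoryManagerStub> _memory_group;
        std::vector<int>                   _fft_kernels;
        unsigned int                       _num_ffts;
        unsigned int                       _axis;
        bool                               _run_scale;
    };
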
diff --git a/src/runtime/NEON/functions/NEFFT2D.cpp b/src/runtime/NEON/functions/NEFFT2D.cpp
index ab422bd..0669092 100644
--- a/src/runtime/NEON/functions/NEFFT2D.cpp
+++ b/src/runtime/NEON/functions/NEFFT2D.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Scheduler.h"
+
 #include "src/common/utils/Log.h"
 
 namespace arm_compute
@@ -33,7 +34,10 @@
 NEFFT2D::~NEFFT2D() = default;
 
 NEFFT2D::NEFFT2D(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _first_pass_func(memory_manager), _second_pass_func(memory_manager), _first_pass_tensor()
+    : _memory_group(memory_manager),
+      _first_pass_func(memory_manager),
+      _second_pass_func(memory_manager),
+      _first_pass_tensor()
 {
 }
 
@@ -78,7 +82,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(NEFFT1D::validate(&first_pass_tensor, output, second_pass_config));
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
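
Note: braced initializer lists throughout the patch lose their inner spaces, including nested ones. A compilable sketch (PaddingInfo/PaddingList are hypothetical aliases standing in for the library types):

    #include <set>
    #include <utility>
    #include <vector>

    using PaddingInfo = std::pair<unsigned int, unsigned int>;
    using PaddingList = std::vector<PaddingInfo>;

    int main()
    {
        std::set<unsigned int> axes({0, 1});               // was: ({ 0, 1 })
        const PaddingList      padding = {{0, 3}, {0, 5}}; // was: { { 0, 3 }, { 0, 5 } }
        return (axes.count(0) == 1 && padding.size() == 2) ? 0 : 1;
    }
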
diff --git a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
index 0551d75..94f85e5 100644
--- a/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEFFTConvolutionLayer.cpp
@@ -25,15 +25,16 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/NEON/kernels/NEFFTDigitReverseKernel.h"
 #include "src/core/NEON/kernels/NEFFTRadixStageKernel.h"
 #include "src/core/NEON/kernels/NEFFTScaleKernel.h"
 #include "src/core/NEON/kernels/NEPadLayerKernel.h"
 #include "src/core/NEON/kernels/NEReductionOperationKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/utils/helpers/fft.h"
 
 namespace arm_compute
@@ -46,11 +47,11 @@
 
     int  pad           = 0;
     bool is_decomposed = false;
-    while(!is_decomposed)
+    while (!is_decomposed)
     {
         const auto decomposed_vector = arm_compute::helpers::fft::decompose_stages(N++, supported_radix);
         is_decomposed                = !decomposed_vector.empty();
-        if(!is_decomposed)
+        if (!is_decomposed)
         {
             ++pad;
         }
@@ -102,8 +103,13 @@
 }
 NEFFTConvolutionLayer::~NEFFTConvolutionLayer() = default;
 
-void NEFFTConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info,
-                                      const ActivationLayerInfo &act_info, bool enable_fast_math)
+void NEFFTConvolutionLayer::configure(ITensor                   *input,
+                                      const ITensor             *weights,
+                                      const ITensor             *biases,
+                                      ITensor                   *output,
+                                      const PadStrideInfo       &conv_info,
+                                      const ActivationLayerInfo &act_info,
+                                      bool                       enable_fast_math)
 {
     ARM_COMPUTE_UNUSED(enable_fast_math);
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, conv_info, act_info, enable_fast_math);
@@ -115,21 +121,24 @@
     _has_bias = biases != nullptr;
 
     // Get indices for the width and height
-    const size_t idx_width  = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
-    const size_t idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
+    const size_t idx_width = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height =
+        get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
 
     // Input shape, kernel size and output tile
-    const Size2D input_dims  = Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
-    const Size2D kernel_size = Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
-    const Size2D pad_valid   = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
-                                      pad_decomposable(input_dims.y() + kernel_size.y() - 1));
+    const Size2D input_dims =
+        Size2D(input->info()->tensor_shape()[idx_width], input->info()->tensor_shape()[idx_height]);
+    const Size2D kernel_size =
+        Size2D(weights->info()->tensor_shape()[idx_width], weights->info()->tensor_shape()[idx_height]);
+    const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
+                                    pad_decomposable(input_dims.y() + kernel_size.y() - 1));
     // Tensors to use
     ITensor       *input_to_use   = input;
     const ITensor *weights_to_use = weights;
     ITensor       *output_to_use  = _has_bias ? &_bias_output : output;
 
     // Permute bias
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         _permute_bias_func.configure(biases, &_permuted_bias, PermutationVector(1U, 2U, 0U));
         _permuted_bias.info()->set_data_layout(DataLayout::NCHW);
@@ -137,7 +146,7 @@
 
     // Permute input if needed
     _needs_permute = input->info()->data_layout() == DataLayout::NHWC;
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _memory_group.manage(&_permuted_input);
         // Configure the function to transform the input tensor from NHWC -> NCHW
@@ -158,7 +167,7 @@
     _flip_weights_func.configure(weights_to_use, &_flipped_weights, &_flip_axis);
 
     // Pad weights
-    const PaddingList padding_w = { { 0, input_dims.x() + pad_valid.x() - 1 }, { 0, input_dims.y() + pad_valid.y() - 1 } };
+    const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}};
     _pad_weights_func.configure(&_flipped_weights, &_padded_weights, padding_w);
 
     // Transform weights
@@ -166,10 +175,10 @@
     _transform_weights_func->configure(&_padded_weights, &_transformed_weights, FFT2DInfo());
 
     // Pad input
-    const PaddingList padding_in = { { 0, kernel_size.x() + pad_valid.x() - 1 }, { 0, kernel_size.y() + pad_valid.y() - 1 } };
+    const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}};
     _memory_group.manage(&_padded_input);
     _pad_input_func.configure(input_to_use, &_padded_input, padding_in);
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permuted_input.allocator()->allocate();
     }
@@ -193,7 +202,8 @@
     _memory_group.manage(&_itransformed_output);
     FFT2DInfo itranform_info;
     itranform_info.direction = FFTDirection::Inverse;
-    _itransformed_output.allocator()->init(_output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
+    _itransformed_output.allocator()->init(
+        _output_reduced.info()->clone()->set_is_resizable(true).set_num_channels(1).reset_padding());
     _itransform_output_func.configure(&_output_reduced, &_itransformed_output, itranform_info);
     _output_reduced.allocator()->allocate();
 
@@ -205,26 +215,29 @@
     // Extract correct region
     const int start_left = kernel_size.x() - conv_info.pad_left() - 1;
     const int start_top  = kernel_size.y() - conv_info.pad_top() - 1;
-    const int end_right  = _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
-    const int end_botton = _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
-    if(_has_bias)
+    const int end_right =
+        _reshaped_output.info()->tensor_shape().x() - (kernel_size.x() - conv_info.pad_right() - 1) - pad_valid.x();
+    const int end_botton =
+        _reshaped_output.info()->tensor_shape().y() - (kernel_size.y() - conv_info.pad_bottom() - 1) - pad_valid.y();
+    if (_has_bias)
     {
         _memory_group.manage(&_bias_output);
     }
-    else if(_needs_permute)
+    else if (_needs_permute)
     {
         output_to_use = &_permuted_output;
         _memory_group.manage(&_permuted_output);
     }
-    _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
+    _extract_output_func.configure(&_reshaped_output, output_to_use, Coordinates(start_left, start_top),
+                                   Coordinates(end_right, end_botton));
     _reshaped_output.allocator()->allocate();
     _itransformed_output.allocator()->allocate();
 
     // Add bias
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         output_to_use = output;
-        if(_needs_permute)
+        if (_needs_permute)
         {
             output_to_use = &_permuted_output;
             _memory_group.manage(&_permuted_output);
@@ -235,7 +248,7 @@
     }
 
     // Permute output
-    if(_needs_permute)
+    if (_needs_permute)
     {
         // Configure the function to transform the convoluted output to ACL's native ordering format NCHW
         _permuted_output.info()->set_data_layout(DataLayout::NCHW);
@@ -247,7 +260,7 @@
 
     // Configure Activation Layer
     _is_activationlayer_enabled = act_info.enabled();
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activation_layer_func.configure(output, nullptr, act_info);
     }
@@ -260,8 +273,13 @@
     axis_data[1]   = 1;
 }
 
-Status NEFFTConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                       const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status NEFFTConvolutionLayer::validate(const ITensorInfo         *input,
+                                       const ITensorInfo         *weights,
+                                       const ITensorInfo         *biases,
+                                       const ITensorInfo         *output,
+                                       const PadStrideInfo       &conv_info,
+                                       const ActivationLayerInfo &act_info,
+                                       bool                       enable_fast_math)
 {
     ARM_COMPUTE_UNUSED(enable_fast_math);
 
@@ -279,11 +297,13 @@
     const auto strides = conv_info.stride();
     ARM_COMPUTE_RETURN_ERROR_ON(strides.first != strides.second && strides.first != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(kernel_size.x() != kernel_size.y());
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) || conv_info.pad_right() != (kernel_size.x() / 2));
-    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) || conv_info.pad_bottom() != (kernel_size.y() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_left() != (kernel_size.x() / 2) ||
+                                conv_info.pad_right() != (kernel_size.x() / 2));
+    ARM_COMPUTE_RETURN_ERROR_ON(conv_info.pad_top() != (kernel_size.y() / 2) ||
+                                conv_info.pad_bottom() != (kernel_size.y() / 2));
 
     // Validate biases
-    if(biases != nullptr)
+    if (biases != nullptr)
     {
         const size_t idx_channels = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
@@ -291,13 +311,14 @@
     }
 
     // Checks performed when output is configured
-    if((output != nullptr) && (output->total_size() != 0))
+    if ((output != nullptr) && (output->total_size() != 0))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
-        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) || (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
+        ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] != output->tensor_shape()[idx_height]) ||
+                                    (input->tensor_shape()[idx_width] != output->tensor_shape()[idx_width]));
 
         // Validate Activation Layer
-        if(act_info.enabled())
+        if (act_info.enabled())
         {
             ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output, nullptr, act_info));
         }
@@ -313,7 +334,7 @@
     MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Transform input
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_input_func.run();
     }
@@ -331,17 +352,17 @@
     _extract_output_func.run();
 
     // Add bias
-    if(_has_bias)
+    if (_has_bias)
     {
         _bias_add_func.run();
     }
-    if(_needs_permute)
+    if (_needs_permute)
     {
         _permute_output_func.run();
     }
 
     // Run activation layer
-    if(_is_activationlayer_enabled)
+    if (_is_activationlayer_enabled)
     {
         _activation_layer_func.run();
     }
@@ -349,10 +370,10 @@
 
 void NEFFTConvolutionLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         // Permute bias to NCHW
-        if(_original_bias != nullptr)
+        if (_original_bias != nullptr)
         {
             _permuted_bias.allocator()->allocate();
             _permute_bias_func.run();
@@ -362,7 +383,7 @@
         const ITensor *cur_weights = _original_weights;
 
         // Permute weights
-        if(_needs_permute)
+        if (_needs_permute)
         {
             ARM_COMPUTE_ERROR_ON(!cur_weights->is_used());
 
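A note on the recurring signature rewraps, using the NEFFTConvolutionLayer hunks above as
the reference: control keywords now take a space before '(', over-long declarations break
to one parameter per line with the parameter names aligned into a column, and over-long
conditions wrap at the operator. A minimal compilable sketch of that style follows;
TensorInfoStub and example_validate are hypothetical stand-ins, and options such as
ColumnLimit: 120, BinPackParameters: false and AlignConsecutiveDeclarations are
assumptions consistent with the output, not confirmed configuration.

    // Hypothetical, self-contained illustration of the wrapped-signature style.
    struct TensorInfoStub
    {
    };

    // One parameter per line once the declaration exceeds the column limit;
    // names aligned, '*' bound to the name rather than the type.
    bool example_validate(const TensorInfoStub *input,
                          const TensorInfoStub *weights,
                          const TensorInfoStub *output,
                          bool                  enable_fast_math)
    {
        // Control statements take a space before '(' in the new style.
        if (input == nullptr || weights == nullptr || output == nullptr)
        {
            return false;
        }
        return !enable_fast_math;
    }
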
diff --git a/src/runtime/NEON/functions/NEFill.cpp b/src/runtime/NEON/functions/NEFill.cpp
index 4366778..bc1d5b7 100644
--- a/src/runtime/NEON/functions/NEFill.cpp
+++ b/src/runtime/NEON/functions/NEFill.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEFill.h"
 
 #include "arm_compute/core/Validate.h"
+
 #include "src/cpu/operators/CpuFill.h"
 
 #include <utility>
@@ -32,15 +33,14 @@
 {
 struct NEFill::Impl
 {
-    ITensor                      *tensor{ nullptr };
-    std::unique_ptr<cpu::CpuFill> op{ nullptr };
+    ITensor                      *tensor{nullptr};
+    std::unique_ptr<cpu::CpuFill> op{nullptr};
 };
 
-NEFill::NEFill()
-    : _impl(std::make_unique<Impl>())
+NEFill::NEFill() : _impl(std::make_unique<Impl>())
 {
 }
-NEFill::NEFill(NEFill &&) = default;
+NEFill::NEFill(NEFill &&)            = default;
 NEFill &NEFill::operator=(NEFill &&) = default;
 NEFill::~NEFill()                    = default;
 
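The NEFill hunk isolates three smaller rules seen throughout: braced initializers lose
their interior spaces ({ nullptr } becomes {nullptr}), a short constructor pulls its
initializer list onto one line, and consecutive defaulted special members are aligned
on '='. A self-contained sketch of the same shape; Example and Payload are hypothetical
stand-ins, not library classes.

    #include <memory>

    struct Payload
    {
    };

    class Example
    {
    public:
        Example();
        Example(Example &&);
        Example &operator=(Example &&);
        ~Example();

    private:
        struct Impl
        {
            Payload                 *payload{nullptr}; // no spaces inside the braces
            std::unique_ptr<Payload> owned{nullptr};   // member names aligned in a column
        };
        std::unique_ptr<Impl> _impl;
    };

    Example::Example() : _impl(std::make_unique<Impl>())
    {
    }
    Example::Example(Example &&)            = default; // '=' aligned across the
    Example &Example::operator=(Example &&) = default; // consecutive defaulted
    Example::~Example()                     = default; // special members
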
diff --git a/src/runtime/NEON/functions/NEFillBorder.cpp b/src/runtime/NEON/functions/NEFillBorder.cpp
index d633e34..a3ab9c3 100644
--- a/src/runtime/NEON/functions/NEFillBorder.cpp
+++ b/src/runtime/NEON/functions/NEFillBorder.cpp
@@ -25,17 +25,20 @@
 
 #include "arm_compute/core/Window.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEFillBorderKernel.h"
 
 namespace arm_compute
 {
-NEFillBorder::NEFillBorder()
-    : _border_handler(nullptr)
+NEFillBorder::NEFillBorder() : _border_handler(nullptr)
 {
 }
 
-void NEFillBorder::configure(ITensor *input, unsigned int border_width, BorderMode border_mode, const PixelValue &constant_border_value)
+void NEFillBorder::configure(ITensor          *input,
+                             unsigned int      border_width,
+                             BorderMode        border_mode,
+                             const PixelValue &constant_border_value)
 {
     ARM_COMPUTE_LOG_PARAMS(input, border_width, border_mode, constant_border_value);
     _border_handler = std::make_unique<NEFillBorderKernel>();
diff --git a/src/runtime/NEON/functions/NEFlattenLayer.cpp b/src/runtime/NEON/functions/NEFlattenLayer.cpp
index f435842..56db2be 100644
--- a/src/runtime/NEON/functions/NEFlattenLayer.cpp
+++ b/src/runtime/NEON/functions/NEFlattenLayer.cpp
@@ -24,8 +24,9 @@
 #include "arm_compute/runtime/NEON/functions/NEFlattenLayer.h"
 
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/cpu/operators/CpuFlatten.h"
 
@@ -33,16 +34,15 @@
 {
 struct NEFlattenLayer::Impl
 {
-    const ITensor                   *src{ nullptr };
-    ITensor                         *dst{ nullptr };
-    std::unique_ptr<cpu::CpuFlatten> op{ nullptr };
+    const ITensor                   *src{nullptr};
+    ITensor                         *dst{nullptr};
+    std::unique_ptr<cpu::CpuFlatten> op{nullptr};
 };
 
-NEFlattenLayer::NEFlattenLayer()
-    : _impl(std::make_unique<Impl>())
+NEFlattenLayer::NEFlattenLayer() : _impl(std::make_unique<Impl>())
 {
 }
-NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&) = default;
+NEFlattenLayer::NEFlattenLayer(NEFlattenLayer &&)            = default;
 NEFlattenLayer &NEFlattenLayer::operator=(NEFlattenLayer &&) = default;
 NEFlattenLayer::~NEFlattenLayer()                            = default;
 
@@ -51,7 +51,8 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     _impl->src = input;
     _impl->dst = output;
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input->info())));
+    auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(
+                                            misc::shape_calculator::compute_flatten_shape(input->info())));
 
     _impl->op = std::make_unique<cpu::CpuFlatten>();
     _impl->op->configure(_impl->src->info(), _impl->dst->info());
@@ -60,9 +61,10 @@
 Status NEFlattenLayer::validate(const ITensorInfo *input, const ITensorInfo *output)
 {
     // Checks performed when output is configured
-    if(output->total_size() != 0)
+    if (output->total_size() != 0)
     {
-        const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
+        const TensorInfo tensor_info_output =
+            input->clone()->set_tensor_shape(misc::shape_calculator::compute_flatten_shape(input));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
     }
     return cpu::CpuFlatten::validate(input, output);
diff --git a/src/runtime/NEON/functions/NEFloor.cpp b/src/runtime/NEON/functions/NEFloor.cpp
index d2dc48a..112c93c 100644
--- a/src/runtime/NEON/functions/NEFloor.cpp
+++ b/src/runtime/NEON/functions/NEFloor.cpp
@@ -24,22 +24,22 @@
 #include "arm_compute/runtime/NEON/functions/NEFloor.h"
 
 #include "arm_compute/core/Validate.h"
+
 #include "src/cpu/operators/CpuFloor.h"
 
 namespace arm_compute
 {
 struct NEFloor::Impl
 {
-    const ITensor                 *src{ nullptr };
-    ITensor                       *dst{ nullptr };
-    std::unique_ptr<cpu::CpuFloor> op{ nullptr };
+    const ITensor                 *src{nullptr};
+    ITensor                       *dst{nullptr};
+    std::unique_ptr<cpu::CpuFloor> op{nullptr};
 };
 
-NEFloor::NEFloor()
-    : _impl(std::make_unique<Impl>())
+NEFloor::NEFloor() : _impl(std::make_unique<Impl>())
 {
 }
-NEFloor::NEFloor(NEFloor &&) = default;
+NEFloor::NEFloor(NEFloor &&)            = default;
 NEFloor &NEFloor::operator=(NEFloor &&) = default;
 NEFloor::~NEFloor()                     = default;
 
diff --git a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
index 891487e..2656d0f 100644
--- a/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
+++ b/src/runtime/NEON/functions/NEFullyConnectedLayer.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/functions/NEConvertFullyConnectedWeights.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/operators/CpuFullyConnected.h"
@@ -38,80 +39,90 @@
 struct NEFullyConnectedLayer::Impl
 {
     MemoryGroup      memory_group{};
-    IWeightsManager *weights_manager{ nullptr };
+    IWeightsManager *weights_manager{nullptr};
 
-    std::unique_ptr<cpu::CpuFullyConnected> op{ nullptr };
+    std::unique_ptr<cpu::CpuFullyConnected> op{nullptr};
 
-    const ITensor *original_weights{ nullptr };
+    const ITensor *original_weights{nullptr};
 
     ITensorPack                      run_pack{};
     WorkspaceData<Tensor>            workspace{};
     experimental::MemoryRequirements aux_mem_req{};
 
-    bool is_prepared{ false };
-    bool dynamic_weights{ false };
+    bool is_prepared{false};
+    bool dynamic_weights{false};
 };
 
 NEFullyConnectedLayer::~NEFullyConnectedLayer() = default;
 
-NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+NEFullyConnectedLayer::NEFullyConnectedLayer(std::shared_ptr<IMemoryManager> memory_manager,
+                                             IWeightsManager                *weights_manager)
     : _impl(std::make_unique<Impl>())
 {
     _impl->memory_group    = MemoryGroup(std::move(memory_manager));
     _impl->weights_manager = weights_manager;
 }
 
-void NEFullyConnectedLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output,
-                                      FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info)
+void NEFullyConnectedLayer::configure(const ITensor          *input,
+                                      const ITensor          *weights,
+                                      const ITensor          *biases,
+                                      ITensor                *output,
+                                      FullyConnectedLayerInfo fc_info,
+                                      const WeightsInfo      &weights_info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(),
-                                                               weights->info(),
+    ARM_COMPUTE_ERROR_THROW_ON(NEFullyConnectedLayer::validate(input->info(), weights->info(),
                                                                biases != nullptr ? biases->info() : nullptr,
-                                                               output->info(),
-                                                               fc_info,
-                                                               weights_info));
+                                                               output->info(), fc_info, weights_info));
     ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, fc_info);
 
     _impl->op               = std::make_unique<cpu::CpuFullyConnected>();
     _impl->original_weights = weights;
     _impl->is_prepared      = false;
 
-    _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), fc_info, weights_info);
+    _impl->op->configure(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(),
+                         fc_info, weights_info);
 
-    if(_impl->weights_manager != nullptr)
+    if (_impl->weights_manager != nullptr)
     {
         _impl->weights_manager->manage(_impl->original_weights);
     }
 
     _impl->aux_mem_req = _impl->op->workspace();
-    _impl->run_pack    = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
-    _impl->workspace   = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
+    _impl->run_pack    = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+    _impl->workspace =
+        manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
 
-    _impl->dynamic_weights =
-        !weights->info()->are_values_constant() &&
-        fc_info.transpose_weights &&
-        !fc_info.are_weights_reshaped &&
-        !fc_info.retain_internal_weights;
+    _impl->dynamic_weights = !weights->info()->are_values_constant() && fc_info.transpose_weights &&
+                             !fc_info.are_weights_reshaped && !fc_info.retain_internal_weights;
 }
 
-Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *input, const ITensorInfo *weights,
-                                           const ITensorInfo *biases, const ITensorInfo *output, const FullyConnectedLayerInfo &fc_info,
-                                           const WeightsInfo &weights_info)
+Status NEFullyConnectedLayer::has_opt_impl(arm_compute::WeightFormat     &expected_weight_format,
+                                           const ITensorInfo             *input,
+                                           const ITensorInfo             *weights,
+                                           const ITensorInfo             *biases,
+                                           const ITensorInfo             *output,
+                                           const FullyConnectedLayerInfo &fc_info,
+                                           const WeightsInfo             &weights_info)
 {
-    return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info, weights_info);
+    return cpu::CpuFullyConnected::has_opt_impl(expected_weight_format, input, weights, biases, output, fc_info,
+                                                weights_info);
 }
 
-Status NEFullyConnectedLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
-                                       FullyConnectedLayerInfo fc_info, const WeightsInfo &weights_info)
+Status NEFullyConnectedLayer::validate(const ITensorInfo      *input,
+                                       const ITensorInfo      *weights,
+                                       const ITensorInfo      *biases,
+                                       const ITensorInfo      *output,
+                                       FullyConnectedLayerInfo fc_info,
+                                       const WeightsInfo      &weights_info)
 {
     return cpu::CpuFullyConnected::validate(input, weights, biases, output, fc_info, weights_info);
 }
 
 void NEFullyConnectedLayer::run()
 {
-    if(!_impl->dynamic_weights)
+    if (!_impl->dynamic_weights)
     {
         prepare();
     }
@@ -122,7 +133,7 @@
 
 void NEFullyConnectedLayer::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->run_pack);
 
@@ -131,13 +142,13 @@
         _impl->is_prepared = true;
 
         // Handle the weights-manager infrastructure
-        if(_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
+        if (_impl->weights_manager != nullptr && _impl->weights_manager->are_weights_managed(_impl->original_weights))
         {
             // Ensure that b is marked as unused (and its memory released) only after the last function that uses b has finished its prepare step.
             // This covers cases where multiple functions share the same b (weights).
             // Therefore, when a function marks the original b as unused, we pre-mark it in the weights manager and then mark it as used again, so that it is not released before its last reference is gone.
             const ITensor *original_b = _impl->original_weights;
-            if(!original_b->is_used())
+            if (!original_b->is_used())
             {
                 _impl->weights_manager->pre_mark_as_unused(original_b);
             }
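
The comment block in NEFullyConnectedLayer::prepare() above describes a deferred-release
dance that is easy to misread: a b shared by several functions may already have been
flagged unused by an earlier sharer, so the current function pre-marks it in the weights
manager and flips it back to used until the final reference is dropped. A hypothetical,
self-contained sketch of that pattern follows; TensorStub and WeightsManagerStub are
stand-ins, and the trailing release() step is an assumption about code outside this hunk.

    // Stand-in for an ITensor that tracks whether anything still uses it.
    struct TensorStub
    {
        bool used{true};
        bool is_used() const
        {
            return used;
        }
        void mark_as_used()
        {
            used = true;
        }
    };

    // Stand-in for the weights manager: pre-marking defers the actual release.
    struct WeightsManagerStub
    {
        void pre_mark_as_unused(const TensorStub *)
        {
        }
        void release(const TensorStub *)
        {
        }
    };

    void finish_prepare(WeightsManagerStub &wm, TensorStub &original_b)
    {
        // An earlier sharer may already have marked b unused: defer the release...
        if (!original_b.is_used())
        {
            wm.pre_mark_as_unused(&original_b);
        }
        // ...and keep b nominally used until its last reference is released.
        original_b.mark_as_used();
        wm.release(&original_b);
    }
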
diff --git a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
index 6612845..f5b8b57 100644
--- a/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
+++ b/src/runtime/NEON/functions/NEFuseBatchNormalization.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEFuseBatchNormalizationKernel.h"
 
@@ -35,29 +36,42 @@
 {
 NEFuseBatchNormalization::~NEFuseBatchNormalization() = default;
 
-NEFuseBatchNormalization::NEFuseBatchNormalization()
-    : _fuse_bn_kernel()
+NEFuseBatchNormalization::NEFuseBatchNormalization() : _fuse_bn_kernel()
 {
 }
 
-void NEFuseBatchNormalization::configure(const ITensor *input_weights, const ITensor *bn_mean, const ITensor *bn_var,
-                                         ITensor *fused_weights, ITensor *fused_bias,
-                                         const ITensor *input_bias, const ITensor *bn_beta, const ITensor *bn_gamma,
-                                         float epsilon, FuseBatchNormalizationType fbn_type)
+void NEFuseBatchNormalization::configure(const ITensor             *input_weights,
+                                         const ITensor             *bn_mean,
+                                         const ITensor             *bn_var,
+                                         ITensor                   *fused_weights,
+                                         ITensor                   *fused_bias,
+                                         const ITensor             *input_bias,
+                                         const ITensor             *bn_beta,
+                                         const ITensor             *bn_gamma,
+                                         float                      epsilon,
+                                         FuseBatchNormalizationType fbn_type)
 {
-    ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias,
-                           bn_beta, bn_gamma, epsilon, fbn_type);
+    ARM_COMPUTE_LOG_PARAMS(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+                           epsilon, fbn_type);
 
     _fuse_bn_kernel = std::make_unique<NEFuseBatchNormalizationKernel>();
-    _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    _fuse_bn_kernel->configure(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma,
+                               epsilon, fbn_type);
 }
 
-Status NEFuseBatchNormalization::validate(const ITensorInfo *input_weights, const ITensorInfo *bn_mean, const ITensorInfo *bn_var,
-                                          const ITensorInfo *fused_weights, const ITensorInfo *fused_bias,
-                                          const ITensorInfo *input_bias, const ITensorInfo *bn_beta, const ITensorInfo *bn_gamma,
-                                          float epsilon, FuseBatchNormalizationType fbn_type)
+Status NEFuseBatchNormalization::validate(const ITensorInfo         *input_weights,
+                                          const ITensorInfo         *bn_mean,
+                                          const ITensorInfo         *bn_var,
+                                          const ITensorInfo         *fused_weights,
+                                          const ITensorInfo         *fused_bias,
+                                          const ITensorInfo         *input_bias,
+                                          const ITensorInfo         *bn_beta,
+                                          const ITensorInfo         *bn_gamma,
+                                          float                      epsilon,
+                                          FuseBatchNormalizationType fbn_type)
 {
-    return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias, input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
+    return NEFuseBatchNormalizationKernel::validate(input_weights, bn_mean, bn_var, fused_weights, fused_bias,
+                                                    input_bias, bn_beta, bn_gamma, epsilon, fbn_type);
 }
 
 void NEFuseBatchNormalization::run()
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index e51f2f9..934a825 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/operators/CpuGemm.h"
@@ -39,12 +40,12 @@
 struct NEGEMM::Impl
 {
     MemoryGroup      memory_group{};
-    IWeightsManager *weights_manager{ nullptr };
+    IWeightsManager *weights_manager{nullptr};
 
-    std::unique_ptr<cpu::CpuGemm> op{ nullptr };
+    std::unique_ptr<cpu::CpuGemm> op{nullptr};
 
-    const ITensor *original_b{ nullptr };
-    bool           is_prepared{ false };
+    const ITensor *original_b{nullptr};
+    bool           is_prepared{false};
 
     ITensorPack                      run_pack{};
     ITensorPack                      prep_pack{};
@@ -61,10 +62,17 @@
 
 NEGEMM::~NEGEMM() = default;
 
-void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, float alpha, float beta, const GEMMInfo &gemm_info)
+void NEGEMM::configure(const ITensor  *a,
+                       const ITensor  *b,
+                       const ITensor  *c,
+                       ITensor        *d,
+                       float           alpha,
+                       float           beta,
+                       const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
-    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info));
+    ARM_COMPUTE_ERROR_THROW_ON(cpu::CpuGemm::validate(a->info(), b->info(), (c != nullptr) ? c->info() : nullptr,
+                                                      d->info(), alpha, beta, gemm_info));
 
     // Check if we need to reshape the matrix B only on the first run
     _impl->is_prepared = false;
@@ -73,24 +81,32 @@
 
     // Mark the B matrix's values as dynamic (non-constant).
     auto b_info_to_use = b->info()->clone();
-    if(!gemm_info.reshape_b_only_on_first_run())
+    if (!gemm_info.reshape_b_only_on_first_run())
     {
         b_info_to_use->set_are_values_constant(false);
     }
 
-    _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta, gemm_info);
+    _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr) ? c->info() : nullptr, d->info(), alpha, beta,
+                         gemm_info);
 
     _impl->aux_mem_req = _impl->op->workspace();
-    _impl->run_pack    = { { ACL_SRC_0, a }, { ACL_SRC_1, b }, { ACL_SRC_2, c }, { ACL_DST, d } };
-    _impl->prep_pack   = { { ACL_SRC_1, b }, { ACL_SRC_2, c } };
-    _impl->workspace   = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+    _impl->run_pack    = {{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_SRC_2, c}, {ACL_DST, d}};
+    _impl->prep_pack   = {{ACL_SRC_1, b}, {ACL_SRC_2, c}};
+    _impl->workspace =
+        manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
 }
 
-Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info)
+Status NEGEMM::validate(const ITensorInfo *a,
+                        const ITensorInfo *b,
+                        const ITensorInfo *c,
+                        const ITensorInfo *output,
+                        float              alpha,
+                        float              beta,
+                        const GEMMInfo    &gemm_info)
 {
     // Mark the B matrix's values as dynamic (non-constant).
     auto b_to_use = b->clone();
-    if(!gemm_info.reshape_b_only_on_first_run())
+    if (!gemm_info.reshape_b_only_on_first_run())
     {
         b_to_use->set_are_values_constant(false);
     }
@@ -98,8 +114,14 @@
     return cpu::CpuGemm::validate(a, b_to_use.get(), c, output, alpha, beta, gemm_info);
 }
 
-Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output,
-                            float alpha, float beta, const GEMMInfo &gemm_info)
+Status NEGEMM::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+                            const ITensorInfo         *a,
+                            const ITensorInfo         *b,
+                            const ITensorInfo         *c,
+                            const ITensorInfo         *output,
+                            float                      alpha,
+                            float                      beta,
+                            const GEMMInfo            &gemm_info)
 {
     ARM_COMPUTE_UNUSED(alpha, beta);
     return cpu::CpuGemm::has_opt_impl(expected_weight_format, a, b, c, output, gemm_info);
@@ -115,15 +137,15 @@
 
 void NEGEMM::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->prep_pack);
 
-        auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
-                                        _impl->aux_mem_req.end(),
-                                        [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+        auto has_reshape =
+            std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+                         [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
 
-        if(has_reshape != std::end(_impl->aux_mem_req))
+        if (has_reshape != std::end(_impl->aux_mem_req))
         {
             _impl->original_b->mark_as_unused();
         }
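
The prepare() hunk above rewraps a std::find_if call, and the same pattern recurs in the
functions that follow: the iterator arguments keep the first line, the lambda moves to a
continuation line with its one-line body intact, and "const MemoryInfo & m" loses the
space between '&' and the name. A compilable stand-alone sketch of that shape; the
MemoryInfo struct here is a local stand-in for the library type.

    #include <algorithm>
    #include <vector>

    // Local stand-in: only the field the lambda inspects.
    struct MemoryInfo
    {
        bool persistent{false};
    };

    bool has_persistent(const std::vector<MemoryInfo> &reqs)
    {
        // Call broken after the iterator arguments; lambda on the continuation line.
        auto it = std::find_if(reqs.begin(), reqs.end(),
                               [](const MemoryInfo &m) -> bool { return m.persistent; });
        return it != reqs.end();
    }
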
diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
index 42b8b70..6cca02e 100644
--- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/operators/CpuGemmDirectConv2d.h"
 
@@ -35,25 +36,25 @@
 
 struct NEGEMMConv2d::Impl
 {
-    const ITensor                   *weights{ nullptr };
-    std::unique_ptr<OperatorType>    op{ nullptr };
+    const ITensor                   *weights{nullptr};
+    std::unique_ptr<OperatorType>    op{nullptr};
     ITensorPack                      run_pack{};
     ITensorPack                      prep_pack{};
     WorkspaceData<Tensor>            workspace{};
     MemoryGroup                      memory_group{};
-    bool                             is_prepared{ false };
+    bool                             is_prepared{false};
     experimental::MemoryRequirements aux_mem_req{};
 };
 
-NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager)
-    : _impl(std::make_unique<Impl>())
+NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager) : _impl(std::make_unique<Impl>())
 {
     _impl->memory_group = MemoryGroup(memory_manager);
 }
 
 NEGEMMConv2d::~NEGEMMConv2d() = default;
 
-void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info)
+void NEGEMMConv2d::configure(
+    ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
@@ -61,15 +62,21 @@
     _impl->is_prepared = false;
     _impl->op          = std::make_unique<OperatorType>();
 
-    _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), info);
+    _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+                         info);
 
     _impl->aux_mem_req = _impl->op->workspace();
-    _impl->run_pack    = { { TensorType::ACL_SRC_0, input }, { TensorType::ACL_SRC_2, biases }, { TensorType::ACL_DST, output } };
-    _impl->prep_pack   = { { TensorType::ACL_SRC_1, weights }, { TensorType::ACL_SRC_2, biases } };
-    _impl->workspace   = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+    _impl->run_pack  = {{TensorType::ACL_SRC_0, input}, {TensorType::ACL_SRC_2, biases}, {TensorType::ACL_DST, output}};
+    _impl->prep_pack = {{TensorType::ACL_SRC_1, weights}, {TensorType::ACL_SRC_2, biases}};
+    _impl->workspace =
+        manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack, _impl->prep_pack);
 }
 
-Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info)
+Status NEGEMMConv2d::validate(const ITensorInfo *input,
+                              const ITensorInfo *weights,
+                              const ITensorInfo *biases,
+                              const ITensorInfo *output,
+                              const Conv2dInfo  &info)
 {
     return OperatorType::validate(input, weights, biases, output, info);
 }
@@ -84,15 +91,15 @@
 
 void NEGEMMConv2d::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->prep_pack);
 
-        auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
-                                        _impl->aux_mem_req.end(),
-                                        [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+        auto has_reshape =
+            std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+                         [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
 
-        if(has_reshape != std::end(_impl->aux_mem_req))
+        if (has_reshape != std::end(_impl->aux_mem_req))
         {
             _impl->weights->mark_as_unused();
         }
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index fe3ea6a..c8f65d2 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/operators/CpuGemmConv2d.h"
 
@@ -36,17 +37,18 @@
 {
 struct NEGEMMConvolutionLayer::Impl
 {
-    const ITensor                      *weights{ nullptr };
-    std::unique_ptr<cpu::CpuGemmConv2d> op{ nullptr };
+    const ITensor                      *weights{nullptr};
+    std::unique_ptr<cpu::CpuGemmConv2d> op{nullptr};
     ITensorPack                         run_pack{};
     MemoryGroup                         memory_group{};
-    IWeightsManager                    *weights_manager{ nullptr };
+    IWeightsManager                    *weights_manager{nullptr};
     MemoryRequirements                  aux_mem_req{};
     WorkspaceData<Tensor>               workspace_tensors{};
-    bool                                is_prepared{ false };
+    bool                                is_prepared{false};
 };
 
-NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, IWeightsManager *weights_manager)
+NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager,
+                                               IWeightsManager                       *weights_manager)
     : _impl(std::make_unique<Impl>())
 {
     _impl->weights_manager = weights_manager;
@@ -54,37 +56,61 @@
 }
 NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default;
 
-void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
-                                       const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+void NEGEMMConvolutionLayer::configure(const ITensor             *input,
+                                       const ITensor             *weights,
+                                       const ITensor             *biases,
+                                       ITensor                   *output,
+                                       const PadStrideInfo       &conv_info,
+                                       const WeightsInfo         &weights_info,
+                                       const Size2D              &dilation,
+                                       const ActivationLayerInfo &act_info,
+                                       bool                       enable_fast_math,
+                                       unsigned int               num_groups)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
 
     _impl->weights = weights;
     _impl->op      = std::make_unique<cpu::CpuGemmConv2d>();
-    _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+    _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(),
+                         conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
 
-    _impl->run_pack =
-    {
-        { TensorType::ACL_SRC_0, input },
-        { TensorType::ACL_SRC_1, weights },
-        { TensorType::ACL_SRC_2, biases },
-        { TensorType::ACL_DST, output }
-    };
-    _impl->aux_mem_req       = _impl->op->workspace();
-    _impl->workspace_tensors = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
+    _impl->run_pack    = {{TensorType::ACL_SRC_0, input},
+                          {TensorType::ACL_SRC_1, weights},
+                          {TensorType::ACL_SRC_2, biases},
+                          {TensorType::ACL_DST, output}};
+    _impl->aux_mem_req = _impl->op->workspace();
+    _impl->workspace_tensors =
+        manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->run_pack);
 }
 
-Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                        const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math, unsigned int num_groups)
+Status NEGEMMConvolutionLayer::validate(const ITensorInfo         *input,
+                                        const ITensorInfo         *weights,
+                                        const ITensorInfo         *biases,
+                                        const ITensorInfo         *output,
+                                        const PadStrideInfo       &conv_info,
+                                        const WeightsInfo         &weights_info,
+                                        const Size2D              &dilation,
+                                        const ActivationLayerInfo &act_info,
+                                        bool                       enable_fast_math,
+                                        unsigned int               num_groups)
 {
-    return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, enable_fast_math, num_groups);
+    return cpu::CpuGemmConv2d::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info,
+                                        enable_fast_math, num_groups);
 }
 
-Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format, const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
-                                            const PadStrideInfo &conv_info,
-                                            const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, const bool enable_fast_math)
+Status NEGEMMConvolutionLayer::has_opt_impl(arm_compute::WeightFormat &expected_weight_format,
+                                            const ITensorInfo         *src,
+                                            const ITensorInfo         *weights,
+                                            const ITensorInfo         *biases,
+                                            const ITensorInfo         *dst,
+                                            const PadStrideInfo       &conv_info,
+                                            const WeightsInfo         &weights_info,
+                                            const Size2D              &dilation,
+                                            const ActivationLayerInfo &act_info,
+                                            const bool                 enable_fast_math)
 {
-    return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info, dilation, act_info, enable_fast_math);
+    return cpu::CpuGemmConv2d::has_opt_impl(expected_weight_format, src, weights, biases, dst, conv_info, weights_info,
+                                            dilation, act_info, enable_fast_math);
 }
 
 void NEGEMMConvolutionLayer::run()
@@ -96,7 +122,7 @@
 
 void NEGEMMConvolutionLayer::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->run_pack);
 
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 453d3ced..44bfc6a 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -29,8 +29,8 @@
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/Tensor.h"
-#include "src/core/helpers/MemoryHelpers.h"
 
+#include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
 
 using namespace arm_compute::experimental;
@@ -39,18 +39,19 @@
 {
 struct NEGEMMLowpMatrixMultiplyCore::Impl
 {
-    const ITensor                                      *b{ nullptr };
-    std::unique_ptr<cpu::CpuGemmLowpMatrixMultiplyCore> op{ nullptr };
+    const ITensor                                      *b{nullptr};
+    std::unique_ptr<cpu::CpuGemmLowpMatrixMultiplyCore> op{nullptr};
     ITensorPack                                         run_pack{};
     ITensorPack                                         prep_pack{};
     MemoryGroup                                         memory_group{};
-    IWeightsManager                                    *weights_manager{ nullptr };
+    IWeightsManager                                    *weights_manager{nullptr};
     MemoryRequirements                                  aux_mem_req{};
     WorkspaceData<Tensor>                               workspace_tensors{};
-    bool                                                is_prepared{ false };
+    bool                                                is_prepared{false};
 };
 
-NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager, IWeightsManager *weights_manager)
+NEGEMMLowpMatrixMultiplyCore::NEGEMMLowpMatrixMultiplyCore(std::shared_ptr<IMemoryManager> memory_manager,
+                                                           IWeightsManager                *weights_manager)
     : _impl(std::make_unique<Impl>())
 {
     _impl->weights_manager = weights_manager;
@@ -58,41 +59,41 @@
 }
 NEGEMMLowpMatrixMultiplyCore::~NEGEMMLowpMatrixMultiplyCore() = default;
 
-void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
+void NEGEMMLowpMatrixMultiplyCore::configure(
+    const ITensor *a, const ITensor *b, const ITensor *c, ITensor *output, const GEMMInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output);
 
     // Mark the B matrix's values as dynamic (non-constant).
     auto b_info_to_use = b->info()->clone();
-    if(!gemm_info.reshape_b_only_on_first_run())
+    if (!gemm_info.reshape_b_only_on_first_run())
     {
         b_info_to_use->set_are_values_constant(false);
     }
 
     _impl->b  = b;
     _impl->op = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>();
-    _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(), gemm_info);
-    _impl->run_pack =
-    {
-        { TensorType::ACL_SRC_0, a },
-        { TensorType::ACL_SRC_1, b },
-        { TensorType::ACL_SRC_2, c },
-        { TensorType::ACL_DST, output }
-    };
-    _impl->prep_pack =
-    {
-        { TensorType::ACL_SRC_1, b },
-        { TensorType::ACL_SRC_2, c }
-    };
-    _impl->aux_mem_req       = _impl->op->workspace();
-    _impl->workspace_tensors = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+    _impl->op->configure(a->info(), b_info_to_use.get(), (c != nullptr ? c->info() : nullptr), output->info(),
+                         gemm_info);
+    _impl->run_pack    = {{TensorType::ACL_SRC_0, a},
+                          {TensorType::ACL_SRC_1, b},
+                          {TensorType::ACL_SRC_2, c},
+                          {TensorType::ACL_DST, output}};
+    _impl->prep_pack   = {{TensorType::ACL_SRC_1, b}, {TensorType::ACL_SRC_2, c}};
+    _impl->aux_mem_req = _impl->op->workspace();
+    _impl->workspace_tensors =
+        manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
 }
 
-Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info)
+Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a,
+                                              const ITensorInfo *b,
+                                              const ITensorInfo *c,
+                                              const ITensorInfo *output,
+                                              const GEMMInfo    &gemm_info)
 {
     // Mark the B matrix's values as dynamic (non-constant).
     auto b_info_to_use = b->clone();
-    if(!gemm_info.reshape_b_only_on_first_run())
+    if (!gemm_info.reshape_b_only_on_first_run())
     {
         b_info_to_use->set_are_values_constant(false);
     }
@@ -109,15 +110,15 @@
 
 void NEGEMMLowpMatrixMultiplyCore::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->prep_pack);
 
-        auto has_reshape = std::find_if(_impl->aux_mem_req.begin(),
-                                        _impl->aux_mem_req.end(),
-                                        [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
+        auto has_reshape =
+            std::find_if(_impl->aux_mem_req.begin(), _impl->aux_mem_req.end(),
+                         [](const MemoryInfo &m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
 
-        if(has_reshape != std::end(_impl->aux_mem_req))
+        if (has_reshape != std::end(_impl->aux_mem_req))
         {
             _impl->b->mark_as_unused();
         }
diff --git a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
index 7e1de3c..8178003 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpOutputStage.cpp
@@ -25,45 +25,48 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
+
 #include "src/cpu/operators/CpuGemmLowpOutputStage.h"
 
 namespace arm_compute
 {
 struct NEGEMMLowpOutputStage::Impl
 {
-    const ITensor                               *src{ nullptr };
-    const ITensor                               *bias{ nullptr };
-    ITensor                                     *dst{ nullptr };
+    const ITensor                               *src{nullptr};
+    const ITensor                               *bias{nullptr};
+    ITensor                                     *dst{nullptr};
     ITensorPack                                  run_pack{};
-    std::unique_ptr<cpu::CpuGemmLowpOutputStage> op{ nullptr };
+    std::unique_ptr<cpu::CpuGemmLowpOutputStage> op{nullptr};
 };
 
-NEGEMMLowpOutputStage::NEGEMMLowpOutputStage()
-    : _impl(std::make_unique<Impl>())
+NEGEMMLowpOutputStage::NEGEMMLowpOutputStage() : _impl(std::make_unique<Impl>())
 {
 }
 NEGEMMLowpOutputStage::~NEGEMMLowpOutputStage() = default;
 
-void NEGEMMLowpOutputStage::configure(const ITensor *input, const ITensor *bias, ITensor *output, const GEMMLowpOutputStageInfo &info)
+void NEGEMMLowpOutputStage::configure(const ITensor                 *input,
+                                      const ITensor                 *bias,
+                                      ITensor                       *output,
+                                      const GEMMLowpOutputStageInfo &info)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(
+        NEGEMMLowpOutputStage::validate(input->info(), bias != nullptr ? bias->info() : nullptr, output->info(), info));
     _impl->src  = input;
     _impl->bias = bias;
     _impl->dst  = output;
     _impl->op   = std::make_unique<cpu::CpuGemmLowpOutputStage>();
     _impl->op->configure(input->info(), (bias == nullptr) ? nullptr : bias->info(), output->info(), info);
 
-    _impl->run_pack =
-    {
-        { TensorType::ACL_SRC, _impl->src },
-        { TensorType::ACL_BIAS, _impl->bias },
-        { TensorType::ACL_DST, _impl->dst }
-    };
+    _impl->run_pack = {
+        {TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_BIAS, _impl->bias}, {TensorType::ACL_DST, _impl->dst}};
 }
 
-Status NEGEMMLowpOutputStage::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo &info)
+Status NEGEMMLowpOutputStage::validate(const ITensorInfo             *input,
+                                       const ITensorInfo             *bias,
+                                       const ITensorInfo             *output,
+                                       const GEMMLowpOutputStageInfo &info)
 {
     return cpu::CpuGemmLowpOutputStage::validate(input, bias, output, info);
 }
diff --git a/src/runtime/NEON/functions/NEGather.cpp b/src/runtime/NEON/functions/NEGather.cpp
index f5d19c7..62b8cfa 100644
--- a/src/runtime/NEON/functions/NEGather.cpp
+++ b/src/runtime/NEON/functions/NEGather.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEGather.h"
 
-#include "src/core/NEON/kernels/NEGatherKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEGatherKernel.h"
 
 #include <utility>
 
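The NEGather hunk is pure include reordering, and the same arrangement recurs across the
files above: public arm_compute/ headers first, then a blank-line-separated group of
internal src/ headers, then system headers, with each group sorted case-insensitively
(which is why utils/misc/ShapeCalculator.h sorts before Validate.h). A sketch of how an
include block in this tree ends up; grouping options such as IncludeBlocks: Regroup and
SortIncludes: CaseInsensitive are assumptions consistent with the output.

    // Public API headers, sorted case-insensitively.
    #include "arm_compute/core/utils/misc/ShapeCalculator.h"
    #include "arm_compute/core/Validate.h"

    // Internal headers form their own block.
    #include "src/common/utils/Log.h"
    #include "src/core/NEON/kernels/NEGatherKernel.h"

    // System headers come last.
    #include <utility>

Keeping the groups and their ordering machine-enforced means later includes land in a
deterministic spot, so follow-up diffs stay small.
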
diff --git a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
index 1c0e736..1022b41 100644
--- a/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
+++ b/src/runtime/NEON/functions/NEGenerateProposalsLayer.cpp
@@ -25,11 +25,12 @@
 
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/NEON/kernels/NEFillBorderKernel.h"
 #include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
 #include "src/core/NEON/kernels/NEPadLayerKernel.h"
-#include "src/core/helpers/AutoConfiguration.h"
 
 namespace arm_compute
 {
@@ -68,42 +69,55 @@
 
 NEGenerateProposalsLayer::~NEGenerateProposalsLayer() = default;
 
-void NEGenerateProposalsLayer::configure(const ITensor *scores, const ITensor *deltas, const ITensor *anchors, ITensor *proposals, ITensor *scores_out, ITensor *num_valid_proposals,
+void NEGenerateProposalsLayer::configure(const ITensor               *scores,
+                                         const ITensor               *deltas,
+                                         const ITensor               *anchors,
+                                         ITensor                     *proposals,
+                                         ITensor                     *scores_out,
+                                         ITensor                     *num_valid_proposals,
                                          const GenerateProposalsInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
-    ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(), proposals->info(), scores_out->info(), num_valid_proposals->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(NEGenerateProposalsLayer::validate(scores->info(), deltas->info(), anchors->info(),
+                                                                  proposals->info(), scores_out->info(),
+                                                                  num_valid_proposals->info(), info));
     ARM_COMPUTE_LOG_PARAMS(scores, deltas, anchors, proposals, scores_out, num_valid_proposals, info);
 
     _is_nhwc                        = scores->info()->data_layout() == DataLayout::NHWC;
     const DataType scores_data_type = scores->info()->data_type();
     _is_qasymm8                     = scores_data_type == DataType::QASYMM8;
-    const int    num_anchors        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
-    const int    feat_width         = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
-    const int    feat_height        = scores->info()->dimension(get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
-    const int    total_num_anchors  = num_anchors * feat_width * feat_height;
-    const int    pre_nms_topN       = info.pre_nms_topN();
-    const int    post_nms_topN      = info.post_nms_topN();
-    const size_t values_per_roi     = info.values_per_roi();
+    const int num_anchors           = scores->info()->dimension(
+                  get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::CHANNEL));
+    const int feat_width = scores->info()->dimension(
+        get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::WIDTH));
+    const int feat_height = scores->info()->dimension(
+        get_data_layout_dimension_index(scores->info()->data_layout(), DataLayoutDimension::HEIGHT));
+    const int    total_num_anchors = num_anchors * feat_width * feat_height;
+    const int    pre_nms_topN      = info.pre_nms_topN();
+    const int    post_nms_topN     = info.post_nms_topN();
+    const size_t values_per_roi    = info.values_per_roi();
 
     const QuantizationInfo scores_qinfo   = scores->info()->quantization_info();
     const DataType         rois_data_type = (_is_qasymm8) ? DataType::QASYMM16 : scores_data_type;
-    const QuantizationInfo rois_qinfo     = (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
+    const QuantizationInfo rois_qinfo =
+        (_is_qasymm8) ? QuantizationInfo(0.125f, 0) : scores->info()->quantization_info();
 
     // Compute all the anchors
     _memory_group.manage(&_all_anchors);
     _compute_anchors = std::make_unique<NEComputeAllAnchorsKernel>();
-    _compute_anchors->configure(anchors, &_all_anchors, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
+    _compute_anchors->configure(anchors, &_all_anchors,
+                                ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale()));
 
     const TensorShape flatten_shape_deltas(values_per_roi, total_num_anchors);
-    _deltas_flattened.allocator()->init(TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
+    _deltas_flattened.allocator()->init(
+        TensorInfo(flatten_shape_deltas, 1, scores_data_type, deltas->info()->quantization_info()));
 
     // Permute and reshape deltas
     _memory_group.manage(&_deltas_flattened);
-    if(!_is_nhwc)
+    if (!_is_nhwc)
     {
         _memory_group.manage(&_deltas_permuted);
-        _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{ 2, 0, 1 });
+        _permute_deltas.configure(deltas, &_deltas_permuted, PermutationVector{2, 0, 1});
         _flatten_deltas.configure(&_deltas_permuted, &_deltas_flattened);
         _deltas_permuted.allocator()->allocate();
     }
@@ -117,10 +131,10 @@
 
     // Permute and reshape scores
     _memory_group.manage(&_scores_flattened);
-    if(!_is_nhwc)
+    if (!_is_nhwc)
     {
         _memory_group.manage(&_scores_permuted);
-        _permute_scores.configure(scores, &_scores_permuted, PermutationVector{ 2, 0, 1 });
+        _permute_scores.configure(scores, &_scores_permuted, PermutationVector{2, 0, 1});
         _flatten_scores.configure(&_scores_permuted, &_scores_flattened);
         _scores_permuted.allocator()->allocate();
     }
@@ -131,7 +145,7 @@
 
     Tensor *anchors_to_use = &_all_anchors;
     Tensor *deltas_to_use  = &_deltas_flattened;
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _all_anchors_f32.allocator()->init(TensorInfo(_all_anchors.info()->tensor_shape(), 1, DataType::F32));
         _deltas_flattened_f32.allocator()->init(TensorInfo(_deltas_flattened.info()->tensor_shape(), 1, DataType::F32));
@@ -154,11 +168,12 @@
     anchors_to_use->allocator()->allocate();
 
     _all_proposals_to_use = &_all_proposals;
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _memory_group.manage(&_all_proposals_quantized);
         // Requantize all_proposals to QASYMM16 with 0.125 scale and 0 offset
-        _all_proposals_quantized.allocator()->init(TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
+        _all_proposals_quantized.allocator()->init(
+            TensorInfo(_all_proposals.info()->tensor_shape(), 1, DataType::QASYMM16, QuantizationInfo(0.125f, 0)));
         _quantize_all_proposals.configure(&_all_proposals, &_all_proposals_quantized);
         _all_proposals.allocator()->allocate();
         _all_proposals_to_use = &_all_proposals_quantized;
@@ -174,7 +189,8 @@
 
     // Note that NMS needs outputs preinitialized.
     auto_init_if_empty(*scores_out->info(), TensorShape(scores_nms_size), 1, scores_data_type, scores_qinfo);
-    auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type, rois_qinfo);
+    auto_init_if_empty(*_proposals_4_roi_values.info(), TensorShape(values_per_roi, scores_nms_size), 1, rois_data_type,
+                       rois_qinfo);
     auto_init_if_empty(*num_valid_proposals->info(), TensorShape(1), 1, DataType::U32);
 
     // Initialize temporary (unused) outputs
@@ -187,17 +203,12 @@
 
     _memory_group.manage(&_proposals_4_roi_values);
 
-    const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f, true, min_size_scaled, info.im_width(), info.im_height());
-    _cpp_nms.configure(&_scores_flattened /*scores_in*/,
-                       _all_proposals_to_use /*boxes_in,*/,
-                       nullptr /* batch_splits_in*/,
-                       scores_out /* scores_out*/,
-                       &_proposals_4_roi_values /*boxes_out*/,
-                       &_classes_nms_unused /*classes*/,
-                       nullptr /*batch_splits_out*/,
-                       &_keeps_nms_unused /*keeps*/,
-                       num_valid_proposals /* keeps_size*/,
-                       box_nms_info);
+    const BoxNMSLimitInfo box_nms_info(0.0f, info.nms_thres(), scores_nms_size, false, NMSType::LINEAR, 0.5f, 0.001f,
+                                       true, min_size_scaled, info.im_width(), info.im_height());
+    _cpp_nms.configure(&_scores_flattened /*scores_in*/, _all_proposals_to_use /*boxes_in*/,
+                       nullptr /* batch_splits_in*/, scores_out /* scores_out*/, &_proposals_4_roi_values /*boxes_out*/,
+                       &_classes_nms_unused /*classes*/, nullptr /*batch_splits_out*/, &_keeps_nms_unused /*keeps*/,
+                       num_valid_proposals /* keeps_size*/, box_nms_info);
 
     _keeps_nms_unused.allocator()->allocate();
     _classes_nms_unused.allocator()->allocate();
@@ -205,12 +216,17 @@
     _scores_flattened.allocator()->allocate();
 
     // Add the first column that represents the batch id. This will be all zeros, as we don't support multiple images
-    _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{ { 1, 0 } });
+    _pad.configure(&_proposals_4_roi_values, proposals, PaddingList{{1, 0}});
     _proposals_4_roi_values.allocator()->allocate();
 }
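The PaddingList{{1, 0}} above pads one element before dimension 0 and none after,
so each four-value ROI row (x1, y1, x2, y2) becomes (0, x1, y1, x2, y2), with the
leading zero acting as the batch index. A minimal sketch of that per-row effect,
assuming values_per_roi == 4 (the helper is illustrative, not library code):

    #include <array>

    // Prepend the implicit batch id (always 0: multiple images are unsupported).
    std::array<float, 5> prepend_batch_id(const std::array<float, 4> &roi)
    {
        return {0.f /* batch id */, roi[0], roi[1], roi[2], roi[3]};
    }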
 
-Status NEGenerateProposalsLayer::validate(const ITensorInfo *scores, const ITensorInfo *deltas, const ITensorInfo *anchors, const ITensorInfo *proposals, const ITensorInfo *scores_out,
-                                          const ITensorInfo *num_valid_proposals, const GenerateProposalsInfo &info)
+Status NEGenerateProposalsLayer::validate(const ITensorInfo           *scores,
+                                          const ITensorInfo           *deltas,
+                                          const ITensorInfo           *anchors,
+                                          const ITensorInfo           *proposals,
+                                          const ITensorInfo           *scores_out,
+                                          const ITensorInfo           *num_valid_proposals,
+                                          const GenerateProposalsInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(scores, deltas, anchors, proposals, scores_out, num_valid_proposals);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(scores, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
@@ -218,9 +234,12 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(scores, deltas);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(scores, deltas);
 
-    const int num_anchors       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
-    const int feat_width        = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
-    const int feat_height       = scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
+    const int num_anchors =
+        scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::CHANNEL));
+    const int feat_width =
+        scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::WIDTH));
+    const int feat_height =
+        scores->dimension(get_data_layout_dimension_index(scores->data_layout(), DataLayoutDimension::HEIGHT));
     const int num_images        = scores->dimension(3);
     const int total_num_anchors = num_anchors * feat_width * feat_height;
     const int values_per_roi    = info.values_per_roi();
@@ -229,76 +248,100 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON(num_images > 1);
 
-    if(is_qasymm8)
+    if (is_qasymm8)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(anchors, 1, DataType::QSYMM16);
         const UniformQuantizationInfo anchors_qinfo = anchors->quantization_info().uniform();
         ARM_COMPUTE_RETURN_ERROR_ON(anchors_qinfo.scale != 0.125f);
     }
 
-    TensorInfo all_anchors_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
+    TensorInfo all_anchors_info(
+        anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEComputeAllAnchorsKernel::validate(
+        anchors, &all_anchors_info, ComputeAnchorsInfo(feat_width, feat_height, info.spatial_scale())));
 
-    TensorInfo deltas_permuted_info = deltas->clone()->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height)).set_is_resizable(true);
-    TensorInfo scores_permuted_info = scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
-    if(scores->data_layout() == DataLayout::NHWC)
+    TensorInfo deltas_permuted_info =
+        deltas->clone()
+            ->set_tensor_shape(TensorShape(values_per_roi * num_anchors, feat_width, feat_height))
+            .set_is_resizable(true);
+    TensorInfo scores_permuted_info =
+        scores->clone()->set_tensor_shape(TensorShape(num_anchors, feat_width, feat_height)).set_is_resizable(true);
+    if (scores->data_layout() == DataLayout::NHWC)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(deltas, &deltas_permuted_info);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(scores, &scores_permuted_info);
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{ 2, 0, 1 }));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{ 2, 0, 1 }));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(deltas, &deltas_permuted_info, PermutationVector{2, 0, 1}));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPermute::validate(scores, &scores_permuted_info, PermutationVector{2, 0, 1}));
     }
 
-    TensorInfo deltas_flattened_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    TensorInfo deltas_flattened_info(
+        deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
     ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&deltas_permuted_info, &deltas_flattened_info));
 
-    TensorInfo scores_flattened_info(scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
-    TensorInfo proposals_4_roi_values(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    TensorInfo scores_flattened_info(
+        scores->clone()->set_tensor_shape(TensorShape(1, total_num_anchors)).set_is_resizable(true));
+    TensorInfo proposals_4_roi_values(
+        deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
 
     ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(&scores_permuted_info, &scores_flattened_info));
 
     TensorInfo *proposals_4_roi_values_to_use = &proposals_4_roi_values;
-    TensorInfo  proposals_4_roi_values_quantized(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
-    proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16).set_quantization_info(QuantizationInfo(0.125f, 0));
-    if(is_qasymm8)
+    TensorInfo  proposals_4_roi_values_quantized(
+         deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true));
+    proposals_4_roi_values_quantized.set_data_type(DataType::QASYMM16)
+        .set_quantization_info(QuantizationInfo(0.125f, 0));
+    if (is_qasymm8)
     {
-        TensorInfo all_anchors_f32_info(anchors->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
+        TensorInfo all_anchors_f32_info(anchors->clone()
+                                            ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+                                            .set_is_resizable(true)
+                                            .set_data_type(DataType::F32));
         ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&all_anchors_info, &all_anchors_f32_info));
 
-        TensorInfo deltas_flattened_f32_info(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
+        TensorInfo deltas_flattened_f32_info(deltas->clone()
+                                                 ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+                                                 .set_is_resizable(true)
+                                                 .set_data_type(DataType::F32));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEDequantizationLayer::validate(&deltas_flattened_info, &deltas_flattened_f32_info));
 
-        TensorInfo proposals_4_roi_values_f32(deltas->clone()->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors)).set_is_resizable(true).set_data_type(DataType::F32));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
-                                                                     BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+        TensorInfo proposals_4_roi_values_f32(deltas->clone()
+                                                  ->set_tensor_shape(TensorShape(values_per_roi, total_num_anchors))
+                                                  .set_is_resizable(true)
+                                                  .set_data_type(DataType::F32));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(
+            &all_anchors_f32_info, &proposals_4_roi_values_f32, &deltas_flattened_f32_info,
+            BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEQuantizationLayer::validate(&proposals_4_roi_values_f32, &proposals_4_roi_values_quantized));
         proposals_4_roi_values_to_use = &proposals_4_roi_values_quantized;
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
-                                                                     BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEBoundingBoxTransform::validate(&all_anchors_info, &proposals_4_roi_values, &deltas_flattened_info,
+                                             BoundingBoxTransformInfo(info.im_width(), info.im_height(), 1.f)));
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{ { 1, 0 } }));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPadLayer::validate(proposals_4_roi_values_to_use, proposals, PaddingList{{1, 0}}));
 
-    if(num_valid_proposals->total_size() > 0)
+    if (num_valid_proposals->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(num_valid_proposals->dimension(0) > 1);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(num_valid_proposals, 1, DataType::U32);
     }
 
-    if(proposals->total_size() > 0)
+    if (proposals->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(0) != size_t(values_per_roi) + 1);
         ARM_COMPUTE_RETURN_ERROR_ON(proposals->dimension(1) != size_t(total_num_anchors));
-        if(is_qasymm8)
+        if (is_qasymm8)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(proposals, 1, DataType::QASYMM16);
             const UniformQuantizationInfo proposals_qinfo = proposals->quantization_info().uniform();
@@ -311,7 +354,7 @@
         }
     }
 
-    if(scores_out->total_size() > 0)
+    if (scores_out->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(scores_out->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(scores_out->dimension(0) != size_t(total_num_anchors));
@@ -330,7 +373,7 @@
     NEScheduler::get().schedule(_compute_anchors.get(), Window::DimY);
 
     // Transpose and reshape the inputs
-    if(!_is_nhwc)
+    if (!_is_nhwc)
     {
         _permute_deltas.run();
         _permute_scores.run();
@@ -339,7 +382,7 @@
     _flatten_deltas.run();
     _flatten_scores.run();
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _dequantize_anchors.run();
         _dequantize_deltas.run();
@@ -348,7 +391,7 @@
     // Build the boxes
     _bounding_box.run();
 
-    if(_is_qasymm8)
+    if (_is_qasymm8)
     {
         _quantize_all_proposals.run();
     }
diff --git a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
index 822dcf4..78218cb 100644
--- a/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEInstanceNormalizationLayer.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
 
@@ -34,7 +35,13 @@
 NEInstanceNormalizationLayer::~NEInstanceNormalizationLayer() = default;
 
 NEInstanceNormalizationLayer::NEInstanceNormalizationLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _normalization_kernel(), _is_nchw(false), _permute_input(), _permute_output(), _permuted_input(), _permuted_output()
+    : _memory_group(std::move(memory_manager)),
+      _normalization_kernel(),
+      _is_nchw(false),
+      _permute_input(),
+      _permute_output(),
+      _permuted_input(),
+      _permuted_output()
 {
 }
 
@@ -43,14 +50,14 @@
     ARM_COMPUTE_LOG_PARAMS(input, output, gamma, beta, epsilon);
 
     const DataLayout data_layout       = input->info()->data_layout();
-    const auto       kernel_descriptor = InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true };
+    const auto       kernel_descriptor = InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true};
 
     // Configure Kernels
     _is_nchw = data_layout == DataLayout::NCHW;
 
     _normalization_kernel = std::make_unique<NEInstanceNormalizationLayerKernel>();
 
-    if(!_is_nchw)
+    if (!_is_nchw)
     {
         _memory_group.manage(&_permuted_input);
         _memory_group.manage(&_permuted_output);
@@ -72,11 +79,12 @@
     }
 }
 
-Status NEInstanceNormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
+Status NEInstanceNormalizationLayer::validate(
+    const ITensorInfo *input, const ITensorInfo *output, float gamma, float beta, float epsilon)
 {
-    return NEInstanceNormalizationLayerKernel::validate(&input->clone()->set_data_layout(DataLayout::NCHW),
-                                                        &output->clone()->set_data_layout(DataLayout::NCHW),
-                                                        InstanceNormalizationLayerKernelInfo{ gamma, beta, epsilon, true });
+    return NEInstanceNormalizationLayerKernel::validate(
+        &input->clone()->set_data_layout(DataLayout::NCHW), &output->clone()->set_data_layout(DataLayout::NCHW),
+        InstanceNormalizationLayerKernelInfo{gamma, beta, epsilon, true});
 }
 
 void NEInstanceNormalizationLayer::run()
@@ -84,7 +92,7 @@
     MemoryGroupResourceScope scope_mg(_memory_group);
 
     // Permute input
-    if(!_is_nchw)
+    if (!_is_nchw)
     {
         _permute_input.run();
     }
@@ -92,7 +100,7 @@
     NEScheduler::get().schedule(_normalization_kernel.get(), Window::DimZ);
 
     // Permute output
-    if(!_is_nchw)
+    if (!_is_nchw)
     {
         _permute_output.run();
     }
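The run() hunk above follows the library's usual layout-fallback shape: the
normalization kernel operates on NCHW, so NHWC data is permuted in, processed,
and permuted back. A minimal control-flow sketch, with hypothetical stand-ins
for the member objects rather than the actual class:

    // Stand-ins for _permute_input, _normalization_kernel and _permute_output.
    static void permute_to_nchw() { /* NHWC -> NCHW staging copy */ }
    static void run_norm_kernel() { /* the kernel itself assumes NCHW */ }
    static void permute_from_nchw() { /* NCHW -> NHWC result */ }

    void run_instance_norm(bool is_nchw)
    {
        if (!is_nchw)
            permute_to_nchw();
        run_norm_kernel();
        if (!is_nchw)
            permute_from_nchw();
    }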
diff --git a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
index c3ecfb4..b7f6203 100644
--- a/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
+++ b/src/runtime/NEON/functions/NEL2NormalizeLayer.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
 #include "src/core/NEON/kernels/NEReductionOperationKernel.h"
@@ -69,7 +70,8 @@
     sum_sq.set_tensor_shape(shape);
 
     const uint32_t actual_axis = wrap_around(axis, max_input_tensor_dim);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEReductionOperation::validate(input, &sum_sq, actual_axis, ReductionOperation::SUM_SQUARE));
 
     // Reduce shape on axis
     shape.set(actual_axis, 1);
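The validation above reduces the input with SUM_SQUARE along actual_axis (where
wrap_around() maps negative axis values into range) before the final division.
A minimal 1-D sketch of the math being validated, assuming the textbook
definition; the layer's epsilon handling lies outside these hunks:

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // out[i] = x[i] / sqrt(sum_j x[j]^2)
    std::vector<float> l2_normalize(const std::vector<float> &x)
    {
        float sum_sq = 0.f;
        for (float v : x)
            sum_sq += v * v;
        const float inv_norm = 1.f / std::sqrt(sum_sq);
        std::vector<float> out(x.size());
        for (std::size_t i = 0; i < x.size(); ++i)
            out[i] = x[i] * inv_norm;
        return out;
    }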
diff --git a/src/runtime/NEON/functions/NELSTMLayer.cpp b/src/runtime/NEON/functions/NELSTMLayer.cpp
index 428cdf8..1a08cde 100644
--- a/src/runtime/NEON/functions/NELSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayer.cpp
@@ -24,11 +24,12 @@
 #include "arm_compute/runtime/NEON/functions/NELSTMLayer.h"
 
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/common/LSTMParams.h"
+
 #include "src/common/utils/Log.h"
 
 namespace arm_compute
@@ -39,42 +40,122 @@
 NELSTMLayer::~NELSTMLayer() = default;
 
 NELSTMLayer::NELSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _fully_connected_input_gate(), _accum_input_gate1(), _subtract_input_gate(), _pixelwise_mul_input_gate(), _activation_input_gate(),
-      _fully_connected_forget_gate(), _accum_forget_gate1(), _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _transpose_cell_state(),
-      _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(),
-      _pixelwise_mul_output_state1(), _accum_output1(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _projection_clip(),
-      _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
-      _mean_std_norm_input_gate(), _pixelwise_mul_input_gate_coeff(), _accum_input_gate_bias(), _mean_std_norm_forget_gate(), _pixelwise_mul_forget_gate_coeff(), _accum_forget_gate_bias(),
-      _mean_std_norm_cell_gate(), _pixelwise_mul_cell_gate_coeff(), _accum_cell_gate_bias(), _mean_std_norm_output_gate(), _pixelwise_mul_output_gate_coeff(), _accum_output_gate_bias(), _input_gate_out1(),
-      _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _forget_gate_out6(),
-      _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), _output_state1(), _ones(),
-      _input_layer_norm_out1(), _input_layer_norm_out2(), _forget_layer_norm_out1(), _forget_layer_norm_out2(), _cell_layer_norm_out1(), _cell_layer_norm_out2(), _output_layer_norm_out1(),
-      _output_layer_norm_out2(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false),
+    : _memory_group(std::move(memory_manager)),
+      _fully_connected_input_gate(),
+      _accum_input_gate1(),
+      _subtract_input_gate(),
+      _pixelwise_mul_input_gate(),
+      _activation_input_gate(),
+      _fully_connected_forget_gate(),
+      _accum_forget_gate1(),
+      _pixelwise_mul_forget_gate(),
+      _activation_forget_gate(),
+      _fully_connected_cell_state(),
+      _gemm_cell_state1(),
+      _transpose_cell_state(),
+      _accum_cell_state1(),
+      _accum_cell_state2(),
+      _pixelwise_mul_cell_state1(),
+      _activation_cell_state(),
+      _cell_clip(),
+      _pixelwise_mul_cell_state2(),
+      _fully_connected_output(),
+      _pixelwise_mul_output_state1(),
+      _accum_output1(),
+      _activation_output(),
+      _activation_output_state(),
+      _pixelwise_mul_output_state2(),
+      _fully_connected_output_state(),
+      _projection_clip(),
+      _copy_cell_state(),
+      _copy_output(),
+      _concat_scratch_buffer(),
+      _concat_inputs_forget_gate(),
+      _concat_weights_forget_gate(),
+      _concat_weights_input_gate(),
+      _concat_weights_output(),
+      _mean_std_norm_input_gate(),
+      _pixelwise_mul_input_gate_coeff(),
+      _accum_input_gate_bias(),
+      _mean_std_norm_forget_gate(),
+      _pixelwise_mul_forget_gate_coeff(),
+      _accum_forget_gate_bias(),
+      _mean_std_norm_cell_gate(),
+      _pixelwise_mul_cell_gate_coeff(),
+      _accum_cell_gate_bias(),
+      _mean_std_norm_output_gate(),
+      _pixelwise_mul_output_gate_coeff(),
+      _accum_output_gate_bias(),
+      _input_gate_out1(),
+      _input_gate_out2(),
+      _input_gate_out3(),
+      _input_gate_out4(),
+      _forget_gate_out1(),
+      _forget_gate_out2(),
+      _forget_gate_out3(),
+      _forget_gate_out4(),
+      _forget_gate_out5(),
+      _forget_gate_out6(),
+      _cell_state_out1(),
+      _cell_state_out2(),
+      _cell_state_out3(),
+      _cell_state_out4(),
+      _cell_state_out5(),
+      _output1(),
+      _output2(),
+      _output3(),
+      _output4(),
+      _cell_state_activation(),
+      _output_state1(),
+      _ones(),
+      _input_layer_norm_out1(),
+      _input_layer_norm_out2(),
+      _forget_layer_norm_out1(),
+      _forget_layer_norm_out2(),
+      _cell_layer_norm_out1(),
+      _cell_layer_norm_out2(),
+      _output_layer_norm_out1(),
+      _output_layer_norm_out2(),
+      _run_peephole_opt(false),
+      _run_cifg_opt(false),
+      _perform_cell_clipping(false),
+      _has_projection_weights(false),
+      _perform_projection_clipping(false),
+      _is_prepared(false),
       _is_layer_norm_lstm(false)
 {
 }
 
-void NELSTMLayer::configure(const ITensor *input,
-                            const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
-                            const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
-                            const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
-                            const ITensor *output_state_in, const ITensor *cell_state_in,
-                            ITensor *scratch_buffer, ITensor *output_state_out, ITensor *cell_state_out, ITensor *output,
-                            const LSTMParams<ITensor> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+void NELSTMLayer::configure(const ITensor             *input,
+                            const ITensor             *input_to_forget_weights,
+                            const ITensor             *input_to_cell_weights,
+                            const ITensor             *input_to_output_weights,
+                            const ITensor             *recurrent_to_forget_weights,
+                            const ITensor             *recurrent_to_cell_weights,
+                            const ITensor             *recurrent_to_output_weights,
+                            const ITensor             *forget_gate_bias,
+                            const ITensor             *cell_bias,
+                            const ITensor             *output_gate_bias,
+                            const ITensor             *output_state_in,
+                            const ITensor             *cell_state_in,
+                            ITensor                   *scratch_buffer,
+                            ITensor                   *output_state_out,
+                            ITensor                   *cell_state_out,
+                            ITensor                   *output,
+                            const LSTMParams<ITensor> &lstm_params,
+                            const ActivationLayerInfo &activation_info,
+                            float                      cell_threshold,
+                            float                      projection_threshold)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input,
-                                 input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
                                  recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                 forget_gate_bias, cell_bias, output_gate_bias,
-                                 output_state_in, cell_state_in,
+                                 forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
                                  scratch_buffer, output_state_out, cell_state_out, output);
-    ARM_COMPUTE_LOG_PARAMS(input,
-                           input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+    ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
                            recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                           forget_gate_bias, cell_bias, output_gate_bias,
-                           output_state_in, cell_state_in,
-                           scratch_buffer, output_state_out, cell_state_out, output,
-                           lstm_params, activation_info, cell_threshold, projection_threshold);
+                           forget_gate_bias, cell_bias, output_gate_bias, output_state_in, cell_state_in,
+                           scratch_buffer, output_state_out, cell_state_out, output, lstm_params, activation_info,
+                           cell_threshold, projection_threshold);
 
     _is_layer_norm_lstm = lstm_params.use_layer_norm();
 
@@ -83,13 +164,12 @@
     build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
 
     // Validate
-    ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(input->info(), input_to_forget_weights->info(),
-                                                     input_to_cell_weights->info(), input_to_output_weights->info(),
-                                                     recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
-                                                     forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
-                                                     output_state_in->info(), cell_state_in->info(),
-                                                     scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
-                                                     lstm_params_info, activation_info, cell_threshold, projection_threshold));
+    ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayer::validate(
+        input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
+        recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
+        forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), output_state_in->info(),
+        cell_state_in->info(), scratch_buffer->info(), output_state_out->info(), cell_state_out->info(), output->info(),
+        lstm_params_info, activation_info, cell_threshold, projection_threshold));
 
     const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape();
 
@@ -116,20 +196,23 @@
     _concat_weights_forget_gate.configure(weights_vector, &_forget_gate_out6, Window::DimX);
 
     _memory_group.manage(&_forget_gate_out5);
-    _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
+    _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6,
+                                           (_is_layer_norm_lstm) ? nullptr : forget_gate_bias, &_forget_gate_out5);
     _memory_group.manage(&_forget_gate_out1);
     _memory_group.manage(&_forget_gate_out3);
     _forget_gate_out6.allocator()->allocate();
 
     Tensor *forget_gate_out = &_forget_gate_out5;
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         _forget_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
         _run_peephole_opt = true;
         _memory_group.manage(&_forget_gate_out4);
-        _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-        _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3, ConvertPolicy::SATURATE);
+        _pixelwise_mul_forget_gate.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_forget_gate_out4, 1,
+                                             ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _accum_forget_gate1.configure(&_forget_gate_out5, &_forget_gate_out4, &_forget_gate_out3,
+                                      ConvertPolicy::SATURATE);
         _forget_gate_out4.allocator()->allocate();
         _forget_gate_out5.allocator()->allocate();
         forget_gate_out = &_forget_gate_out3;
@@ -138,21 +221,25 @@
     {
         _forget_gate_out3.allocator()->allocate();
     }
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _forget_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _forget_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _memory_group.manage(&_forget_layer_norm_out1);
         _memory_group.manage(&_forget_layer_norm_out2);
         _mean_std_norm_forget_gate.configure(forget_gate_out);
-        _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(), &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _pixelwise_mul_forget_gate_coeff.configure(forget_gate_out, lstm_params.forget_layer_norm_weights(),
+                                                   &_forget_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+                                                   RoundingPolicy::TO_ZERO);
         // forget_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
         forget_gate_out->allocator()->allocate();
-        _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_forget_gate_bias.configure(&_forget_layer_norm_out1, forget_gate_bias, &_forget_layer_norm_out2,
+                                          ConvertPolicy::SATURATE);
         _forget_layer_norm_out1.allocator()->allocate();
         forget_gate_out = &_forget_layer_norm_out2;
     }
-    _activation_forget_gate.configure(forget_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _activation_forget_gate.configure(forget_gate_out, nullptr,
+                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
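Summarizing the forget-gate block just configured, in the same notation the
in-code comments already use (Activation is LOGISTIC; in the layer-norm variant
the mean/std normalization and bias addition happen before the activation):

    // forget_gate = Activation((input,output_state) * (input_to_forget_weights,recurrent_to_forget_weights) + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias), with peephole
    // forget_gate = Activation((input,output_state) * (input_to_forget_weights,recurrent_to_forget_weights) + forget_gate_bias), without peephole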
 
     // Configure block that calculates the input gate
     // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
@@ -161,7 +248,7 @@
     // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG
     _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
     Tensor *input_gate_out = &_input_gate_out1;
-    if(lstm_params.has_cifg_opt())
+    if (lstm_params.has_cifg_opt())
     {
         _memory_group.manage(&_input_gate_out1);
         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
@@ -183,15 +270,19 @@
         _memory_group.manage(&_input_gate_out1);
         _memory_group.manage(&_input_gate_out4);
 
-        _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(), &_input_gate_out3);
+        _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2,
+                                              (_is_layer_norm_lstm) ? nullptr : lstm_params.input_gate_bias(),
+                                              &_input_gate_out3);
         _input_gate_out2.allocator()->allocate();
         input_gate_out = &_input_gate_out3;
 
-        if(_run_peephole_opt)
+        if (_run_peephole_opt)
         {
             _memory_group.manage(&_input_gate_out4);
-            _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-            _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE);
+            _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4,
+                                                1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+            _accum_input_gate1.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1,
+                                         ConvertPolicy::SATURATE);
             _input_gate_out3.allocator()->allocate();
             _input_gate_out4.allocator()->allocate();
             input_gate_out = &_input_gate_out1;
@@ -201,21 +292,25 @@
             _input_gate_out1.allocator()->allocate();
         }
 
-        if(_is_layer_norm_lstm)
+        if (_is_layer_norm_lstm)
         {
             _input_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
             _input_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
             _memory_group.manage(&_input_layer_norm_out1);
             _memory_group.manage(&_input_layer_norm_out2);
             _mean_std_norm_input_gate.configure(input_gate_out);
-            _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(), &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+            _pixelwise_mul_input_gate_coeff.configure(input_gate_out, lstm_params.input_layer_norm_weights(),
+                                                      &_input_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+                                                      RoundingPolicy::TO_ZERO);
             // input_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
             input_gate_out->allocator()->allocate();
-            _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(), &_input_layer_norm_out2, ConvertPolicy::SATURATE);
+            _accum_input_gate_bias.configure(&_input_layer_norm_out1, lstm_params.input_gate_bias(),
+                                             &_input_layer_norm_out2, ConvertPolicy::SATURATE);
             _input_layer_norm_out1.allocator()->allocate();
             input_gate_out = &_input_layer_norm_out2;
         }
-        _activation_input_gate.configure(input_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        _activation_input_gate.configure(input_gate_out, nullptr,
+                                         ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     }
 
     // Configure block that calculates the cell state
@@ -228,7 +323,8 @@
     _cell_state_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
 
     _memory_group.manage(&_cell_state_out1);
-    _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias, &_cell_state_out1);
+    _fully_connected_cell_state.configure(input, input_to_cell_weights, (_is_layer_norm_lstm) ? nullptr : cell_bias,
+                                          &_cell_state_out1);
     _memory_group.manage(&_cell_state_out2);
     _transpose_cell_state.configure(recurrent_to_cell_weights, &_cell_state_out2);
     _memory_group.manage(&_cell_state_out3);
@@ -237,33 +333,40 @@
     _memory_group.manage(&_cell_state_out4);
     _accum_cell_state1.configure(&_cell_state_out1, &_cell_state_out3, &_cell_state_out4, ConvertPolicy::SATURATE);
     Tensor *cell_state_out_ptr = &_cell_state_out4;
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _cell_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _cell_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _memory_group.manage(&_cell_layer_norm_out1);
         _memory_group.manage(&_cell_layer_norm_out2);
         _mean_std_norm_cell_gate.configure(cell_state_out_ptr);
-        _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(), &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _pixelwise_mul_cell_gate_coeff.configure(cell_state_out_ptr, lstm_params.cell_layer_norm_weights(),
+                                                 &_cell_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+                                                 RoundingPolicy::TO_ZERO);
         // cell_state_out_ptr is going to be reassigned, so allocate the tensor that it was assigned to before
         cell_state_out_ptr->allocator()->allocate();
-        _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_cell_gate_bias.configure(&_cell_layer_norm_out1, cell_bias, &_cell_layer_norm_out2,
+                                        ConvertPolicy::SATURATE);
         _cell_layer_norm_out1.allocator()->allocate();
         cell_state_out_ptr = &_cell_layer_norm_out2;
     }
     _activation_cell_state.configure(cell_state_out_ptr, nullptr, activation_info);
     _memory_group.manage(&_cell_state_out5);
-    _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_cell_state1.configure(cell_state_out_ptr, input_gate_out, &_cell_state_out5, 1,
+                                         ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     cell_state_out_ptr->allocator()->allocate();
-    _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_cell_state2.configure(forget_gate_out, cell_state_in, &_cell_state_out3, 1, ConvertPolicy::SATURATE,
+                                         RoundingPolicy::TO_ZERO);
     _accum_cell_state2.configure(&_cell_state_out5, &_cell_state_out3, &_cell_state_out1, ConvertPolicy::SATURATE);
     _cell_state_out3.allocator()->allocate();
     _cell_state_out5.allocator()->allocate();
     // Perform clipping
-    if(cell_threshold != 0.f)
+    if (cell_threshold != 0.f)
     {
         _perform_cell_clipping = true;
-        _cell_clip.configure(&_cell_state_out1, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold, -cell_threshold));
+        _cell_clip.configure(&_cell_state_out1, nullptr,
+                             ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                 cell_threshold, -cell_threshold));
     }
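In the same notation, the cell-state block above computes the following, where
Activation is the user-supplied activation_info and the clip is applied only
for a non-zero cell_threshold:

    // cell_state = Clip(PixelWiseMul(input_gate, Activation(input * input_to_cell_weights + output_state * recurrent_to_cell_weights + cell_bias)) + PixelWiseMul(forget_gate, cell_state_in), cell_threshold)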
 
     // Configure block that calculates the output
@@ -281,18 +384,20 @@
     _memory_group.manage(&_output1);
     _memory_group.manage(&_output4);
 
-    _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias, &_output4);
+    _fully_connected_output.configure(&_forget_gate_out2, &_output2, (_is_layer_norm_lstm) ? nullptr : output_gate_bias,
+                                      &_output4);
 
     _output2.allocator()->allocate();
     _forget_gate_out2.allocator()->allocate();
 
     Tensor *output_gate_out = &_output4;
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type()));
 
         _memory_group.manage(&_output3);
-        _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1,
+                                               ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
         _accum_output1.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE);
         _output4.allocator()->allocate();
         output_gate_out = &_output1;
@@ -304,21 +409,25 @@
     {
         _output1.allocator()->allocate();
     }
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _output_layer_norm_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _output_layer_norm_out2.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
         _memory_group.manage(&_output_layer_norm_out1);
         _memory_group.manage(&_output_layer_norm_out2);
         _mean_std_norm_output_gate.configure(output_gate_out);
-        _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(), &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _pixelwise_mul_output_gate_coeff.configure(output_gate_out, lstm_params.output_layer_norm_weights(),
+                                                   &_output_layer_norm_out1, 1, ConvertPolicy::SATURATE,
+                                                   RoundingPolicy::TO_ZERO);
         // output_gate_out is going to be reassigned, so allocate the tensor that it was assigned to before
         output_gate_out->allocator()->allocate();
-        _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2, ConvertPolicy::SATURATE);
+        _accum_output_gate_bias.configure(&_output_layer_norm_out1, output_gate_bias, &_output_layer_norm_out2,
+                                          ConvertPolicy::SATURATE);
         _output_layer_norm_out1.allocator()->allocate();
         output_gate_out = &_output_layer_norm_out2;
     }
-    _activation_output.configure(output_gate_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _activation_output.configure(output_gate_out, nullptr,
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
 
     // Configure block that calculates the output state
     /** lstm_res = PixelwiseMul(output, Activation(cell_state))
@@ -335,20 +444,24 @@
 
     _memory_group.manage(&_cell_state_activation);
     _activation_output_state.configure(&_cell_state_out1, &_cell_state_activation, activation_info);
-    _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_output_state2.configure(&_cell_state_activation, output_gate_out, output_state_out_tmp, 1,
+                                           ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
     _cell_state_activation.allocator()->allocate();
     output_gate_out->allocator()->allocate();
 
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
         _has_projection_weights = true;
-        _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out);
+        _fully_connected_output_state.configure(output_state_out_tmp, lstm_params.projection_weights(),
+                                                lstm_params.projection_bias(), output_state_out);
         _output_state1.allocator()->allocate();
         // Perform clipping
-        if(projection_threshold != 0.f)
+        if (projection_threshold != 0.f)
         {
             _perform_projection_clipping = true;
-            _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold));
+            _projection_clip.configure(output_state_out, nullptr,
+                                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                           -projection_threshold, projection_threshold));
         }
     }
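And for the projection branch just configured, with lstm_res being the
PixelwiseMul(output, Activation(cell_state)) result described in the comment
above (clipping again applied only for a non-zero projection_threshold):

    // output_state = Clip(lstm_res * projection_weights + projection_bias, projection_threshold), with projection
    // output_state = lstm_res, without projection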
 
@@ -358,7 +471,7 @@
 
     // Vector for holding the tensors to store in scratch buffer
     std::vector<const ITensor *> scratch_inputs;
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
         scratch_inputs.emplace_back(input_gate_out);
     }
@@ -372,29 +485,38 @@
     output_gate_out->allocator()->allocate();
 }
 
-Status NELSTMLayer::validate(const ITensorInfo *input,
-                             const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                             const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                             const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                             const ITensorInfo *output_state_in, const ITensorInfo *cell_state_in,
-                             const ITensorInfo *scratch_buffer, const ITensorInfo *output_state_out, const ITensorInfo *cell_state_out, const ITensorInfo *output,
-                             const LSTMParams<ITensorInfo> &lstm_params, const ActivationLayerInfo &activation_info, float cell_threshold, float projection_threshold)
+Status NELSTMLayer::validate(const ITensorInfo             *input,
+                             const ITensorInfo             *input_to_forget_weights,
+                             const ITensorInfo             *input_to_cell_weights,
+                             const ITensorInfo             *input_to_output_weights,
+                             const ITensorInfo             *recurrent_to_forget_weights,
+                             const ITensorInfo             *recurrent_to_cell_weights,
+                             const ITensorInfo             *recurrent_to_output_weights,
+                             const ITensorInfo             *forget_gate_bias,
+                             const ITensorInfo             *cell_bias,
+                             const ITensorInfo             *output_gate_bias,
+                             const ITensorInfo             *output_state_in,
+                             const ITensorInfo             *cell_state_in,
+                             const ITensorInfo             *scratch_buffer,
+                             const ITensorInfo             *output_state_out,
+                             const ITensorInfo             *cell_state_out,
+                             const ITensorInfo             *output,
+                             const LSTMParams<ITensorInfo> &lstm_params,
+                             const ActivationLayerInfo     &activation_info,
+                             float                          cell_threshold,
+                             float                          projection_threshold)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input,
-                                        input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                        recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                        forget_gate_bias, cell_bias, output_gate_bias,
-                                        output_state_in, cell_state_in,
-                                        scratch_buffer, output_state_out, cell_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+        input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+        recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+        output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
 
     // Check data types
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input,
-                                                       input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                                       recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                                       forget_gate_bias, cell_bias, output_gate_bias,
-                                                       output_state_in, cell_state_in,
-                                                       scratch_buffer, output_state_out, cell_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(
+        input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights,
+        recurrent_to_cell_weights, recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+        output_state_in, cell_state_in, scratch_buffer, output_state_out, cell_state_out, output);
 
     // Check dimensions
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 2);
@@ -413,16 +535,16 @@
     ARM_COMPUTE_RETURN_ERROR_ON(output_state_out->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(cell_state_out->num_dimensions() > 2);
     ARM_COMPUTE_RETURN_ERROR_ON(output->num_dimensions() > 2);
-    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0)
-                                && cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
+    ARM_COMPUTE_RETURN_ERROR_ON(cell_bias->dimension(0) * 4 != scratch_buffer->dimension(0) &&
+                                cell_bias->dimension(0) * 3 != scratch_buffer->dimension(0));
 
     const unsigned int num_batches = input->dimension(1);
     const unsigned int num_cells   = input_to_output_weights->dimension(1);
 
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
         // If CIFG is used, input layer normalization weights tensor is omitted
-        if(lstm_params.has_cifg_opt())
+        if (lstm_params.has_cifg_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_layer_norm_weights() != nullptr);
         }
@@ -434,8 +556,12 @@
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.input_layer_norm_weights());
         }
 
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(), lstm_params.cell_layer_norm_weights(), lstm_params.output_layer_norm_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.forget_layer_norm_weights(),
+                                            lstm_params.cell_layer_norm_weights(),
+                                            lstm_params.output_layer_norm_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, lstm_params.forget_layer_norm_weights(),
+                                                           lstm_params.cell_layer_norm_weights(),
+                                                           lstm_params.output_layer_norm_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_layer_norm_weights()->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_layer_norm_weights()->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_layer_norm_weights()->num_dimensions() > 1);
@@ -445,7 +571,7 @@
     }
 
     // Check peephole optimization
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_output_weights(), lstm_params.cell_to_forget_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() > 1);
@@ -465,33 +591,39 @@
     std::vector<const ITensorInfo *> inputs_vector;
     inputs_vector.emplace_back(input);
     inputs_vector.emplace_back(output_state_in);
-    const TensorShape concat_shape       = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
+    const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
     TensorInfo        forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
     ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(inputs_vector, &forget_gate_concat, Window::DimX));
 
     // Validate forget gate
-    ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+        input, input_to_forget_weights, (lstm_params.use_layer_norm()) ? nullptr : forget_gate_bias, &forget_gate));
 
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEArithmeticAddition::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&forget_gate));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEPixelWiseMultiplication::validate(&forget_gate, lstm_params.forget_layer_norm_weights(), &forget_gate, 1,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEArithmeticAddition::validate(&forget_gate, forget_gate_bias, &forget_gate, ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+        &forget_gate, &forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate input gate
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
-                                            lstm_params.recurrent_to_input_weights(),
-                                            lstm_params.input_gate_bias());
+                                            lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_to_input_weights()->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1);
@@ -499,88 +631,120 @@
         std::vector<const ITensorInfo *> lstm_weights;
         lstm_weights.emplace_back(lstm_params.input_to_input_weights());
         lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
-        TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
-        TensorInfo  lstm_gate_concat          = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
+        TensorShape lstm_weights_concat_shape =
+            arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
+        TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
         ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(lstm_weights, &lstm_gate_concat, Window::DimX));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+            input, lstm_params.input_to_input_weights(),
+            (lstm_params.use_layer_norm()) ? nullptr : lstm_params.input_gate_bias(), &input_gate));
 
-        if(lstm_params.has_peephole_opt())
+        if (lstm_params.has_peephole_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
             ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_input_weights()->num_dimensions() > 1);
-            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &input_gate, 1,
+                                                    ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                NEArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE));
         }
 
-        if(lstm_params.use_layer_norm())
+        if (lstm_params.use_layer_norm())
         {
             ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&input_gate));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(), &input_gate, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                NEPixelWiseMultiplication::validate(&input_gate, lstm_params.input_layer_norm_weights(), &input_gate, 1,
+                                                    ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_gate, lstm_params.input_gate_bias(),
+                                                                       &input_gate, ConvertPolicy::SATURATE));
         }
-        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+            &input_gate, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEArithmeticSubtraction::validate(&forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE));
     }
 
     // Validate cell state
-    ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
-    if(lstm_params.use_layer_norm())
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+        input, input_to_cell_weights, (lstm_params.use_layer_norm()) ? nullptr : cell_bias, &cell_state_tmp));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &cell_state_tmp, 1.f, 0.f, GEMMInfo()));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+    if (lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&cell_state_tmp));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp, 1, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_layer_norm_weights(), &cell_state_tmp,
+                                                1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEArithmeticAddition::validate(&cell_state_tmp, cell_bias, &cell_state_tmp, ConvertPolicy::SATURATE));
     }
     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
-    if(cell_threshold != 0.f)
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &input_gate, &cell_state_tmp, 1,
+                                                                    ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &forget_gate, &cell_state_tmp, 1,
+                                                                    ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEArithmeticAddition::validate(&cell_state_tmp, &cell_state_tmp, &cell_state_tmp, ConvertPolicy::SATURATE));
+    if (cell_threshold != 0.f)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, cell_threshold,
-                                                                                                              -cell_threshold)));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEActivationLayer::validate(&cell_state_tmp, nullptr,
+                                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                            cell_threshold, -cell_threshold)));
     }
 
     // Validate output gate tmp
     std::vector<const ITensorInfo *> in_out_weights;
     in_out_weights.emplace_back(input_to_output_weights);
     in_out_weights.emplace_back(recurrent_to_output_weights);
-    TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
-    TensorInfo  in_out_gate_concat          = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
+    TensorShape in_out_weights_concat_shape =
+        arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
+    TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
     ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(in_out_weights, &in_out_gate_concat, Window::DimX));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(
+        input, input_to_output_weights, (lstm_params.use_layer_norm()) ? nullptr : output_gate_bias, &output_gate_tmp));
 
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEPixelWiseMultiplication::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp,
+                                                1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp,
+                                                                   ConvertPolicy::SATURATE));
     }
-    if(lstm_params.use_layer_norm())
+    if (lstm_params.use_layer_norm())
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEMeanStdDevNormalizationLayer::validate(&output_gate_tmp));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEPixelWiseMultiplication::validate(&output_gate_tmp, lstm_params.output_layer_norm_weights(),
+                                                &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_gate_tmp, output_gate_bias, &output_gate_tmp,
+                                                                   ConvertPolicy::SATURATE));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+        &output_gate_tmp, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Validate output state
     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_state_tmp, &cell_state_tmp, activation_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    if(lstm_params.has_projection())
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+        &cell_state_tmp, &output_gate_tmp, &output_gate_tmp, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    if (lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(), lstm_params.projection_bias(), output_state_out));
-        if(projection_threshold != 0.f)
+        ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(&output_gate_tmp, lstm_params.projection_weights(),
+                                                                    lstm_params.projection_bias(), output_state_out));
+        if (projection_threshold != 0.f)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, output_state_out,
-                                                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold, projection_threshold)));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+                output_state_out, output_state_out,
+                ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -projection_threshold,
+                                    projection_threshold)));
         }
     }
 
@@ -590,7 +754,7 @@
 
     // Validate scratch concatenation
     std::vector<const ITensorInfo *> inputs_vector_info_raw;
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
         inputs_vector_info_raw.push_back(&input_gate);
     }
@@ -611,12 +775,12 @@
     _concat_inputs_forget_gate.run();
     _fully_connected_forget_gate.run();
 
-    if(_run_peephole_opt)
+    if (_run_peephole_opt)
     {
         _pixelwise_mul_forget_gate.run();
         _accum_forget_gate1.run();
     }
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _mean_std_norm_forget_gate.run();
         _pixelwise_mul_forget_gate_coeff.run();
@@ -624,15 +788,17 @@
     }
     _activation_forget_gate.run();
 
-    if(_run_cifg_opt)
+    if (_run_cifg_opt)
     {
-        if(_ones.info()->data_type() == DataType::F16)
+        if (_ones.info()->data_type() == DataType::F16)
         {
-            std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+            std::fill_n(reinterpret_cast<half *>(_ones.buffer()),
+                        _ones.info()->total_size() / _ones.info()->element_size(), 1);
         }
         else
         {
-            std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
+            std::fill_n(reinterpret_cast<float *>(_ones.buffer()),
+                        _ones.info()->total_size() / _ones.info()->element_size(), 1);
         }
         _subtract_input_gate.run();
     }
@@ -640,13 +806,13 @@
     {
         _fully_connected_input_gate.run();
 
-        if(_run_peephole_opt)
+        if (_run_peephole_opt)
         {
             _pixelwise_mul_input_gate.run();
             _accum_input_gate1.run();
         }
 
-        if(_is_layer_norm_lstm)
+        if (_is_layer_norm_lstm)
         {
             _mean_std_norm_input_gate.run();
             _pixelwise_mul_input_gate_coeff.run();
@@ -659,7 +825,7 @@
     _transpose_cell_state.run();
     _gemm_cell_state1.run();
     _accum_cell_state1.run();
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _mean_std_norm_cell_gate.run();
         _pixelwise_mul_cell_gate_coeff.run();
@@ -671,18 +837,18 @@
     _pixelwise_mul_cell_state2.run();
     _accum_cell_state2.run();
 
-    if(_perform_cell_clipping)
+    if (_perform_cell_clipping)
     {
         _cell_clip.run();
     }
 
     _fully_connected_output.run();
-    if(_run_peephole_opt)
+    if (_run_peephole_opt)
     {
         _pixelwise_mul_output_state1.run();
         _accum_output1.run();
     }
-    if(_is_layer_norm_lstm)
+    if (_is_layer_norm_lstm)
     {
         _mean_std_norm_output_gate.run();
         _pixelwise_mul_output_gate_coeff.run();
@@ -693,10 +859,10 @@
     _activation_output_state.run();
     _pixelwise_mul_output_state2.run();
 
-    if(_has_projection_weights)
+    if (_has_projection_weights)
     {
         _fully_connected_output_state.run();
-        if(_perform_projection_clipping)
+        if (_perform_projection_clipping)
         {
             _projection_clip.run();
         }
@@ -710,10 +876,10 @@
 
 void NELSTMLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         _concat_weights_forget_gate.run();
-        if(!_run_cifg_opt)
+        if (!_run_cifg_opt)
         {
             _concat_weights_input_gate.run();
         }
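
The hunks above apply one consistent style: a space after control-flow keywords (if (...), switch (...)), no padding inside braced initializer lists ({output_size, batch_size}), long argument and parameter lists wrapped near a 120-column limit with one parameter per line in broken declarations, constructor member initializers one per line, and consecutive assignments and declarations kept column-aligned. For readers reconstructing the style, here is a minimal .clang-format sketch that would produce output like this; the option names and values are inferred from the visible changes only and are an assumption, not the project's actual configuration:

    # Hypothetical .clang-format fragment, reconstructed from the formatting
    # visible in this diff; not the repository's real configuration file.
    Language:        Cpp
    ColumnLimit:     120                   # long validate()/configure() calls wrap near 120 columns
    SpaceBeforeParens: ControlStatements   # 'if (', 'switch (' but no space after function names
    Cpp11BracedListStyle: true             # '{output_size, batch_size}' with no inner padding
    PackConstructorInitializers: Never     # one member initializer per line
    BinPackParameters: false               # one parameter per line in wrapped declarations
    AlignConsecutiveAssignments: Consecutive
    AlignConsecutiveDeclarations: Consecutive
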
diff --git a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
index cfdeb00..41f9c3d 100644
--- a/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
+++ b/src/runtime/NEON/functions/NELSTMLayerQuantized.cpp
@@ -24,8 +24,9 @@
 #include "arm_compute/runtime/NEON/functions/NELSTMLayerQuantized.h"
 
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/helpers/AutoConfiguration.h"
 
@@ -46,36 +47,104 @@
 NELSTMLayerQuantized::~NELSTMLayerQuantized() = default;
 
 NELSTMLayerQuantized::NELSTMLayerQuantized(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemmlowp(), _output_stage(), _transpose_weights(), _concat_input_weights(), _concat_recurrent_weights(), _concat_weights(), _concat_inputs(),
-      _concat_bias(), _sigmoid_forget_gate(), _sigmoid_input_gate(), _sigmoid_output_gate(), _tanh_modulation_gate(), _tanh_output_state(), _add1(), _add2(), _mul1(), _mul2(), _mul3(),
-      _slice_input_tensor(), _slice_forget_tensor(), _slice_cell_tensor(), _slice_output_tensor(), _dequantize(), _quantize(), _input_to_input_weights(nullptr), _input_to_forget_weights(nullptr),
-      _input_to_cell_weights(nullptr), _input_to_output_weights(nullptr), _recurrent_to_input_weights(nullptr), _recurrent_to_forget_weights(nullptr), _recurrent_to_cell_weights(nullptr),
-      _recurrent_to_output_weights(nullptr), _input_gate_bias(nullptr), _forget_gate_bias(nullptr), _cell_bias(nullptr), _output_gate_bias(nullptr), _recurrent_weights(), _input_weights(), _weights(),
-      _input(), _weights_transposed(), _output_highp(), _output_lowp(), _bias(), _forget_gate_input(), _input_gate_input(), _output_gate_input(), _input_modulation_gate_input(), _forget_gate_output(),
-      _input_gate_output(), _output_gate_output(), _input_modulation_gate_output(), _cell_state1(), _cell_state2(), _output_state_tmp(), _output_state_out_symm(), _output_state_out_f32(),
+    : _memory_group(std::move(memory_manager)),
+      _gemmlowp(),
+      _output_stage(),
+      _transpose_weights(),
+      _concat_input_weights(),
+      _concat_recurrent_weights(),
+      _concat_weights(),
+      _concat_inputs(),
+      _concat_bias(),
+      _sigmoid_forget_gate(),
+      _sigmoid_input_gate(),
+      _sigmoid_output_gate(),
+      _tanh_modulation_gate(),
+      _tanh_output_state(),
+      _add1(),
+      _add2(),
+      _mul1(),
+      _mul2(),
+      _mul3(),
+      _slice_input_tensor(),
+      _slice_forget_tensor(),
+      _slice_cell_tensor(),
+      _slice_output_tensor(),
+      _dequantize(),
+      _quantize(),
+      _input_to_input_weights(nullptr),
+      _input_to_forget_weights(nullptr),
+      _input_to_cell_weights(nullptr),
+      _input_to_output_weights(nullptr),
+      _recurrent_to_input_weights(nullptr),
+      _recurrent_to_forget_weights(nullptr),
+      _recurrent_to_cell_weights(nullptr),
+      _recurrent_to_output_weights(nullptr),
+      _input_gate_bias(nullptr),
+      _forget_gate_bias(nullptr),
+      _cell_bias(nullptr),
+      _output_gate_bias(nullptr),
+      _recurrent_weights(),
+      _input_weights(),
+      _weights(),
+      _input(),
+      _weights_transposed(),
+      _output_highp(),
+      _output_lowp(),
+      _bias(),
+      _forget_gate_input(),
+      _input_gate_input(),
+      _output_gate_input(),
+      _input_modulation_gate_input(),
+      _forget_gate_output(),
+      _input_gate_output(),
+      _output_gate_output(),
+      _input_modulation_gate_output(),
+      _cell_state1(),
+      _cell_state2(),
+      _output_state_tmp(),
+      _output_state_out_symm(),
+      _output_state_out_f32(),
       _is_prepared(false)
 {
 }
 
 void NELSTMLayerQuantized::configure(const ITensor *input,
-                                     const ITensor *input_to_input_weights, const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
-                                     const ITensor *recurrent_to_input_weights, const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
-                                     const ITensor *input_gate_bias, const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
-                                     ITensor *cell_state_in, const ITensor *output_state_in,
-                                     ITensor *cell_state_out, ITensor *output_state_out)
+                                     const ITensor *input_to_input_weights,
+                                     const ITensor *input_to_forget_weights,
+                                     const ITensor *input_to_cell_weights,
+                                     const ITensor *input_to_output_weights,
+                                     const ITensor *recurrent_to_input_weights,
+                                     const ITensor *recurrent_to_forget_weights,
+                                     const ITensor *recurrent_to_cell_weights,
+                                     const ITensor *recurrent_to_output_weights,
+                                     const ITensor *input_gate_bias,
+                                     const ITensor *forget_gate_bias,
+                                     const ITensor *cell_bias,
+                                     const ITensor *output_gate_bias,
+                                     ITensor       *cell_state_in,
+                                     const ITensor *output_state_in,
+                                     ITensor       *cell_state_out,
+                                     ITensor       *output_state_out)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                 recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                 input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+                                 input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+                                 recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias,
+                                 forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+                                 cell_state_out, output_state_out);
 
-    ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
-                                                              input_to_output_weights->info(),
-                                                              recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
-                                                              input_gate_bias->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info()));
+    ARM_COMPUTE_ERROR_THROW_ON(NELSTMLayerQuantized::validate(
+        input->info(), input_to_input_weights->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+        input_to_output_weights->info(), recurrent_to_input_weights->info(), recurrent_to_forget_weights->info(),
+        recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(), input_gate_bias->info(),
+        forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(), cell_state_in->info(),
+        output_state_in->info(), cell_state_out->info(), output_state_out->info()));
 
-    ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                           recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                           input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+    ARM_COMPUTE_LOG_PARAMS(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights,
+                           input_to_output_weights, recurrent_to_input_weights, recurrent_to_forget_weights,
+                           recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias,
+                           cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+                           output_state_out);
 
     const int input_size  = input->info()->dimension(0);
     const int batch_size  = input->info()->dimension(1);
@@ -83,8 +152,10 @@
 
     const QuantizationInfo qweights = input_to_input_weights->info()->quantization_info(); // Weights quantization
 
-    auto_init_if_empty(*cell_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
-    auto_init_if_empty(*output_state_out->info(), TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
+    auto_init_if_empty(*cell_state_out->info(),
+                       TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QSYMM16, qsymm_4));
+    auto_init_if_empty(*output_state_out->info(),
+                       TensorInfo(TensorShape(batch_size, output_size), 1, DataType::QASYMM8, qasymm));
 
     _input_to_input_weights      = input_to_input_weights;
     _input_to_forget_weights     = input_to_forget_weights;
@@ -100,34 +171,41 @@
     _output_gate_bias            = output_gate_bias;
 
     // Weights concatenation
-    std::vector<const ITensor *> inputs_weights_vector{ input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights };
-    std::vector<const ITensor *> recurrent_weights_vector{ recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights };
+    std::vector<const ITensor *> inputs_weights_vector{input_to_input_weights, input_to_forget_weights,
+                                                       input_to_cell_weights, input_to_output_weights};
+    std::vector<const ITensor *> recurrent_weights_vector{recurrent_to_input_weights, recurrent_to_forget_weights,
+                                                          recurrent_to_cell_weights, recurrent_to_output_weights};
 
-    _input_weights.allocator()->init(TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _input_weights.allocator()->init(
+        TensorInfo(TensorShape(input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
     _concat_input_weights.configure(inputs_weights_vector, &_input_weights, Window::DimY);
 
-    _recurrent_weights.allocator()->init(TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    _recurrent_weights.allocator()->init(
+        TensorInfo(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
     _concat_recurrent_weights.configure(recurrent_weights_vector, &_recurrent_weights, Window::DimY);
 
-    std::vector<const ITensor *> weights_vector{ &_recurrent_weights, &_input_weights };
-    _weights.allocator()->init(TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
+    std::vector<const ITensor *> weights_vector{&_recurrent_weights, &_input_weights};
+    _weights.allocator()->init(
+        TensorInfo(TensorShape(output_size + input_size, 4 * output_size), 1, DataType::QASYMM8, qweights));
     _concat_weights.configure(weights_vector, &_weights, Window::DimX);
     _transpose_weights.configure(&_weights, &_weights_transposed);
 
     // Input concatenation
-    std::vector<const ITensor *> input_vector{ input, output_state_in };
+    std::vector<const ITensor *> input_vector{input, output_state_in};
     _memory_group.manage(&_input);
-    _input.allocator()->init(TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
+    _input.allocator()->init(
+        TensorInfo(TensorShape(output_size + input_size, batch_size), 1, DataType::QASYMM8, qasymm));
     _concat_inputs.configure(input_vector, &_input, Window::DimX);
 
     // Bias concatenation
-    std::vector<const ITensor *> bias_vector{ input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias };
+    std::vector<const ITensor *> bias_vector{input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias};
     _bias.allocator()->init(TensorInfo(TensorShape(4 * output_size), 1, DataType::S32));
     _concat_bias.configure(bias_vector, &_bias, Window::DimX);
 
     // Invert the offset for gemmlowp
     _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, -qasymm.uniform().offset));
-    _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
+    _weights_transposed.info()->set_quantization_info(
+        QuantizationInfo(qweights.uniform().scale, -qweights.uniform().offset));
 
     // Run gemmlowp
     _memory_group.manage(&_output_highp);
@@ -137,7 +215,8 @@
 
     // Set the offset back
     _input.info()->set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
-    _weights_transposed.info()->set_quantization_info(QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
+    _weights_transposed.info()->set_quantization_info(
+        QuantizationInfo(qweights.uniform().scale, qweights.uniform().offset));
 
     // multiplier = (input_scale * weights_scale) / output_scale (2 ^ (-12))
     _output_lowp.allocator()->init(TensorInfo(_output_highp.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_3));
@@ -159,64 +238,80 @@
     _bias.allocator()->allocate();
 
     // Get the gate tensors
-    if(batch_size > 1)
+    if (batch_size > 1)
     {
         _memory_group.manage(&_input_gate_input);
-        _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0, 0 }, { output_size, batch_size });
+        _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0, 0}, {output_size, batch_size});
         _memory_group.manage(&_forget_gate_input);
-        _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size });
+        _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size, 0},
+                                       {2 * output_size, batch_size});
         _memory_group.manage(&_input_modulation_gate_input);
-        _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size });
+        _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size, 0},
+                                     {3 * output_size, batch_size});
         _memory_group.manage(&_output_gate_input);
-        _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size });
+        _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size, 0},
+                                       {4 * output_size, batch_size});
         _output_lowp.allocator()->allocate();
     }
     else
     {
         _memory_group.manage(&_input_gate_input);
-        _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, { 0 }, { output_size });
+        _slice_input_tensor.configure(&_output_lowp, &_input_gate_input, {0}, {output_size});
         _memory_group.manage(&_forget_gate_input);
-        _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, { output_size }, { 2 * output_size });
+        _slice_forget_tensor.configure(&_output_lowp, &_forget_gate_input, {output_size}, {2 * output_size});
         _memory_group.manage(&_input_modulation_gate_input);
-        _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, { 2 * output_size }, { 3 * output_size });
+        _slice_cell_tensor.configure(&_output_lowp, &_input_modulation_gate_input, {2 * output_size},
+                                     {3 * output_size});
         _memory_group.manage(&_output_gate_input);
-        _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, { 3 * output_size }, { 4 * output_size });
+        _slice_output_tensor.configure(&_output_lowp, &_output_gate_input, {3 * output_size}, {4 * output_size});
         _output_lowp.allocator()->allocate();
     }
 
     // Forget gate
     _memory_group.manage(&_forget_gate_output);
-    _forget_gate_output.allocator()->init(TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _forget_gate_output.allocator()->init(
+        TensorInfo(_forget_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_forget_gate.configure(&_forget_gate_input, &_forget_gate_output,
+                                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     _forget_gate_input.allocator()->allocate();
 
     // Input gate
     _memory_group.manage(&_input_gate_output);
-    _input_gate_output.allocator()->init(TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _input_gate_output.allocator()->init(
+        TensorInfo(_input_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_input_gate.configure(&_input_gate_input, &_input_gate_output,
+                                  ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     _input_gate_input.allocator()->allocate();
 
     // Input modulation gate equation
     _memory_group.manage(&_input_modulation_gate_output);
-    _input_modulation_gate_output.allocator()->init(TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+    _input_modulation_gate_output.allocator()->init(
+        TensorInfo(_input_modulation_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _tanh_modulation_gate.configure(&_input_modulation_gate_input, &_input_modulation_gate_output,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
     _input_modulation_gate_input.allocator()->allocate();
 
     // Output gate
     _memory_group.manage(&_output_gate_output);
-    _output_gate_output.allocator()->init(TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _output_gate_output.allocator()->init(
+        TensorInfo(_output_gate_input.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _sigmoid_output_gate.configure(&_output_gate_input, &_output_gate_output,
+                                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     _output_gate_input.allocator()->allocate();
 
     // Long term memory
     _memory_group.manage(&_cell_state1);
-    _cell_state1.allocator()->init(TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
-    _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _cell_state1.allocator()->init(
+        TensorInfo(_forget_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+    _mul1.configure(&_forget_gate_output, cell_state_in, &_cell_state1, 1, ConvertPolicy::SATURATE,
+                    RoundingPolicy::TO_ZERO);
     _forget_gate_output.allocator()->allocate();
 
     _memory_group.manage(&_cell_state2);
-    _cell_state2.allocator()->init(TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
-    _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _cell_state2.allocator()->init(
+        TensorInfo(_input_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_4));
+    _mul2.configure(&_input_gate_output, &_input_modulation_gate_output, &_cell_state2, 1, ConvertPolicy::SATURATE,
+                    RoundingPolicy::TO_ZERO);
     _input_modulation_gate_output.allocator()->allocate();
     _input_gate_output.allocator()->allocate();
 
@@ -226,18 +321,23 @@
 
     // Short term memory
     _memory_group.manage(&_output_state_tmp);
-    _output_state_tmp.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _tanh_output_state.configure(cell_state_out, &_output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
+    _output_state_tmp.allocator()->init(
+        TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _tanh_output_state.configure(cell_state_out, &_output_state_tmp,
+                                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f));
 
     _memory_group.manage(&_output_state_out_symm);
-    _output_state_out_symm.allocator()->init(TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
-    _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _output_state_out_symm.allocator()->init(
+        TensorInfo(_output_gate_output.info()->tensor_shape(), 1, DataType::QSYMM16, qsymm_0));
+    _mul3.configure(&_output_state_tmp, &_output_gate_output, &_output_state_out_symm, 1, ConvertPolicy::SATURATE,
+                    RoundingPolicy::TO_ZERO);
     _output_gate_output.allocator()->allocate();
     _output_state_tmp.allocator()->allocate();
 
     // Requantize the output state from QSYMM16 to QASYMM8
     _memory_group.manage(&_output_state_out_f32);
-    _output_state_out_f32.allocator()->init(TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
+    _output_state_out_f32.allocator()->init(
+        TensorInfo(_output_state_out_symm.info()->tensor_shape(), 1, DataType::F32));
     _dequantize.configure(&_output_state_out_symm, &_output_state_out_f32);
     _output_state_out_symm.allocator()->allocate();
 
@@ -246,15 +346,28 @@
 }
 
 Status NELSTMLayerQuantized::validate(const ITensorInfo *input,
-                                      const ITensorInfo *input_to_input_weights, const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                                      const ITensorInfo *recurrent_to_input_weights, const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                                      const ITensorInfo *input_gate_bias, const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                                      const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
-                                      const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out)
+                                      const ITensorInfo *input_to_input_weights,
+                                      const ITensorInfo *input_to_forget_weights,
+                                      const ITensorInfo *input_to_cell_weights,
+                                      const ITensorInfo *input_to_output_weights,
+                                      const ITensorInfo *recurrent_to_input_weights,
+                                      const ITensorInfo *recurrent_to_forget_weights,
+                                      const ITensorInfo *recurrent_to_cell_weights,
+                                      const ITensorInfo *recurrent_to_output_weights,
+                                      const ITensorInfo *input_gate_bias,
+                                      const ITensorInfo *forget_gate_bias,
+                                      const ITensorInfo *cell_bias,
+                                      const ITensorInfo *output_gate_bias,
+                                      const ITensorInfo *cell_state_in,
+                                      const ITensorInfo *output_state_in,
+                                      const ITensorInfo *cell_state_out,
+                                      const ITensorInfo *output_state_out)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_input_weights,
-                                        recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in,
-                                        output_state_in, cell_state_out, output_state_out);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(
+        input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+        recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
+        input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out,
+        output_state_out);
 
     const int input_size  = input->dimension(0);
     const int batch_size  = input->dimension(1);
@@ -266,29 +379,51 @@
     ARM_COMPUTE_RETURN_ERROR_ON(input_gate_bias->num_dimensions() > 1);
     ARM_COMPUTE_RETURN_ERROR_ON(output_state_in->num_dimensions() > 2);
 
-    TensorInfo input_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(input_size, output_size)).set_data_type(DataType::QASYMM8));
-    TensorInfo recurrent_weights_info(input_to_input_weights->clone()->set_tensor_shape(TensorShape(output_size, output_size)).set_data_type(DataType::QASYMM8));
-    TensorInfo bias_info(input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
-    TensorInfo output_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QASYMM8).set_quantization_info(qasymm));
-    TensorInfo cell_state_info(cell_state_in->clone()->set_tensor_shape(TensorShape(output_size, batch_size)).set_data_type(DataType::QSYMM16).set_quantization_info(qsymm_4));
+    TensorInfo input_weights_info(input_to_input_weights->clone()
+                                      ->set_tensor_shape(TensorShape(input_size, output_size))
+                                      .set_data_type(DataType::QASYMM8));
+    TensorInfo recurrent_weights_info(input_to_input_weights->clone()
+                                          ->set_tensor_shape(TensorShape(output_size, output_size))
+                                          .set_data_type(DataType::QASYMM8));
+    TensorInfo bias_info(
+        input_gate_bias->clone()->set_tensor_shape(TensorShape(output_size)).set_data_type(DataType::S32));
+    TensorInfo output_state_info(cell_state_in->clone()
+                                     ->set_tensor_shape(TensorShape(output_size, batch_size))
+                                     .set_data_type(DataType::QASYMM8)
+                                     .set_quantization_info(qasymm));
+    TensorInfo cell_state_info(cell_state_in->clone()
+                                   ->set_tensor_shape(TensorShape(output_size, batch_size))
+                                   .set_data_type(DataType::QSYMM16)
+                                   .set_quantization_info(qsymm_4));
 
     // Shape checks
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&input_weights_info, input_to_input_weights, input_to_forget_weights,
+                                                   input_to_cell_weights, input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&recurrent_weights_info, recurrent_to_input_weights,
+                                                   recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                                   recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+                                                   output_gate_bias);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_in);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_in);
 
     // Data type checks
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias, output_gate_bias);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&input_weights_info, input, input_to_input_weights,
+                                                       input_to_forget_weights, input_to_cell_weights,
+                                                       input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_input_weights, recurrent_to_forget_weights,
+                                                       recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&bias_info, input_gate_bias, forget_gate_bias, cell_bias,
+                                                       output_gate_bias);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_in);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_in);
 
     // Quantization checks
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights, input_to_forget_weights, input_to_cell_weights, input_to_output_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&input_weights_info, input_to_input_weights,
+                                                              input_to_forget_weights, input_to_cell_weights,
+                                                              input_to_output_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(recurrent_to_input_weights, recurrent_to_forget_weights,
+                                                              recurrent_to_cell_weights, recurrent_to_output_weights);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_in);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&output_state_info, output_state_in);
 
@@ -310,7 +445,8 @@
     recurrent_weights_vector.emplace_back(recurrent_to_cell_weights);
     recurrent_weights_vector.emplace_back(recurrent_to_output_weights);
     const TensorInfo recurrent_weights(TensorShape(output_size, 4 * output_size), 1, DataType::QASYMM8, qweights);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEConcatenateLayer::validate(recurrent_weights_vector, &recurrent_weights, Window::DimY));
 
     // _concat_weights
     std::vector<const ITensorInfo *> weights_vector;
@@ -320,7 +456,7 @@
     ARM_COMPUTE_RETURN_ON_ERROR(NEConcatenateLayer::validate(weights_vector, &weights, Window::DimX));
     // _transpose_weights
     const TensorShape weights_transposed_shape(weights.tensor_shape()[1], weights.tensor_shape()[0]);
-    TensorInfo        weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
+    TensorInfo weights_transposed = weights.clone()->set_is_resizable(true).set_tensor_shape(weights_transposed_shape);
     ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(&weights, &weights_transposed));
 
     // _concat_inputs
@@ -346,7 +482,8 @@
 
     // _gemmlowp
     const TensorInfo output_highp(TensorShape(4 * output_size, batch_size), 1, DataType::S32);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEGEMMLowpMatrixMultiplyCore::validate(&input_concatenated, &weights_transposed, nullptr, &output_highp));
 
     // Set the offset back
     input_concatenated.set_quantization_info(QuantizationInfo(qasymm.uniform().scale, qasymm.uniform().offset));
@@ -357,7 +494,8 @@
     const float multiplier        = 4096.f * qasymm.uniform().scale * qweights.uniform().scale;
     int32_t     output_multiplier = 0;
     int32_t     output_shift      = 0;
-    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
 
     // _output_stage
     GEMMLowpOutputStageInfo info;
@@ -372,68 +510,91 @@
     TensorInfo input_modulation_gate_input;
     TensorInfo output_gate_input;
 
-    if(batch_size > 1)
+    if (batch_size > 1)
     {
         // _slice_input_tensor
         input_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0, 0 }, { output_size, batch_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NESlice::validate(&output_lowp, &input_gate_input, {0, 0}, {output_size, batch_size}));
         // _slice_forget_tensor
         forget_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size, 0 }, { 2 * output_size, batch_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NESlice::validate(&output_lowp, &forget_gate_input, {output_size, 0}, {2 * output_size, batch_size}));
         // _slice_cell_tensor
         input_modulation_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size, 0 }, { 3 * output_size, batch_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size, 0},
+                                                      {3 * output_size, batch_size}));
         // _slice_output_tensor
         output_gate_input = TensorInfo(TensorShape(output_size, batch_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size, 0 }, { 4 * output_size, batch_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size, 0}, {4 * output_size, batch_size}));
     }
     else
     {
         // _slice_input_tensor
         input_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, { 0 }, { output_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_gate_input, {0}, {output_size}));
         // _slice_forget_tensor
         forget_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &forget_gate_input, { output_size }, { 2 * output_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NESlice::validate(&output_lowp, &forget_gate_input, {output_size}, {2 * output_size}));
         // _slice_cell_tensor
         input_modulation_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &input_modulation_gate_input, { 2 * output_size }, { 3 * output_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NESlice::validate(&output_lowp, &input_modulation_gate_input, {2 * output_size}, {3 * output_size}));
         // _slice_output_tensor
         output_gate_input = TensorInfo(TensorShape(output_size), 1, DataType::QSYMM16, qsymm_3);
-        ARM_COMPUTE_RETURN_ON_ERROR(NESlice::validate(&output_lowp, &output_gate_input, { 3 * output_size }, { 4 * output_size }));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NESlice::validate(&output_lowp, &output_gate_input, {3 * output_size}, {4 * output_size}));
     }
 
     // _sigmoid_forget_gate
     const TensorInfo forget_gate_output(forget_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_gate_input, &forget_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEActivationLayer::validate(&forget_gate_input, &forget_gate_output,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     // _sigmoid_input_gate
     const TensorInfo input_gate_output(input_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+        &input_gate_input, &input_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
     // _tanh_modulation_gate
-    const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+    const TensorInfo input_modulation_gate_output(input_modulation_gate_input.tensor_shape(), 1, DataType::QSYMM16,
+                                                  qsymm_0);
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEActivationLayer::validate(&input_modulation_gate_input, &input_modulation_gate_output,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
     // _sigmoid_output_gate
     const TensorInfo output_gate_output(output_gate_input.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_gate_input, &output_gate_output, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEActivationLayer::validate(&output_gate_input, &output_gate_output,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // _mul_forget_gate_cell_state
     const TensorInfo cell_state_tmp1(forget_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+        &forget_gate_output, cell_state_in, &cell_state_tmp1, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
 
     // _mul_input_gate_input_mod_gate
     const TensorInfo cell_state_tmp2(input_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_4);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output, &cell_state_tmp2, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_output, &input_modulation_gate_output,
+                                                                    &cell_state_tmp2, 1, ConvertPolicy::SATURATE,
+                                                                    RoundingPolicy::TO_ZERO));
 
     // _add_cell_state_tmps
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEArithmeticAddition::validate(&cell_state_tmp1, &cell_state_tmp2, cell_state_out, ConvertPolicy::SATURATE));
 
     // _tanh_output_state
     const TensorInfo output_state_tmp(cell_state_out->tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &output_state_tmp, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEActivationLayer::validate(cell_state_out, &output_state_tmp,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.0f, 1.0f)));
 
     // _mul_output_state_tmp_output_gate
     const TensorInfo output_state_out_symm(output_gate_output.tensor_shape(), 1, DataType::QSYMM16, qsymm_0);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output, &output_state_out_symm, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_state_tmp, &output_gate_output,
+                                                                    &output_state_out_symm, 1, ConvertPolicy::SATURATE,
+                                                                    RoundingPolicy::TO_ZERO));
 
     // _dequantize
     const TensorInfo output_state_out_f32(output_state_out_symm.tensor_shape(), 1, DataType::F32);
@@ -442,14 +603,14 @@
     // _quantize
     ARM_COMPUTE_RETURN_ON_ERROR(NEQuantizationLayer::validate(&output_state_out_f32, output_state_out));
 
-    if(cell_state_out->total_size() != 0)
+    if (cell_state_out->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&cell_state_info, cell_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&cell_state_info, cell_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(&cell_state_info, cell_state_out);
     }
 
-    if(output_state_out->total_size() != 0)
+    if (output_state_out->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&output_state_info, output_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&output_state_info, output_state_out);
@@ -508,7 +669,7 @@
 
 void NELSTMLayerQuantized::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         _input_weights.allocator()->allocate();
         _concat_input_weights.run();
diff --git a/src/runtime/NEON/functions/NELogical.cpp b/src/runtime/NEON/functions/NELogical.cpp
index 92dcf15..0013a52 100644
--- a/src/runtime/NEON/functions/NELogical.cpp
+++ b/src/runtime/NEON/functions/NELogical.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NELogicalKernel.h"
 
@@ -32,15 +33,14 @@
 {
 struct LogicalArgs
 {
-    std::unique_ptr<kernels::NELogicalKernel> kernel{ nullptr };
+    std::unique_ptr<kernels::NELogicalKernel> kernel{nullptr};
     ITensorPack                               pack{};
 };
 
 struct NELogicalAnd::Impl : public LogicalArgs
 {
 };
-NELogicalAnd::NELogicalAnd()
-    : _impl(std::make_unique<Impl>())
+NELogicalAnd::NELogicalAnd() : _impl(std::make_unique<Impl>())
 {
 }
 NELogicalAnd::~NELogicalAnd() = default;
@@ -72,8 +72,7 @@
 struct NELogicalOr::Impl : public LogicalArgs
 {
 };
-NELogicalOr::NELogicalOr()
-    : _impl(std::make_unique<Impl>())
+NELogicalOr::NELogicalOr() : _impl(std::make_unique<Impl>())
 {
 }
 NELogicalOr::~NELogicalOr() = default;
@@ -105,8 +104,7 @@
 struct NELogicalNot::Impl : public LogicalArgs
 {
 };
-NELogicalNot::NELogicalNot()
-    : _impl(std::make_unique<Impl>())
+NELogicalNot::NELogicalNot() : _impl(std::make_unique<Impl>())
 {
 }
 NELogicalNot::~NELogicalNot() = default;
diff --git a/src/runtime/NEON/functions/NEMatMul.cpp b/src/runtime/NEON/functions/NEMatMul.cpp
index 58640f4..31898ba 100644
--- a/src/runtime/NEON/functions/NEMatMul.cpp
+++ b/src/runtime/NEON/functions/NEMatMul.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/operators/CpuMatMul.h"
 
@@ -33,23 +34,27 @@
 {
 struct NEMatMul::Impl
 {
-    const ITensor                  *lhs{ nullptr };
-    const ITensor                  *rhs{ nullptr };
-    ITensor                        *output{ nullptr };
-    std::unique_ptr<cpu::CpuMatMul> op{ nullptr };
+    const ITensor                  *lhs{nullptr};
+    const ITensor                  *rhs{nullptr};
+    ITensor                        *output{nullptr};
+    std::unique_ptr<cpu::CpuMatMul> op{nullptr};
     MemoryGroup                     memory_group{};
     WorkspaceData<Tensor>           workspace_tensors{};
     ITensorPack                     run_pack{};
 };
 
-NEMatMul::NEMatMul()
-    : _impl(std::make_unique<Impl>())
+NEMatMul::NEMatMul() : _impl(std::make_unique<Impl>())
 {
 }
 
 NEMatMul::~NEMatMul() = default;
 
-void NEMatMul::configure(ITensor *lhs, ITensor *rhs, ITensor *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+void NEMatMul::configure(ITensor                   *lhs,
+                         ITensor                   *rhs,
+                         ITensor                   *output,
+                         const MatMulInfo          &info,
+                         const CpuMatMulSettings   &settings,
+                         const ActivationLayerInfo &act_info)
 {
     _impl->lhs    = lhs;
     _impl->rhs    = rhs;
@@ -58,11 +63,16 @@
     ARM_COMPUTE_ERROR_ON_NULLPTR(_impl->lhs, _impl->rhs, _impl->output);
     _impl->op = std::make_unique<cpu::CpuMatMul>();
     _impl->op->configure(lhs->info(), rhs->info(), output->info(), info, settings, act_info);
-    _impl->run_pack          = { { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs }, { ACL_DST, output } };
+    _impl->run_pack          = {{ACL_SRC_0, lhs}, {ACL_SRC_1, rhs}, {ACL_DST, output}};
     _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
 }
 
-Status NEMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *output, const MatMulInfo &info, const CpuMatMulSettings &settings, const ActivationLayerInfo &act_info)
+Status NEMatMul::validate(const ITensorInfo         *lhs,
+                          const ITensorInfo         *rhs,
+                          const ITensorInfo         *output,
+                          const MatMulInfo          &info,
+                          const CpuMatMulSettings   &settings,
+                          const ActivationLayerInfo &act_info)
 {
     return cpu::CpuMatMul::validate(lhs, rhs, output, info, settings, act_info);
 }
diff --git a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
index 97ddaea..c3861af 100644
--- a/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEMaxUnpoolingLayer.cpp
@@ -25,8 +25,9 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NEFill.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/kernels/CpuMaxUnpoolingLayerKernel.h"
 #include "src/cpu/operators/CpuMaxUnpooling.h"
@@ -35,20 +36,22 @@
 {
 struct NEMaxUnpoolingLayer::Impl
 {
-    const ITensor                        *src{ nullptr };
-    const ITensor                        *indices{ nullptr };
-    ITensor                              *dst{ nullptr };
-    std::unique_ptr<cpu::CpuMaxUnpooling> op{ nullptr };
+    const ITensor                        *src{nullptr};
+    const ITensor                        *indices{nullptr};
+    ITensor                              *dst{nullptr};
+    std::unique_ptr<cpu::CpuMaxUnpooling> op{nullptr};
 };
 
 NEMaxUnpoolingLayer::~NEMaxUnpoolingLayer() = default;
 
-NEMaxUnpoolingLayer::NEMaxUnpoolingLayer()
-    : _fill_func(), _impl()
+NEMaxUnpoolingLayer::NEMaxUnpoolingLayer() : _fill_func(), _impl()
 {
 }
 
-void NEMaxUnpoolingLayer::configure(ITensor *input, ITensor *indices, ITensor *output, const PoolingLayerInfo &pool_info)
+void NEMaxUnpoolingLayer::configure(ITensor                *input,
+                                    ITensor                *indices,
+                                    ITensor                *output,
+                                    const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, indices, output, pool_info);
 
@@ -64,7 +67,10 @@
     _impl->op->configure(input->info(), indices->info(), output->info(), pool_info);
 }
 
-Status NEMaxUnpoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *indices, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
+Status NEMaxUnpoolingLayer::validate(const ITensorInfo      *input,
+                                     const ITensorInfo      *indices,
+                                     const ITensorInfo      *output,
+                                     const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output, indices);
     ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuMaxUnpooling::validate(input, indices, output, pool_info));
diff --git a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
index 7626aa0..dec0dde 100644
--- a/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEMeanStdDevNormalizationLayer.h"
 
-#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEMeanStdDevNormalizationKernel.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NENormalizationLayer.cpp b/src/runtime/NEON/functions/NENormalizationLayer.cpp
index d3b1696..d6d2e9d 100644
--- a/src/runtime/NEON/functions/NENormalizationLayer.cpp
+++ b/src/runtime/NEON/functions/NENormalizationLayer.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NENormalizationLayerKernel.h"
 
@@ -61,13 +62,16 @@
     _input_squared.allocator()->allocate();
 }
 
-Status NENormalizationLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const NormalizationLayerInfo &norm_info)
+Status NENormalizationLayer::validate(const ITensorInfo            *input,
+                                      const ITensorInfo            *output,
+                                      const NormalizationLayerInfo &norm_info)
 {
     // Perform validation step
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
 
     ARM_COMPUTE_RETURN_ON_ERROR(NENormalizationLayerKernel::validate(input, input, output, norm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(input, input, output, 1.0f, ConvertPolicy::SATURATE,
+                                                                    RoundingPolicy::TO_ZERO));
 
     return Status{};
 }
@@ -78,4 +82,4 @@
     _multiply_f.run();
     NEScheduler::get().schedule(_norm_kernel.get(), Window::DimY);
 }
-}
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPReluLayer.cpp b/src/runtime/NEON/functions/NEPReluLayer.cpp
index 80c5690..963e68b 100644
--- a/src/runtime/NEON/functions/NEPReluLayer.cpp
+++ b/src/runtime/NEON/functions/NEPReluLayer.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEPReluLayer.h"
 
 #include "arm_compute/core/ITensor.h"
+
 #include "src/cpu/operators/CpuPRelu.h"
 
 namespace arm_compute
@@ -32,17 +33,16 @@
 
 struct NEPReluLayer::Impl
 {
-    const ITensor                *src_0{ nullptr };
-    const ITensor                *src_1{ nullptr };
-    ITensor                      *dst{ nullptr };
-    std::unique_ptr<OperatorType> op{ nullptr };
+    const ITensor                *src_0{nullptr};
+    const ITensor                *src_1{nullptr};
+    ITensor                      *dst{nullptr};
+    std::unique_ptr<OperatorType> op{nullptr};
 };
 
-NEPReluLayer::NEPReluLayer()
-    : _impl(std::make_unique<Impl>())
+NEPReluLayer::NEPReluLayer() : _impl(std::make_unique<Impl>())
 {
 }
-NEPReluLayer::NEPReluLayer(NEPReluLayer &&) = default;
+NEPReluLayer::NEPReluLayer(NEPReluLayer &&)            = default;
 NEPReluLayer &NEPReluLayer::operator=(NEPReluLayer &&) = default;
 NEPReluLayer::~NEPReluLayer()                          = default;
 
diff --git a/src/runtime/NEON/functions/NEPadLayer.cpp b/src/runtime/NEON/functions/NEPadLayer.cpp
index 8bacdd3..253566d 100644
--- a/src/runtime/NEON/functions/NEPadLayer.cpp
+++ b/src/runtime/NEON/functions/NEPadLayer.cpp
@@ -23,13 +23,13 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEPadLayer.h"
 
-#include "arm_compute/runtime/NEON/NEScheduler.h"
-
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
-#include "src/core/NEON/kernels/NEPadLayerKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/kernels/NEPadLayerKernel.h"
 
 namespace arm_compute
 {
@@ -38,9 +38,9 @@
 uint32_t last_padding_dimension(const PaddingList &padding)
 {
     int last_padding_dim = padding.size() - 1;
-    for(; last_padding_dim >= 0; --last_padding_dim)
+    for (; last_padding_dim >= 0; --last_padding_dim)
     {
-        if(padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0)
+        if (padding[last_padding_dim].first > 0 || padding[last_padding_dim].second > 0)
         {
             break;
         }
@@ -52,11 +52,22 @@
 NEPadLayer::~NEPadLayer() = default;
 
 NEPadLayer::NEPadLayer()
-    : _copy_function(), _pad_kernel(), _mode(), _padding(), _num_dimensions(0), _slice_functions(), _concat_functions(), _slice_results(), _concat_results()
+    : _copy_function(),
+      _pad_kernel(),
+      _mode(),
+      _padding(),
+      _num_dimensions(0),
+      _slice_functions(),
+      _concat_functions(),
+      _slice_results(),
+      _concat_results()
 {
 }
 
-void NEPadLayer::configure_constant_mode(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value)
+void NEPadLayer::configure_constant_mode(ITensor           *input,
+                                         ITensor           *output,
+                                         const PaddingList &padding,
+                                         const PixelValue   constant_value)
 {
     _pad_kernel = std::make_unique<NEPadLayerKernel>();
     _pad_kernel->configure(input, output, padding, constant_value, PaddingMode::CONSTANT);
@@ -85,20 +96,20 @@
     Coordinates ends_after{};
     Coordinates strides{};
     ITensor    *prev = input;
-    for(uint32_t i = 0; i < _num_dimensions; ++i)
+    for (uint32_t i = 0; i < _num_dimensions; ++i)
     {
         // Values in strides from the previous dimensions need to be set to 1 to avoid reversing again.
-        if(i > 0)
+        if (i > 0)
         {
             strides.set(i - 1, 1);
         }
 
-        if(_padding[i].first > 0 || _padding[i].second > 0)
+        if (_padding[i].first > 0 || _padding[i].second > 0)
         {
             // Set the starts, ends, and strides values for the current dimension.
             // Due to the bit masks passed to strided slice, the values below the current dimension in
             // starts and ends will be ignored so do not need to be modified.
-            if(_mode == PaddingMode::REFLECT)
+            if (_mode == PaddingMode::REFLECT)
             {
                 starts_before.set(i, _padding[i].first);
                 ends_before.set(i, 0);
@@ -124,11 +135,12 @@
 
             // Reflect the input values for the padding before and after the input.
             std::vector<const ITensor *> concat_vector;
-            if(_padding[i].first > 0)
+            if (_padding[i].first > 0)
             {
-                if(i < prev->info()->num_dimensions())
+                if (i < prev->info()->num_dimensions())
                 {
-                    _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides, begin_mask_before, end_mask_before);
+                    _slice_functions[2 * i].configure(prev, &_slice_results[2 * i], starts_before, ends_before, strides,
+                                                      begin_mask_before, end_mask_before);
                     concat_vector.emplace_back(&_slice_results[2 * i]);
                 }
                 else
@@ -138,11 +150,12 @@
                 }
             }
             concat_vector.push_back(prev);
-            if(_padding[i].second > 0)
+            if (_padding[i].second > 0)
             {
-                if(i < prev->info()->num_dimensions())
+                if (i < prev->info()->num_dimensions())
                 {
-                    _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after, strides, begin_mask_after, end_mask_after);
+                    _slice_functions[2 * i + 1].configure(prev, &_slice_results[2 * i + 1], starts_after, ends_after,
+                                                          strides, begin_mask_after, end_mask_after);
                     concat_vector.emplace_back(&_slice_results[2 * i + 1]);
                 }
                 else
@@ -154,12 +167,12 @@
             // Concatenate the padding before and after with the input.
             ITensor *out = (i == _num_dimensions - 1) ? output : &_concat_results[i];
             out->info()->set_quantization_info(output->info()->quantization_info());
-            for(auto &v : concat_vector)
+            for (auto &v : concat_vector)
             {
                 v->info()->set_quantization_info(input->info()->quantization_info());
             }
             _concat_functions[i].configure(concat_vector, out, i);
-            if(i != _num_dimensions - 1)
+            if (i != _num_dimensions - 1)
             {
                 _concat_results[i].allocator()->allocate();
             }
@@ -170,7 +183,11 @@
     }
 }
 
-void NEPadLayer::configure(ITensor *input, ITensor *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+void NEPadLayer::configure(ITensor           *input,
+                           ITensor           *output,
+                           const PaddingList &padding,
+                           const PixelValue   constant_value,
+                           const PaddingMode  mode)
 {
     ARM_COMPUTE_ERROR_THROW_ON(validate(input->info(), output->info(), padding, constant_value, mode));
     ARM_COMPUTE_LOG_PARAMS(input, output, padding, constant_value, mode);
@@ -178,15 +195,16 @@
     _padding = padding;
     _mode    = mode;
 
-    const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
+    const TensorShape padded_shape =
+        misc::shape_calculator::compute_padded_shape(input->info()->tensor_shape(), _padding);
 
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(padded_shape));
 
     // Find the last dimension requiring padding so that it is known when to write to output and whether any padding is applied.
     _num_dimensions = last_padding_dimension(padding) + 1;
-    if(_num_dimensions > 0)
+    if (_num_dimensions > 0)
     {
-        switch(_mode)
+        switch (_mode)
         {
             case PaddingMode::CONSTANT:
             {
@@ -210,19 +228,23 @@
     }
 }
 
-Status NEPadLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PaddingList &padding, const PixelValue constant_value, const PaddingMode mode)
+Status NEPadLayer::validate(const ITensorInfo *input,
+                            const ITensorInfo *output,
+                            const PaddingList &padding,
+                            const PixelValue   constant_value,
+                            const PaddingMode  mode)
 {
     ARM_COMPUTE_UNUSED(constant_value);
 
     const TensorShape padded_shape = misc::shape_calculator::compute_padded_shape(input->tensor_shape(), padding);
 
-    if(output->total_size() > 0)
+    if (output->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), padded_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
-    switch(mode)
+    switch (mode)
     {
         case PaddingMode::CONSTANT:
         {
@@ -231,9 +253,9 @@
         case PaddingMode::REFLECT:
         case PaddingMode::SYMMETRIC:
         {
-            for(uint32_t i = 0; i < padding.size(); ++i)
+            for (uint32_t i = 0; i < padding.size(); ++i)
             {
-                if(mode == PaddingMode::REFLECT)
+                if (mode == PaddingMode::REFLECT)
                 {
                     ARM_COMPUTE_RETURN_ERROR_ON(padding[i].first >= input->dimension(i));
                     ARM_COMPUTE_RETURN_ERROR_ON(padding[i].second >= input->dimension(i));
@@ -256,9 +278,9 @@
 
 void NEPadLayer::run()
 {
-    if(_num_dimensions > 0)
+    if (_num_dimensions > 0)
     {
-        switch(_mode)
+        switch (_mode)
         {
             case PaddingMode::CONSTANT:
             {
@@ -268,15 +290,15 @@
             case PaddingMode::REFLECT:
             case PaddingMode::SYMMETRIC:
             {
-                for(uint32_t i = 0; i < _num_dimensions; ++i)
+                for (uint32_t i = 0; i < _num_dimensions; ++i)
                 {
-                    if(_padding[i].first > 0 || _padding[i].second > 0)
+                    if (_padding[i].first > 0 || _padding[i].second > 0)
                     {
-                        if(_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
+                        if (_padding[i].first > 0 && _slice_results[2 * i].info()->total_size() > 0)
                         {
                             _slice_functions[2 * i].run();
                         }
-                        if(_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
+                        if (_padding[i].second > 0 && _slice_results[2 * i + 1].info()->total_size() > 0)
                         {
                             _slice_functions[2 * i + 1].run();
                         }
diff --git a/src/runtime/NEON/functions/NEPermute.cpp b/src/runtime/NEON/functions/NEPermute.cpp
index 517b86a..80cd04c 100644
--- a/src/runtime/NEON/functions/NEPermute.cpp
+++ b/src/runtime/NEON/functions/NEPermute.cpp
@@ -24,19 +24,19 @@
 #include "arm_compute/runtime/NEON/functions/NEPermute.h"
 
 #include "arm_compute/core/Validate.h"
+
 #include "src/cpu/operators/CpuPermute.h"
 
 namespace arm_compute
 {
 struct NEPermute::Impl
 {
-    const ITensor                   *src{ nullptr };
-    ITensor                         *dst{ nullptr };
-    std::unique_ptr<cpu::CpuPermute> op{ nullptr };
+    const ITensor                   *src{nullptr};
+    ITensor                         *dst{nullptr};
+    std::unique_ptr<cpu::CpuPermute> op{nullptr};
 };
 
-NEPermute::NEPermute()
-    : _impl(std::make_unique<Impl>())
+NEPermute::NEPermute() : _impl(std::make_unique<Impl>())
 {
 }
 
diff --git a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
index ad83a26..97155a9 100644
--- a/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
+++ b/src/runtime/NEON/functions/NEPixelWiseMultiplication.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h"
 
 #include "arm_compute/core/ITensor.h"
+
 #include "src/cpu/operators/CpuMul.h"
 
 #include <utility>
@@ -32,32 +33,42 @@
 {
 struct NEPixelWiseMultiplication::Impl
 {
-    const ITensor               *src_0{ nullptr };
-    const ITensor               *src_1{ nullptr };
-    ITensor                     *dst{ nullptr };
-    std::unique_ptr<cpu::CpuMul> op{ nullptr };
+    const ITensor               *src_0{nullptr};
+    const ITensor               *src_1{nullptr};
+    ITensor                     *dst{nullptr};
+    std::unique_ptr<cpu::CpuMul> op{nullptr};
 };
 
-NEPixelWiseMultiplication::NEPixelWiseMultiplication()
-    : _impl(std::make_unique<Impl>())
+NEPixelWiseMultiplication::NEPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
 {
 }
 NEPixelWiseMultiplication::~NEPixelWiseMultiplication() = default;
 
-Status NEPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+Status NEPixelWiseMultiplication::validate(const ITensorInfo         *input1,
+                                           const ITensorInfo         *input2,
+                                           const ITensorInfo         *output,
+                                           float                      scale,
+                                           ConvertPolicy              overflow_policy,
+                                           RoundingPolicy             rounding_policy,
                                            const ActivationLayerInfo &act_info)
 {
     return cpu::CpuMul::validate(input1, input2, output, scale, overflow_policy, rounding_policy, act_info);
 }
 
-void NEPixelWiseMultiplication::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy,
+void NEPixelWiseMultiplication::configure(const ITensor             *input1,
+                                          const ITensor             *input2,
+                                          ITensor                   *output,
+                                          float                      scale,
+                                          ConvertPolicy              overflow_policy,
+                                          RoundingPolicy             rounding_policy,
                                           const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
     _impl->dst   = output;
     _impl->op    = std::make_unique<cpu::CpuMul>();
-    _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy, act_info);
+    _impl->op->configure(input1->info(), input2->info(), output->info(), scale, overflow_policy, rounding_policy,
+                         act_info);
 }
 
 void NEPixelWiseMultiplication::run()
@@ -71,24 +82,29 @@
 
 struct NEComplexPixelWiseMultiplication::Impl
 {
-    ITensor                            *src_0{ nullptr };
-    ITensor                            *src_1{ nullptr };
-    ITensor                            *dst{ nullptr };
-    std::unique_ptr<cpu::CpuComplexMul> op{ nullptr };
+    ITensor                            *src_0{nullptr};
+    ITensor                            *src_1{nullptr};
+    ITensor                            *dst{nullptr};
+    std::unique_ptr<cpu::CpuComplexMul> op{nullptr};
 };
 
-NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication()
-    : _impl(std::make_unique<Impl>())
+NEComplexPixelWiseMultiplication::NEComplexPixelWiseMultiplication() : _impl(std::make_unique<Impl>())
 {
 }
 NEComplexPixelWiseMultiplication::~NEComplexPixelWiseMultiplication() = default;
 
-Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ActivationLayerInfo &act_info)
+Status NEComplexPixelWiseMultiplication::validate(const ITensorInfo         *input1,
+                                                  const ITensorInfo         *input2,
+                                                  const ITensorInfo         *output,
+                                                  const ActivationLayerInfo &act_info)
 {
     return cpu::CpuComplexMul::validate(input1, input2, output, act_info);
 }
 
-void NEComplexPixelWiseMultiplication::configure(ITensor *input1, ITensor *input2, ITensor *output, const ActivationLayerInfo &act_info)
+void NEComplexPixelWiseMultiplication::configure(ITensor                   *input1,
+                                                 ITensor                   *input2,
+                                                 ITensor                   *output,
+                                                 const ActivationLayerInfo &act_info)
 {
     _impl->src_0 = input1;
     _impl->src_1 = input2;
diff --git a/src/runtime/NEON/functions/NEPooling3dLayer.cpp b/src/runtime/NEON/functions/NEPooling3dLayer.cpp
index 53f9dbf0..e017e8c 100644
--- a/src/runtime/NEON/functions/NEPooling3dLayer.cpp
+++ b/src/runtime/NEON/functions/NEPooling3dLayer.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/operators/CpuPool3d.h"
 
@@ -33,9 +34,9 @@
 {
 struct NEPooling3dLayer::Impl
 {
-    const ITensor                  *src{ nullptr };
-    ITensor                        *dst{ nullptr };
-    std::unique_ptr<cpu::CpuPool3d> op{ nullptr };
+    const ITensor                  *src{nullptr};
+    ITensor                        *dst{nullptr};
+    std::unique_ptr<cpu::CpuPool3d> op{nullptr};
     MemoryGroup                     memory_group{};
     ITensorPack                     run_pack{};
     WorkspaceData<Tensor>           workspace_tensors{};
@@ -43,8 +44,7 @@
 
 NEPooling3dLayer::~NEPooling3dLayer() = default;
 
-NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _impl(std::make_unique<Impl>())
+NEPooling3dLayer::NEPooling3dLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
 {
     _impl->memory_group = MemoryGroup(std::move(memory_manager));
 }
@@ -56,11 +56,12 @@
     _impl->op  = std::make_unique<cpu::CpuPool3d>();
     _impl->op->configure(input->info(), output->info(), pool_info);
 
-    _impl->run_pack          = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst } };
+    _impl->run_pack          = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST_0, _impl->dst}};
     _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
 }
 
-Status NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
+Status
+NEPooling3dLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const Pooling3dLayerInfo &pool_info)
 {
     return cpu::CpuPool3d::validate(input, output, pool_info);
 }
@@ -72,4 +73,4 @@
     _impl->op->run(_impl->run_pack);
 }
 
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEPoolingLayer.cpp b/src/runtime/NEON/functions/NEPoolingLayer.cpp
index 5a3b9c5..eb9125b 100644
--- a/src/runtime/NEON/functions/NEPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEPoolingLayer.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/cpu/operators/CpuPool2d.h"
 
@@ -33,10 +34,10 @@
 {
 struct NEPoolingLayer::Impl
 {
-    ITensor                        *src{ nullptr };
-    ITensor                        *dst{ nullptr };
-    ITensor                        *indices{ nullptr };
-    std::unique_ptr<cpu::CpuPool2d> op{ nullptr };
+    ITensor                        *src{nullptr};
+    ITensor                        *dst{nullptr};
+    ITensor                        *indices{nullptr};
+    std::unique_ptr<cpu::CpuPool2d> op{nullptr};
     MemoryGroup                     memory_group{};
     ITensorPack                     run_pack{};
     WorkspaceData<Tensor>           workspace_tensors{};
@@ -44,8 +45,7 @@
 
 NEPoolingLayer::~NEPoolingLayer() = default;
 
-NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _impl(std::make_unique<Impl>())
+NEPoolingLayer::NEPoolingLayer(std::shared_ptr<IMemoryManager> memory_manager) : _impl(std::make_unique<Impl>())
 {
     _impl->memory_group = MemoryGroup(std::move(memory_manager));
 }
@@ -58,11 +58,16 @@
     _impl->op      = std::make_unique<cpu::CpuPool2d>();
     _impl->op->configure(input->info(), output->info(), pool_info, (indices) ? indices->info() : nullptr);
 
-    _impl->run_pack          = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST_0, _impl->dst }, { TensorType::ACL_DST_1, _impl->indices } };
+    _impl->run_pack          = {{TensorType::ACL_SRC, _impl->src},
+                                {TensorType::ACL_DST_0, _impl->dst},
+                                {TensorType::ACL_DST_1, _impl->indices}};
     _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
 }
 
-Status NEPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info, const ITensorInfo *indices)
+Status NEPoolingLayer::validate(const ITensorInfo      *input,
+                                const ITensorInfo      *output,
+                                const PoolingLayerInfo &pool_info,
+                                const ITensorInfo      *indices)
 {
     return cpu::CpuPool2d::validate(input, output, pool_info, indices);
 }
diff --git a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
index aba0923..dbb6bf9 100644
--- a/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
+++ b/src/runtime/NEON/functions/NEPriorBoxLayer.cpp
@@ -27,15 +27,19 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEPriorBoxLayerKernel.h"
 
 namespace arm_compute
 {
-void NEPriorBoxLayer::configure(const ITensor *input1, const ITensor *input2, ITensor *output, const PriorBoxLayerInfo &info)
+void NEPriorBoxLayer::configure(const ITensor           *input1,
+                                const ITensor           *input2,
+                                ITensor                 *output,
+                                const PriorBoxLayerInfo &info)
 {
     ARM_COMPUTE_LOG_PARAMS(input1, input2, output, info);
 
@@ -44,7 +48,10 @@
     _kernel = std::move(k);
 }
 
-Status NEPriorBoxLayer::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const PriorBoxLayerInfo &info)
+Status NEPriorBoxLayer::validate(const ITensorInfo       *input1,
+                                 const ITensorInfo       *input2,
+                                 const ITensorInfo       *output,
+                                 const PriorBoxLayerInfo &info)
 {
     return NEPriorBoxLayerKernel::validate(input1, input2, output, info);
 }
diff --git a/src/runtime/NEON/functions/NEQLSTMLayer.cpp b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
index 2caaea0..dd78d10 100644
--- a/src/runtime/NEON/functions/NEQLSTMLayer.cpp
+++ b/src/runtime/NEON/functions/NEQLSTMLayer.cpp
@@ -27,13 +27,14 @@
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/QuantizationInfo.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
-#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 #include "src/core/helpers/WindowHelpers.h"
+#include "src/core/NEON/kernels/NEQLSTMLayerNormalizationKernel.h"
 #include "src/cpu/kernels/CpuGemmLowpMatrixReductionKernel.h"
 
 namespace arm_compute
@@ -41,12 +42,19 @@
 using namespace arm_compute::utils::info_helpers;
 namespace
 {
-Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info, const ITensorInfo *mm_input, const ITensorInfo *mm_weights, const ITensorInfo *bias,
-                   float gemmlowp_scale, const TensorInfo *mm_res_info, const TensorInfo *outstage_tensor_info)
+Status validate_mm(GEMMLowpOutputStageInfo &gemmlowp_info,
+                   const ITensorInfo       *mm_input,
+                   const ITensorInfo       *mm_weights,
+                   const ITensorInfo       *bias,
+                   float                    gemmlowp_scale,
+                   const TensorInfo        *mm_res_info,
+                   const TensorInfo        *outstage_tensor_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpMatrixMultiplyCore::validate(mm_input, mm_weights, nullptr, mm_res_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+        gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEGEMMLowpOutputStage::validate(mm_res_info, bias, outstage_tensor_info, gemmlowp_info));
     return Status{};
 }
 } // namespace
@@ -55,10 +63,7 @@
 {
     // Output quantization scale will be different, but ignored here
     // since it will be configured at configure() stage.
-    const TensorInfo out
-    {
-        in
-    };
+    const TensorInfo out{in};
     return NEQLSTMLayerNormalizationKernel::validate(&in, &out, &weight, &bias);
 }
 
@@ -98,14 +103,12 @@
 
 void NEQLSTMLayer::TensorCopyKernel::run()
 {
-    Iterator input_iter{ _src, _window };
-    Iterator output_iter{ _dst, _window };
+    Iterator input_iter{_src, _window};
+    Iterator output_iter{_dst, _window};
 
-    execute_window_loop(_window, [&](const Coordinates &)
-    {
-        memcpy(output_iter.ptr(), input_iter.ptr(), _row_size);
-    },
-    input_iter, output_iter);
+    execute_window_loop(
+        _window, [&](const Coordinates &) { memcpy(output_iter.ptr(), input_iter.ptr(), _row_size); }, input_iter,
+        output_iter);
 }
 
 NEQLSTMLayer::~NEQLSTMLayer() = default;
@@ -191,10 +194,17 @@
     _memory_group = MemoryGroup(std::move(memory_manager));
 }
 
-void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm, NEGEMMLowpOutputStage &outstage, GEMMLowpOutputStageInfo &gemmlowp_info,
-                                const ITensor *mm_input, const ITensor *mm_weights, const ITensor *bias,
-                                Tensor *mm_res, Tensor *outstage_res, float gemmlowp_scale,
-                                const TensorInfo &mm_res_info, const TensorInfo &outstage_tensor_info)
+void NEQLSTMLayer::configure_mm(NEGEMMLowpMatrixMultiplyCore &mm,
+                                NEGEMMLowpOutputStage        &outstage,
+                                GEMMLowpOutputStageInfo      &gemmlowp_info,
+                                const ITensor                *mm_input,
+                                const ITensor                *mm_weights,
+                                const ITensor                *bias,
+                                Tensor                       *mm_res,
+                                Tensor                       *outstage_res,
+                                float                         gemmlowp_scale,
+                                const TensorInfo             &mm_res_info,
+                                const TensorInfo             &outstage_tensor_info)
 {
     _memory_group.manage(mm_res);
     _memory_group.manage(outstage_res);
@@ -206,66 +216,87 @@
     mm.configure(mm_input, mm_weights, nullptr, mm_res);
 
     // Configure output stage
-    quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
+    quantization::calculate_quantized_multiplier(gemmlowp_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                 &gemmlowp_info.gemmlowp_shift);
     outstage.configure(mm_res, bias, outstage_res, gemmlowp_info);
     mm_res->allocator()->allocate();
 }
 
-void NEQLSTMLayer::configure(const ITensor *input,
-                             const ITensor *input_to_forget_weights, const ITensor *input_to_cell_weights, const ITensor *input_to_output_weights,
-                             const ITensor *recurrent_to_forget_weights, const ITensor *recurrent_to_cell_weights, const ITensor *recurrent_to_output_weights,
-                             const ITensor *forget_gate_bias, const ITensor *cell_bias, const ITensor *output_gate_bias,
-                             const ITensor *cell_state_in, ITensor *output_state_in,
-                             ITensor *cell_state_out, ITensor *output_state_out, ITensor *output,
+void NEQLSTMLayer::configure(const ITensor             *input,
+                             const ITensor             *input_to_forget_weights,
+                             const ITensor             *input_to_cell_weights,
+                             const ITensor             *input_to_output_weights,
+                             const ITensor             *recurrent_to_forget_weights,
+                             const ITensor             *recurrent_to_cell_weights,
+                             const ITensor             *recurrent_to_output_weights,
+                             const ITensor             *forget_gate_bias,
+                             const ITensor             *cell_bias,
+                             const ITensor             *output_gate_bias,
+                             const ITensor             *cell_state_in,
+                             ITensor                   *output_state_in,
+                             ITensor                   *cell_state_out,
+                             ITensor                   *output_state_out,
+                             ITensor                   *output,
                              const LSTMParams<ITensor> &lstm_params)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
                                  recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                                 forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+                                 forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+                                 cell_state_out, output_state_out);
 
     ARM_COMPUTE_LOG_PARAMS(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
                            recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights,
-                           forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in, cell_state_out, output_state_out);
+                           forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
+                           cell_state_out, output_state_out);
 
     // Set lstm parameters
     LSTMParams<ITensorInfo> lstm_params_info{};
     build_lstm_params_tensor_info(lstm_params, &lstm_params_info);
 
-    _input_to_forget_weights_transposed.info()->set_quantization_info(input_to_forget_weights->info()->quantization_info());
+    _input_to_forget_weights_transposed.info()->set_quantization_info(
+        input_to_forget_weights->info()->quantization_info());
     _input_to_cell_weights_transposed.info()->set_quantization_info(input_to_cell_weights->info()->quantization_info());
-    _input_to_output_weights_transposed.info()->set_quantization_info(input_to_output_weights->info()->quantization_info());
-    _recurrent_to_forget_weights_transposed.info()->set_quantization_info(recurrent_to_forget_weights->info()->quantization_info());
-    _recurrent_to_cell_weights_transposed.info()->set_quantization_info(recurrent_to_cell_weights->info()->quantization_info());
-    _recurrent_to_output_weights_transposed.info()->set_quantization_info(recurrent_to_output_weights->info()->quantization_info());
+    _input_to_output_weights_transposed.info()->set_quantization_info(
+        input_to_output_weights->info()->quantization_info());
+    _recurrent_to_forget_weights_transposed.info()->set_quantization_info(
+        recurrent_to_forget_weights->info()->quantization_info());
+    _recurrent_to_cell_weights_transposed.info()->set_quantization_info(
+        recurrent_to_cell_weights->info()->quantization_info());
+    _recurrent_to_output_weights_transposed.info()->set_quantization_info(
+        recurrent_to_output_weights->info()->quantization_info());
 
-    if(input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED)
+    if (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED)
     {
         _convert_input_to_forget_weights_to_qsymm8 = true;
         // Setup dequantize output tensor to go from QASYMM8_SIGNED -> F32
 
-        _input_to_forget_weights_f32.allocator()->init(TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32)
-                                                       .set_data_layout(input_to_forget_weights->info()->data_layout()));
+        _input_to_forget_weights_f32.allocator()->init(
+            TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::F32)
+                .set_data_layout(input_to_forget_weights->info()->data_layout()));
         // Setup the quantize output tensor to go from F32 -> QSYMM8
-        _input_to_forget_weights_symm8.allocator()->init((TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8)
-                                                          .set_data_layout(input_to_forget_weights->info()->data_layout())
-                                                          .set_quantization_info(input_to_forget_weights->info()->quantization_info())));
+        _input_to_forget_weights_symm8.allocator()->init(
+            (TensorInfo(input_to_forget_weights->info()->tensor_shape(), 1, DataType::QSYMM8)
+                 .set_data_layout(input_to_forget_weights->info()->data_layout())
+                 .set_quantization_info(input_to_forget_weights->info()->quantization_info())));
 
         _dequantize_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_f32);
         _quantize_input_to_forget_weights.configure(&_input_to_forget_weights_f32, &_input_to_forget_weights_symm8);
 
-        ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(), input_to_output_weights->info(),
-                                                          recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
-                                                          forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
-                                                          cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
-                                                          lstm_params_info));
+        ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(
+            input->info(), _input_to_forget_weights_symm8.info(), input_to_cell_weights->info(),
+            input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(),
+            recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+            cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(),
+            output->info(), lstm_params_info));
     }
     else
     {
-        ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(), input_to_output_weights->info(),
-                                                          recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(), recurrent_to_output_weights->info(),
-                                                          forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
-                                                          cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(), output->info(),
-                                                          lstm_params_info));
+        ARM_COMPUTE_ERROR_THROW_ON(NEQLSTMLayer::validate(
+            input->info(), input_to_forget_weights->info(), input_to_cell_weights->info(),
+            input_to_output_weights->info(), recurrent_to_forget_weights->info(), recurrent_to_cell_weights->info(),
+            recurrent_to_output_weights->info(), forget_gate_bias->info(), cell_bias->info(), output_gate_bias->info(),
+            cell_state_in->info(), output_state_in->info(), cell_state_out->info(), output_state_out->info(),
+            output->info(), lstm_params_info));
     }
 
     const int batch_size  = input->info()->dimension(1);
@@ -277,7 +308,9 @@
     const UniformQuantizationInfo qoutput_state_in = output_state_in->info()->quantization_info().uniform();
 
     _projection_bias             = lstm_params.projection_bias();
-    _input_to_forget_weights     = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED) ? &_input_to_forget_weights_symm8 : input_to_forget_weights;
+    _input_to_forget_weights     = (input_to_forget_weights->info()->data_type() == DataType::QASYMM8_SIGNED)
+                                       ? &_input_to_forget_weights_symm8
+                                       : input_to_forget_weights;
     _input_to_cell_weights       = input_to_cell_weights;
     _input_to_output_weights     = input_to_output_weights;
     _recurrent_to_forget_weights = recurrent_to_forget_weights;
@@ -287,7 +320,7 @@
 
     // Layer normalization
     _has_layer_norm = lstm_params.use_layer_norm();
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         set_layer_norm_weight(lstm_params.forget_layer_norm_weights(), LayerNormGate::Forget);
         set_layer_norm_weight(lstm_params.cell_layer_norm_weights(), LayerNormGate::Cell);
@@ -309,22 +342,25 @@
 
     // Calculate quantized parameters for clipping.
     int16_t quantized_cell_clip = 0;
-    if(lstm_params.cell_clip() > 0.0f)
+    if (lstm_params.cell_clip() > 0.0f)
     {
         quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
     }
     _has_cell_clipping = quantized_cell_clip > 0;
 
     // Precompute effective bias for optimizing the matmul computations.
-    if(!_has_cifg)
+    if (!_has_cifg)
     {
         _input_to_input_weights     = lstm_params.input_to_input_weights();
         _recurrent_to_input_weights = lstm_params.recurrent_to_input_weights();
 
         _input_to_input_reduction     = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
         _recurrent_to_input_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
-        _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-        _recurrent_to_input_reduction->configure(_recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+        _input_to_input_reduction->configure(_input_to_input_weights->info(), _input_to_input_eff_bias.info(),
+                                             GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+        _recurrent_to_input_reduction->configure(
+            _recurrent_to_input_weights->info(), _recurrent_to_input_eff_bias.info(),
+            GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
     }
 
     _input_to_forget_reduction     = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
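The matrix-A reduction kernels configured above precompute, once, the offset-correction term of each quantized matmul: with an asymmetric input x_q (zero point z_x) and a weight row W_u, (x_q - z_x) . W_u equals x_q . W_u - z_x * sum(W_u), so the second term can be folded into an effective bias. A standalone sketch of that precompute, assuming row-major int8 weights (buffer layout and names are illustrative, not the library kernel):

    #include <cstdint>
    #include <vector>

    // eff_bias[u] = -input_offset * sum_k W[u][k]
    std::vector<int32_t> effective_bias(const std::vector<int8_t> &w,
                                        int num_units, int input_size, int32_t input_offset)
    {
        std::vector<int32_t> bias(num_units, 0);
        for (int u = 0; u < num_units; ++u)
        {
            int32_t row_sum = 0;
            for (int k = 0; k < input_size; ++k)
                row_sum += w[u * input_size + k];
            bias[u] = -input_offset * row_sum; // merged with the gate bias at run time
        }
        return bias;
    }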
@@ -334,19 +370,31 @@
     _input_to_output_reduction     = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
     _recurrent_to_output_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
 
-    _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_forget_reduction->configure(recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
-    _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_cell_reduction->configure(recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
-    _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
-    _recurrent_to_output_reduction->configure(recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(), GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
-    if(_has_projection)
+    _input_to_forget_reduction->configure(input_to_forget_weights->info(), _input_to_forget_eff_bias.info(),
+                                          GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_forget_reduction->configure(
+        recurrent_to_forget_weights->info(), _recurrent_to_forget_eff_bias.info(),
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_cell_reduction->configure(input_to_cell_weights->info(), _input_to_cell_eff_bias.info(),
+                                        GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_cell_reduction->configure(
+        recurrent_to_cell_weights->info(), _recurrent_to_cell_eff_bias.info(),
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    _input_to_output_reduction->configure(input_to_output_weights->info(), _input_to_output_eff_bias.info(),
+                                          GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true));
+    _recurrent_to_output_reduction->configure(
+        recurrent_to_output_weights->info(), _recurrent_to_output_eff_bias.info(),
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true));
+    if (_has_projection)
     {
         _projection_reduction = std::make_unique<cpu::kernels::CpuGemmLowpMatrixAReductionKernel>();
-        _projection_reduction->configure(_projection_weights->info(), _projection_eff_bias.info(), GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
-        if(_projection_bias != nullptr)
+        _projection_reduction->configure(
+            _projection_weights->info(), _projection_eff_bias.info(),
+            GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true));
+        if (_projection_bias != nullptr)
         {
-            _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias, ConvertPolicy::SATURATE);
+            _projection_bias_add.configure(_projection_bias, &_projection_eff_bias, &_projection_eff_bias,
+                                           ConvertPolicy::SATURATE);
         }
     }
 
@@ -354,15 +402,19 @@
     _transpose_input_to_forget_weights.configure(input_to_forget_weights, &_input_to_forget_weights_transposed);
     _transpose_input_to_cell_weights.configure(input_to_cell_weights, &_input_to_cell_weights_transposed);
     _transpose_input_to_output_weights.configure(input_to_output_weights, &_input_to_output_weights_transposed);
-    _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights, &_recurrent_to_forget_weights_transposed);
+    _transpose_recurrent_to_forget_weights.configure(recurrent_to_forget_weights,
+                                                     &_recurrent_to_forget_weights_transposed);
     _transpose_recurrent_to_cell_weights.configure(recurrent_to_cell_weights, &_recurrent_to_cell_weights_transposed);
-    _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights, &_recurrent_to_output_weights_transposed);
-    if(!_has_cifg)
+    _transpose_recurrent_to_output_weights.configure(recurrent_to_output_weights,
+                                                     &_recurrent_to_output_weights_transposed);
+    if (!_has_cifg)
     {
-        _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(), &_input_to_input_weights_transposed);
-        _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(), &_recurrent_to_input_weights_transposed);
+        _transpose_input_to_input_weights.configure(lstm_params.input_to_input_weights(),
+                                                    &_input_to_input_weights_transposed);
+        _transpose_recurrent_to_input_weights.configure(lstm_params.recurrent_to_input_weights(),
+                                                        &_recurrent_to_input_weights_transposed);
     }
-    if(_has_projection)
+    if (_has_projection)
     {
         _transpose_projection_weights.configure(_projection_weights, &_projection_weights_transposed);
     }
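All weight matrices are transposed once at configure time so the per-timestep GEMMs can consume them directly; the one-off cost replaces a transpose on every run(). A minimal sketch of the operation on raw buffers (illustrative, not the NETranspose kernel):

    #include <cstdint>

    // dst[c][r] = src[r][c] for a rows x cols int8 matrix.
    void transpose(const int8_t *src, int8_t *dst, int rows, int cols)
    {
        for (int r = 0; r < rows; ++r)
            for (int c = 0; c < cols; ++c)
                dst[c * rows + r] = src[r * cols + c];
    }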
@@ -375,40 +427,52 @@
 
     const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
     // Forget gate.
-    const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
-    const float      input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
-    configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info,
-                 input, &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias,
-                 &_mm_input_to_forget_res, &_input_to_forget_outstage_res, input_to_forget_scale,
-                 mm_out_info, forget_gate_outstage_info);
+    const TensorInfo forget_gate_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+                                               QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+    const float      input_to_forget_scale = input_to_forget_weights->info()->quantization_info().uniform().scale *
+                                        qinput.scale / lstm_params.forget_intermediate_scale();
+    configure_mm(_mm_input_to_forget, _input_to_forget_outstage, gemmlowp_info, input,
+                 &_input_to_forget_weights_transposed, &_input_to_forget_eff_bias, &_mm_input_to_forget_res,
+                 &_input_to_forget_outstage_res, input_to_forget_scale, mm_out_info, forget_gate_outstage_info);
 
-    const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
-    configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info,
-                 output_state_in, &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias,
-                 &_mm_recurrent_to_forget_res, &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale,
-                 mm_out_info, forget_gate_outstage_info);
+    const float recurrent_to_forget_scale = recurrent_to_forget_weights->info()->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+    configure_mm(_mm_recurrent_to_forget, _recurrent_to_forget_outstage, gemmlowp_info, output_state_in,
+                 &_recurrent_to_forget_weights_transposed, &_recurrent_to_forget_eff_bias, &_mm_recurrent_to_forget_res,
+                 &_recurrent_to_forget_outstage_res, recurrent_to_forget_scale, mm_out_info, forget_gate_outstage_info);
 
-    _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
+    _accumulate_input_recurrent_forget.configure(&_input_to_forget_outstage_res, &_recurrent_to_forget_outstage_res,
+                                                 &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
     _input_to_forget_outstage_res.allocator()->allocate();
 
-    if(_has_peephole)
+    if (_has_peephole)
     {
         _mul_cell_to_forget_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
         _memory_group.manage(&_mul_cell_to_forget_res);
-        _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(), &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-        _cell_to_forget_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
+        _pixelwise_mul_cell_to_forget.configure(cell_state_in, lstm_params.cell_to_forget_weights(),
+                                                &_mul_cell_to_forget_res, 1.f, ConvertPolicy::SATURATE,
+                                                RoundingPolicy::TO_ZERO);
+        _cell_to_forget_outstage_res.allocator()->init(
+            TensorInfo(_mul_cell_to_forget_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+                       QuantizationInfo(lstm_params.forget_intermediate_scale(), 0)));
         _memory_group.manage(&_cell_to_forget_outstage_res);
-        const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
-        quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
-        _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res, gemmlowp_info);
+        const float cell_to_forget_scale =
+            std::pow(2, cell_shift) *
+            lstm_params.cell_to_forget_weights()->info()->quantization_info().uniform().scale /
+            lstm_params.forget_intermediate_scale();
+        quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                     &gemmlowp_info.gemmlowp_shift);
+        _cell_to_forget_outstage.configure(&_mul_cell_to_forget_res, nullptr, &_cell_to_forget_outstage_res,
+                                           gemmlowp_info);
         _mul_cell_to_forget_res.allocator()->allocate();
-        _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res, &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
+        _accumulate_cell_forget.configure(&_recurrent_to_forget_outstage_res, &_cell_to_forget_outstage_res,
+                                          &_recurrent_to_forget_outstage_res, ConvertPolicy::SATURATE);
         _cell_to_forget_outstage_res.allocator()->allocate();
     }
 
     Tensor *forget_activation_input = &_recurrent_to_forget_outstage_res;
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         configure_layer_norm(LayerNormGate::Forget, forget_activation_input);
         forget_activation_input->allocator()->allocate();
@@ -417,33 +481,36 @@
 
     // Output quantization info of Sigmoid and Tanh activations
     const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
-    const TensorInfo       forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+    const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
 
     _memory_group.manage(&_forget_gate);
     _forget_gate.allocator()->init(forget_gate_info);
-    _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _forget_gate_sigmoid.configure(forget_activation_input, &_forget_gate,
+                                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     forget_activation_input->allocator()->allocate();
 
     // Modulation gate.
-    const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
-    const float      input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
-    configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info,
-                 input, &_input_to_cell_weights_transposed, &_input_to_cell_eff_bias,
-                 &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
+    const TensorInfo cell_outstage_info(mm_out_info.tensor_shape(), 1, DataType::QSYMM16,
+                                        QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+    const float      input_to_cell_scale = input_to_cell_weights->info()->quantization_info().uniform().scale *
+                                      qinput.scale / lstm_params.cell_intermediate_scale();
+    configure_mm(_mm_input_to_cell, _input_to_cell_outstage, gemmlowp_info, input, &_input_to_cell_weights_transposed,
+                 &_input_to_cell_eff_bias, &_mm_input_to_cell_res, &_input_to_cell_outstage_res, input_to_cell_scale,
                  mm_out_info, cell_outstage_info);
 
-    const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
-    configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info,
-                 output_state_in, &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias,
-                 &_mm_recurrent_to_cell_res, &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale,
-                 mm_out_info, cell_outstage_info);
+    const float recurrent_to_cell_scale = recurrent_to_cell_weights->info()->quantization_info().uniform().scale *
+                                          qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+    configure_mm(_mm_recurrent_to_cell, _recurrent_to_cell_outstage, gemmlowp_info, output_state_in,
+                 &_recurrent_to_cell_weights_transposed, &_recurrent_to_cell_eff_bias, &_mm_recurrent_to_cell_res,
+                 &_recurrent_to_cell_outstage_res, recurrent_to_cell_scale, mm_out_info, cell_outstage_info);
 
-    _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE);
+    _accumulate_input_recurrent_modulation.configure(&_input_to_cell_outstage_res, &_recurrent_to_cell_outstage_res,
+                                                     &_recurrent_to_cell_outstage_res, ConvertPolicy::SATURATE);
     _input_to_cell_outstage_res.allocator()->allocate();
 
     Tensor *cell_activation_input = &_recurrent_to_cell_outstage_res;
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         configure_layer_norm(LayerNormGate::Cell, cell_activation_input);
         cell_activation_input->allocator()->allocate();
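The fixed QSYMM16 scale of 1/32768 chosen above for the sigmoid/tanh outputs makes the int16 grid span exactly [-1, 1), matching both activation ranges. A standalone illustration of the dequantization (values are examples):

    #include <cstdint>
    #include <cstdio>

    int main()
    {
        const float   scale = 1.f / 32768.f; // QSYMM16 gate scale
        const int16_t q     = 16384;         // example stored gate value
        std::printf("%f\n", q * scale);      // prints 0.500000
        return 0;
    }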
@@ -454,14 +521,15 @@
 
     _memory_group.manage(&_cell_gate);
     _cell_gate.allocator()->init(cell_gate_info);
-    _cell_gate_tanh.configure(cell_activation_input, &_cell_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+    _cell_gate_tanh.configure(cell_activation_input, &_cell_gate,
+                              ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
     cell_activation_input->allocator()->allocate();
 
     // Input gate.
     const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
     _input_gate.allocator()->init(input_gate_info);
     _memory_group.manage(&_input_gate);
-    if(_has_cifg)
+    if (_has_cifg)
     {
         _ones.allocator()->init(*_forget_gate.info());
         _input_gate_sub.configure(&_ones, &_forget_gate, &_input_gate, ConvertPolicy::SATURATE);
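With CIFG enabled the input gate is never computed from weights: it is derived as 1 - forget_gate via the saturating subtraction above. On the 2^-15 QSYMM16 grid, 1.0 saturates to the stored value 32767, so the per-element rule is (a sketch, illustrative):

    #include <algorithm>
    #include <cstdint>

    int16_t cifg_input_gate(int16_t forget_gate_q)
    {
        const int32_t diff = 32767 - static_cast<int32_t>(forget_gate_q); // "ones" tensor value
        return static_cast<int16_t>(std::min<int32_t>(32767, std::max<int32_t>(-32768, diff)));
    }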
@@ -469,104 +537,137 @@
     }
     else
     {
-        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
-        const float      input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
-        configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info,
-                     input, &_input_to_input_weights_transposed, &_input_to_input_eff_bias,
-                     &_mm_input_to_input_res, &_input_to_input_outstage_res, input_to_input_scale,
-                     mm_out_info, input_outstage_info);
+        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                             QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+        const float      input_to_input_scale = _input_to_input_weights->info()->quantization_info().uniform().scale *
+                                           qinput.scale / lstm_params.input_intermediate_scale();
+        configure_mm(_mm_input_to_input, _input_to_input_outstage, gemmlowp_info, input,
+                     &_input_to_input_weights_transposed, &_input_to_input_eff_bias, &_mm_input_to_input_res,
+                     &_input_to_input_outstage_res, input_to_input_scale, mm_out_info, input_outstage_info);
 
-        const float recurrent_to_input_scale = _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
-        configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info,
-                     output_state_in, &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
+        const float recurrent_to_input_scale =
+            _recurrent_to_input_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale /
+            lstm_params.input_intermediate_scale();
+        configure_mm(_mm_recurrent_to_input, _recurrent_to_input_outstage, gemmlowp_info, output_state_in,
+                     &_recurrent_to_input_weights_transposed, &_recurrent_to_input_eff_bias,
                      &_mm_recurrent_to_input_res, &_recurrent_to_input_outstage_res, recurrent_to_input_scale,
                      mm_out_info, input_outstage_info);
-        _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+        _accumulate_input_recurrent_input.configure(&_input_to_input_outstage_res, &_recurrent_to_input_outstage_res,
+                                                    &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
         _input_to_input_outstage_res.allocator()->allocate();
 
-        if(_has_peephole)
+        if (_has_peephole)
         {
-            _mul_cell_to_input_res.allocator()->init(TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
+            _mul_cell_to_input_res.allocator()->init(
+                TensorInfo(cell_state_in->info()->tensor_shape(), 1, DataType::S32));
             _memory_group.manage(&_mul_cell_to_input_res);
-            _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
-            const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
-            quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
-            _cell_to_input_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
+            _pixelwise_mul_cell_to_input.configure(cell_state_in, lstm_params.cell_to_input_weights(),
+                                                   &_mul_cell_to_input_res, 1.f, ConvertPolicy::SATURATE,
+                                                   RoundingPolicy::TO_ZERO);
+            const float cell_to_input_scale =
+                std::pow(2, cell_shift) *
+                lstm_params.cell_to_input_weights()->info()->quantization_info().uniform().scale /
+                lstm_params.input_intermediate_scale();
+            quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                         &gemmlowp_info.gemmlowp_shift);
+            _cell_to_input_outstage_res.allocator()->init(
+                TensorInfo(_mul_cell_to_input_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+                           QuantizationInfo(lstm_params.input_intermediate_scale(), 0)));
             _memory_group.manage(&_cell_to_input_outstage_res);
-            _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res, gemmlowp_info);
+            _cell_to_input_outstage.configure(&_mul_cell_to_input_res, nullptr, &_cell_to_input_outstage_res,
+                                              gemmlowp_info);
             _mul_cell_to_input_res.allocator()->allocate();
-            _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res, &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
+            _accumulate_cell_input.configure(&_recurrent_to_input_outstage_res, &_cell_to_input_outstage_res,
+                                             &_recurrent_to_input_outstage_res, ConvertPolicy::SATURATE);
             _cell_to_input_outstage_res.allocator()->allocate();
         }
 
         Tensor *input_activation_input = &_recurrent_to_input_outstage_res;
 
-        if(_has_layer_norm)
+        if (_has_layer_norm)
         {
             configure_layer_norm(LayerNormGate::Input, input_activation_input);
             input_activation_input->allocator()->allocate();
             input_activation_input = &get_layer_norm_output(LayerNormGate::Input);
         }
 
-        _input_gate_sigmoid.configure(input_activation_input, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+        _input_gate_sigmoid.configure(input_activation_input, &_input_gate,
+                                      ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
         input_activation_input->allocator()->allocate();
     }
     // Cell.
     // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
-    _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_forget_cell.configure(&_forget_gate, cell_state_in, &_forget_gate, 1.f, ConvertPolicy::SATURATE,
+                                         RoundingPolicy::TO_ZERO);
     const float      cell_gate_scale      = _cell_gate.info()->quantization_info().uniform().scale;
     const float      mul_input_cell_scale = cell_gate_scale * std::pow(2, 15 + cell_shift);
-    const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(mul_input_cell_scale, 0));
+    const TensorInfo mul_input_cell_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                         QuantizationInfo(mul_input_cell_scale, 0));
     _memory_group.manage(&_mul_input_cell_res);
     _mul_input_cell_res.allocator()->init(mul_input_cell_info);
-    _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_input_cell.configure(&_input_gate, &_cell_gate, &_mul_input_cell_res, 1.f, ConvertPolicy::SATURATE,
+                                        RoundingPolicy::TO_ZERO);
     _cell_gate.allocator()->allocate();
     _add_forget_cell.configure(&_forget_gate, &_mul_input_cell_res, cell_state_out, ConvertPolicy::SATURATE);
     _mul_input_cell_res.allocator()->allocate();
     _forget_gate.allocator()->allocate();
-    if(_has_cell_clipping)
+    if (_has_cell_clipping)
     {
-        _cell_clip.configure(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip, quantized_cell_clip));
+        _cell_clip.configure(cell_state_out, nullptr,
+                             ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                 -quantized_cell_clip, quantized_cell_clip));
     }
     // Output gate.
-    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
-    const float      input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
-    configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info,
-                 input, &_input_to_output_weights_transposed, &_input_to_output_eff_bias,
-                 &_mm_input_to_output_res, &_input_to_output_outstage_res, input_to_output_scale,
-                 mm_out_info, output_outstage_info);
+    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+    const float      input_to_output_scale = input_to_output_weights->info()->quantization_info().uniform().scale *
+                                        qinput.scale / lstm_params.output_intermediate_scale();
+    configure_mm(_mm_input_to_output, _input_to_output_outstage, gemmlowp_info, input,
+                 &_input_to_output_weights_transposed, &_input_to_output_eff_bias, &_mm_input_to_output_res,
+                 &_input_to_output_outstage_res, input_to_output_scale, mm_out_info, output_outstage_info);
 
-    const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
-    configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info,
-                 output_state_in, &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias,
-                 &_mm_recurrent_to_output_res, &_recurrent_to_output_outstage_res, recurrent_to_output_scale,
-                 mm_out_info, output_outstage_info);
+    const float recurrent_to_output_scale = recurrent_to_output_weights->info()->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+    configure_mm(_mm_recurrent_to_output, _recurrent_to_output_outstage, gemmlowp_info, output_state_in,
+                 &_recurrent_to_output_weights_transposed, &_recurrent_to_output_eff_bias, &_mm_recurrent_to_output_res,
+                 &_recurrent_to_output_outstage_res, recurrent_to_output_scale, mm_out_info, output_outstage_info);
 
-    _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
+    _accumulate_input_recurrent_output.configure(&_recurrent_to_output_outstage_res, &_input_to_output_outstage_res,
+                                                 &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
     _input_to_output_outstage_res.allocator()->allocate();
 
-    if(_has_peephole)
+    if (_has_peephole)
     {
         // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
         // Here we are not using the output stage because all operations are done in float
         _mul_cell_to_output_res.allocator()->init(TensorInfo(cell_state_out->info()->tensor_shape(), 1, DataType::S32));
         _memory_group.manage(&_mul_cell_to_output_res);
-        _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(), &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+        _pixelwise_mul_cell_to_output.configure(cell_state_out, lstm_params.cell_to_output_weights(),
+                                                &_mul_cell_to_output_res, 1.f, ConvertPolicy::SATURATE,
+                                                RoundingPolicy::TO_ZERO);
 
-        const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
-        quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift);
-        _cell_to_output_outstage_res.allocator()->init(TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
+        const float cell_to_output_scale =
+            std::pow(2, cell_shift) *
+            lstm_params.cell_to_output_weights()->info()->quantization_info().uniform().scale /
+            lstm_params.output_intermediate_scale();
+        quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                     &gemmlowp_info.gemmlowp_shift);
+        _cell_to_output_outstage_res.allocator()->init(
+            TensorInfo(_mul_cell_to_output_res.info()->tensor_shape(), 1, DataType::QSYMM16,
+                       QuantizationInfo(lstm_params.output_intermediate_scale(), 0)));
         _memory_group.manage(&_cell_to_output_outstage_res);
-        _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res, gemmlowp_info);
+        _cell_to_output_outstage.configure(&_mul_cell_to_output_res, nullptr, &_cell_to_output_outstage_res,
+                                           gemmlowp_info);
         _mul_cell_to_output_res.allocator()->allocate();
 
-        _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res, &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
+        _accumulate_cell_to_output.configure(&_recurrent_to_output_outstage_res, &_cell_to_output_outstage_res,
+                                             &_recurrent_to_output_outstage_res, ConvertPolicy::SATURATE);
         _cell_to_output_outstage_res.allocator()->allocate();
     }
 
     Tensor *output_activation_input = &_recurrent_to_output_outstage_res;
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         configure_layer_norm(LayerNormGate::Output, output_activation_input);
         output_activation_input->allocator()->allocate();
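The LU_BOUNDED_RELU activation configured for _cell_clip earlier in this hunk clamps the cell state to +/-quantized_cell_clip in the quantized domain; conceptually, per element (a sketch, not the activation kernel):

    #include <algorithm>
    #include <cstdint>

    int16_t clip_cell(int16_t cell_q, int16_t quantized_cell_clip)
    {
        return std::max<int16_t>(static_cast<int16_t>(-quantized_cell_clip),
                                 std::min<int16_t>(quantized_cell_clip, cell_q));
    }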
@@ -576,20 +677,24 @@
 
     _memory_group.manage(&_output_gate);
     _output_gate.allocator()->init(output_gate_info);
-    _output_gate_sigmoid.configure(output_activation_input, &_output_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
+    _output_gate_sigmoid.configure(output_activation_input, &_output_gate,
+                                   ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC));
     output_activation_input->allocator()->allocate();
 
     // Hidden.
-    _hidden_tanh.configure(cell_state_out, &_input_gate, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
+    _hidden_tanh.configure(cell_state_out, &_input_gate,
+                           ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f));
     // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
     _memory_group.manage(&_hidden_mul_res);
     const TensorInfo hidden_mul_res(_input_gate.info()->tensor_shape(), 1, DataType::S32);
     _hidden_mul_res.allocator()->init(hidden_mul_res);
-    _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO);
+    _pixelwise_mul_hidden.configure(&_output_gate, &_input_gate, &_hidden_mul_res, 1.f, ConvertPolicy::SATURATE,
+                                    RoundingPolicy::TO_ZERO);
     _output_gate.allocator()->allocate();
     _input_gate.allocator()->allocate();
     const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
-    quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
+    quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                 &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true);
     gemmlowp_info.gemmlowp_offset  = lstm_params.hidden_state_zero();
     gemmlowp_info.output_data_type = output_state_in->info()->data_type();
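The hidden state is output_gate * tanh(cell); both factors sit on the 2^-15 QSYMM16 grid, so the 32-bit product carries scale 2^-30 and the requantization multiplier computed just above is 2^-30 / hidden_state_scale, merely written as two pow(2, -15) factors. A quick standalone check of that identity (the scale value is an example):

    #include <cassert>
    #include <cmath>

    int main()
    {
        const double hidden_state_scale = 0.007; // example scale
        const double a = std::pow(2, -15) / hidden_state_scale * std::pow(2, -15);
        const double b = std::pow(2, -30) / hidden_state_scale;
        assert(std::fabs(a - b) < 1e-18); // same multiplier, two spellings
        return 0;
    }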
 
@@ -598,7 +703,7 @@
 
     _memory_group.manage(&_hidden_gate);
 
-    if(_projection_tensor_copy_required)
+    if (_projection_tensor_copy_required)
     {
         _hidden_gate.allocator()->init(*output_state_out->info());
         _hidden_gate.info()->set_tensor_shape(_hidden_mul_res.info()->tensor_shape());
@@ -609,27 +714,26 @@
     _hidden_mul_res.allocator()->allocate();
 
     // Projection.
-    if(_has_projection)
+    if (_has_projection)
     {
         const TensorInfo              projection_outstage_info(*output_state_out->info());
-        const UniformQuantizationInfo qprojection      = _projection_weights->info()->quantization_info().uniform();
-        const float                   projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
-        gemmlowp_info.gemmlowp_offset                  = qoutput_state_in.offset;
-        gemmlowp_info.gemmlowp_min_bound               = std::numeric_limits<int8_t>::lowest();
-        gemmlowp_info.gemmlowp_max_bound               = std::numeric_limits<int8_t>::max();
-        gemmlowp_info.output_data_type                 = DataType::QASYMM8_SIGNED;
+        const UniformQuantizationInfo qprojection = _projection_weights->info()->quantization_info().uniform();
+        const float projection_scale  = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+        gemmlowp_info.gemmlowp_offset = qoutput_state_in.offset;
+        gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
+        gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
+        gemmlowp_info.output_data_type   = DataType::QASYMM8_SIGNED;
 
-        TensorInfo projection_mm_out_info{ mm_out_info };
+        TensorInfo projection_mm_out_info{mm_out_info};
         projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
 
-        configure_mm(_mm_projection, _projection_outstage, gemmlowp_info,
-                     hidden_gate_result, &_projection_weights_transposed, &_projection_eff_bias,
-                     &_mm_projection_res, &_projection_outstage_res, projection_scale,
-                     projection_mm_out_info, projection_outstage_info);
+        configure_mm(_mm_projection, _projection_outstage, gemmlowp_info, hidden_gate_result,
+                     &_projection_weights_transposed, &_projection_eff_bias, &_mm_projection_res,
+                     &_projection_outstage_res, projection_scale, projection_mm_out_info, projection_outstage_info);
 
         ITensor *accumulate_destination = output_state_out;
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_gate.allocator()->allocate();
             _projection_accumulate_res.allocator()->init(*output_state_in->info());
@@ -638,30 +742,34 @@
             accumulate_destination = &_projection_accumulate_res;
         }
 
-        _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination, ConvertPolicy::SATURATE);
+        _accumulate_projection.configure(&_projection_outstage_res, accumulate_destination, accumulate_destination,
+                                         ConvertPolicy::SATURATE);
         _projection_outstage_res.allocator()->allocate();
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _projection_accumulate_to_output_copy.configure(_projection_accumulate_res, *output_state_out);
             _projection_accumulate_res.allocator()->allocate();
         }
 
-        int8_t quantized_projection_clip{ 0 };
-        if(lstm_params.projection_clip() > 0.0f)
+        int8_t quantized_projection_clip{0};
+        if (lstm_params.projection_clip() > 0.0f)
         {
-            quantized_projection_clip = utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
+            quantized_projection_clip =
+                utility::clamp<int8_t>(lstm_params.projection_clip() / qprojection.scale, -128, 127);
         }
 
-        if(quantized_projection_clip > 0)
+        if (quantized_projection_clip > 0)
         {
-            _projection_clip.configure(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip, quantized_projection_clip));
+            _projection_clip.configure(output_state_out, nullptr,
+                                       ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                           -quantized_projection_clip, quantized_projection_clip));
             _has_projection_clipping = true;
         }
     }
     else
     {
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_to_output_copy.configure(_hidden_gate, *output_state_out);
             _hidden_gate.allocator()->allocate();
@@ -672,17 +780,27 @@
     _copy_output.configure(output_state_out, output);
 }
 
-Status NEQLSTMLayer::validate(const ITensorInfo *input,
-                              const ITensorInfo *input_to_forget_weights, const ITensorInfo *input_to_cell_weights, const ITensorInfo *input_to_output_weights,
-                              const ITensorInfo *recurrent_to_forget_weights, const ITensorInfo *recurrent_to_cell_weights, const ITensorInfo *recurrent_to_output_weights,
-                              const ITensorInfo *forget_gate_bias, const ITensorInfo *cell_bias, const ITensorInfo *output_gate_bias,
-                              const ITensorInfo *cell_state_in, const ITensorInfo *output_state_in,
-                              const ITensorInfo *cell_state_out, const ITensorInfo *output_state_out, const ITensorInfo *output,
+Status NEQLSTMLayer::validate(const ITensorInfo             *input,
+                              const ITensorInfo             *input_to_forget_weights,
+                              const ITensorInfo             *input_to_cell_weights,
+                              const ITensorInfo             *input_to_output_weights,
+                              const ITensorInfo             *recurrent_to_forget_weights,
+                              const ITensorInfo             *recurrent_to_cell_weights,
+                              const ITensorInfo             *recurrent_to_output_weights,
+                              const ITensorInfo             *forget_gate_bias,
+                              const ITensorInfo             *cell_bias,
+                              const ITensorInfo             *output_gate_bias,
+                              const ITensorInfo             *cell_state_in,
+                              const ITensorInfo             *output_state_in,
+                              const ITensorInfo             *cell_state_out,
+                              const ITensorInfo             *output_state_out,
+                              const ITensorInfo             *output,
                               const LSTMParams<ITensorInfo> &lstm_params)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights,
-                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias, cell_state_in, output_state_in,
-                                        cell_state_out, output_state_out, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
+                                        recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                        recurrent_to_output_weights, forget_gate_bias, cell_bias, output_gate_bias,
+                                        cell_state_in, output_state_in, cell_state_out, output_state_out, output);
 
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->num_dimensions() != 2, "Input must have exactly 2 dimensions");
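validate() mirrors configure() but operates purely on ITensorInfo, so a caller can reject an unsupported QLSTM configuration before allocating any tensors. A hedged sketch of the usual call pattern (the remaining tensor infos are elided; Status accessors as in the library's error API):

    // Check statically, then configure only on success.
    const arm_compute::Status st =
        NEQLSTMLayer::validate(input->info(), input_to_forget_weights->info(),
                               /* ...remaining tensor infos... */ output->info(), lstm_params_info);
    if (st.error_code() != arm_compute::ErrorCode::OK)
    {
        // Report st.error_description() and skip configure().
    }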
@@ -694,22 +812,27 @@
 
     ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->num_dimensions() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON(input_to_output_weights->dimension(0) != input_size);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights, input_to_cell_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_output_weights, input_to_forget_weights,
+                                                   input_to_cell_weights);
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->num_dimensions() != 2);
     ARM_COMPUTE_RETURN_ERROR_ON(recurrent_to_output_weights->dimension(1) != num_units);
-    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights, recurrent_to_cell_weights);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED, DataType::QSYMM8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_output_weights, recurrent_to_forget_weights,
+                                                   recurrent_to_cell_weights);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_to_forget_weights, 1, DataType::QASYMM8_SIGNED,
+                                                         DataType::QSYMM8);
 
     // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED
     if (input_to_forget_weights->data_type() == DataType::QSYMM8)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_cell_weights, input_to_output_weights,
-                                                           recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+                                                           recurrent_to_forget_weights, recurrent_to_cell_weights,
+                                                           recurrent_to_output_weights);
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights, input_to_output_weights,
-                                                           recurrent_to_forget_weights, recurrent_to_cell_weights, recurrent_to_output_weights);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, input_to_cell_weights,
+                                                           input_to_output_weights, recurrent_to_forget_weights,
+                                                           recurrent_to_cell_weights, recurrent_to_output_weights);
     }
     ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->num_dimensions() != 1);
     ARM_COMPUTE_RETURN_ERROR_ON(forget_gate_bias->dimension(0) != num_units);
@@ -728,20 +851,25 @@
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_in);
 
     // Check whether peephole weights are all there or none
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
         ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+                                                             DataType::QSYMM16);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->num_dimensions() != 1);
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_to_forget_weights()->dimension(0) != num_units);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_output_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+                                                           lstm_params.cell_to_output_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+                                                       lstm_params.cell_to_output_weights());
 
-        if(!lstm_params.has_cifg_opt())
+        if (!lstm_params.has_cifg_opt())
         {
             ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights());
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(), lstm_params.cell_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.cell_to_forget_weights(),
+                                                               lstm_params.cell_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(lstm_params.cell_to_forget_weights(),
+                                                           lstm_params.cell_to_input_weights());
         }
     }
 
@@ -755,7 +883,7 @@
 
     // Calculate quantized parameters for clipping.
     int16_t quantized_cell_clip = 0;
-    if(lstm_params.cell_clip() > 0.0f)
+    if (lstm_params.cell_clip() > 0.0f)
     {
         quantized_cell_clip = quantize_qsymm16(lstm_params.cell_clip(), qcell_state_in);
     }
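quantize_qsymm16() above maps the float clip threshold onto the cell state's symmetric 16-bit grid. A minimal standalone sketch of that mapping (the rounding mode is an assumption; the library helper takes a UniformQuantizationInfo rather than a bare scale):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // q = clamp(round(v / scale), -32768, 32767)
    int16_t quantize_qsymm16_sketch(float v, float scale)
    {
        const long q = std::lround(v / scale);
        return static_cast<int16_t>(std::max(-32768L, std::min(32767L, q)));
    }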
@@ -763,60 +891,90 @@
     // Precompute effective bias for optimizing the matmul computations.
     const TensorInfo eff_bias_info(TensorShape(num_units), 1, DataType::S32);
     const TensorInfo projection_eff_bias_info(TensorShape(output_size), 1, DataType::S32);
-    if(!lstm_params.has_cifg_opt())
+    if (!lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.input_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
-                                                                                              -qinput.offset, true)));
-        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.recurrent_to_input_weights(), &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
-                                                                                              -qoutput_state_in.offset,
-                                                                                              true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.input_to_input_weights(), &eff_bias_info,
+            GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.recurrent_to_input_weights(), &eff_bias_info,
+            GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
     }
-    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
-                                                                                          -qoutput_state_in.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset,
-                                                                                          true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
-    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(recurrent_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false,
-                                                                                          -qoutput_state_in.offset, true)));
-    if(lstm_params.has_projection())
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+        input_to_forget_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_forget_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+        input_to_cell_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_cell_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+        input_to_output_weights, &eff_bias_info, GEMMLowpReductionKernelInfo(num_units, false, -qinput.offset, true)));
+    ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+        recurrent_to_output_weights, &eff_bias_info,
+        GEMMLowpReductionKernelInfo(num_units, false, -qoutput_state_in.offset, true)));
+    if (lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(lstm_params.projection_weights(), &projection_eff_bias_info, GEMMLowpReductionKernelInfo(output_size, false,
-                                                                                              lstm_params.hidden_state_zero(),
-                                                                                              true)));
-        if(lstm_params.projection_bias() != nullptr)
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmLowpMatrixAReductionKernel::validate(
+            lstm_params.projection_weights(), &projection_eff_bias_info,
+            GEMMLowpReductionKernelInfo(output_size, false, lstm_params.hidden_state_zero(), true)));
+        if (lstm_params.projection_bias() != nullptr)
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.projection_bias(), 1, DataType::S32);
-            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info, &projection_eff_bias_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                NEArithmeticAddition::validate(lstm_params.projection_bias(), &projection_eff_bias_info,
+                                               &projection_eff_bias_info, ConvertPolicy::SATURATE));
         }
     }
 
-    const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(), input_to_cell_weights->quantization_info());
-    const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1, input_to_output_weights->data_type(), input_to_output_weights->quantization_info());
-    const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
-    const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_cell_weights->data_type(), recurrent_to_cell_weights->quantization_info());
-    const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_output_weights->data_type(), recurrent_to_output_weights->quantization_info());
-    const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(), recurrent_to_forget_weights->quantization_info());
+    const TensorInfo input_weights_transposed(TensorShape(num_units, input_size), 1, input_to_cell_weights->data_type(),
+                                              input_to_cell_weights->quantization_info());
+    const TensorInfo input_to_output_weights_transposed(TensorShape(num_units, input_size), 1,
+                                                        input_to_output_weights->data_type(),
+                                                        input_to_output_weights->quantization_info());
+    const TensorInfo recurrent_to_forget_weights_transposed(TensorShape(num_units, output_size), 1,
+                                                            recurrent_to_forget_weights->data_type(),
+                                                            recurrent_to_forget_weights->quantization_info());
+    const TensorInfo recurrent_to_cell_weights_transposed(TensorShape(num_units, output_size), 1,
+                                                          recurrent_to_cell_weights->data_type(),
+                                                          recurrent_to_cell_weights->quantization_info());
+    const TensorInfo recurrent_to_output_weights_transposed(TensorShape(num_units, output_size), 1,
+                                                            recurrent_to_output_weights->data_type(),
+                                                            recurrent_to_output_weights->quantization_info());
+    const TensorInfo recurrent_weights_transposed(TensorShape(num_units, output_size), 1,
+                                                  recurrent_to_forget_weights->data_type(),
+                                                  recurrent_to_forget_weights->quantization_info());
 
     ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_cell_weights, &input_weights_transposed));
     ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(input_to_output_weights, &input_to_output_weights_transposed));
-    ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed));
-    ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed));
-    ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed));
-    if(!lstm_params.has_cifg_opt())
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NETranspose::validate(recurrent_to_forget_weights, &recurrent_to_forget_weights_transposed));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NETranspose::validate(recurrent_to_cell_weights, &recurrent_to_cell_weights_transposed));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NETranspose::validate(recurrent_to_output_weights, &recurrent_to_output_weights_transposed));
+    if (!lstm_params.has_cifg_opt())
     {
-        const TensorInfo recurrent_to_input_weights_transposed(TensorShape(num_units, output_size), 1,
-                                                               recurrent_to_forget_weights->data_type(), lstm_params.recurrent_to_input_weights()->quantization_info());
+        const TensorInfo recurrent_to_input_weights_transposed(
+            TensorShape(num_units, output_size), 1, recurrent_to_forget_weights->data_type(),
+            lstm_params.recurrent_to_input_weights()->quantization_info());
         const TensorInfo input_to_input_weights_transposed(TensorShape(num_units, input_size), 1,
-                                                           lstm_params.input_to_input_weights()->data_type(), lstm_params.input_to_input_weights()->quantization_info());
-        ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed));
-        ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed));
+                                                           lstm_params.input_to_input_weights()->data_type(),
+                                                           lstm_params.input_to_input_weights()->quantization_info());
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NETranspose::validate(lstm_params.input_to_input_weights(), &input_to_input_weights_transposed));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NETranspose::validate(lstm_params.recurrent_to_input_weights(), &recurrent_to_input_weights_transposed));
     }
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
-        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
-        ARM_COMPUTE_RETURN_ON_ERROR(NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
+        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+                                                       lstm_params.projection_weights()->data_type(),
+                                                       lstm_params.projection_weights()->quantization_info());
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NETranspose::validate(lstm_params.projection_weights(), &projection_weights_transposed));
     }
 
     GEMMLowpOutputStageInfo gemmlowp_info;
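Note on the transposed descriptors above: validate() runs before any tensor exists, so it must re-derive the TensorInfo that NETranspose would produce for each weight matrix. The repeated four-argument construction amounts to the helper below; a minimal sketch assuming 2-D weights, with a name (make_transposed_info) that is not part of the library:

#include "arm_compute/core/TensorInfo.h"

using namespace arm_compute;

TensorInfo make_transposed_info(const ITensorInfo &w)
{
    // NETranspose swaps the two innermost dimensions of a 2-D matrix,
    // e.g. (input_size, num_units) -> (num_units, input_size).
    const TensorShape transposed(w.tensor_shape()[1], w.tensor_shape()[0]);
    return TensorInfo(transposed, 1, w.data_type(), w.quantization_info());
}
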
@@ -829,28 +987,42 @@
 
     // Forget gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.forget_intermediate_scale() == 0);
-    const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
+    const TensorInfo forget_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.forget_intermediate_scale(), 0));
     const TensorInfo mm_out_info(TensorShape(num_units, batch_size), 1, DataType::S32);
-    const float      input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.forget_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_forget_scale, &mm_out_info, &forget_outstage_info));
+    const float input_to_forget_scale = input_to_forget_weights->quantization_info().uniform().scale * qinput.scale /
+                                        lstm_params.forget_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_forget_scale, &mm_out_info, &forget_outstage_info));
 
-    const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_forget_scale, &mm_out_info, &forget_outstage_info));
+    const float recurrent_to_forget_scale = recurrent_to_forget_weights->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.forget_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_forget_scale, &mm_out_info,
+                                            &forget_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+                                                               &forget_outstage_info, ConvertPolicy::SATURATE));
 
-    if(lstm_params.has_peephole_opt())
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1, DataType::QSYMM16);
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        const float cell_to_forget_scale = std::pow(2, cell_shift) * lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale / lstm_params.forget_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info, &forget_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_forget_weights(), 1,
+                                                             DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &mm_out_info, 1.f,
+                                                ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+        const float cell_to_forget_scale = std::pow(2, cell_shift) *
+                                           lstm_params.cell_to_forget_weights()->quantization_info().uniform().scale /
+                                           lstm_params.forget_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+            cell_to_forget_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEGEMMLowpOutputStage::validate(&mm_out_info, nullptr, &forget_outstage_info, gemmlowp_info));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_outstage_info, &forget_outstage_info,
+                                                                   &forget_outstage_info, ConvertPolicy::SATURATE));
     }
 
-    if(has_layer_norm)
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.forget_layer_norm_weights();
         const ITensorInfo *b_info = forget_gate_bias;
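Every gate in this validate path follows the same requantization recipe: the GEMM output carries an effective scale of weight_scale * activation_scale / intermediate_scale, and calculate_quantized_multiplier() splits that real number into an int32 multiplier plus a power-of-two shift. A minimal sketch of the usual gemmlowp-style decomposition, assuming the library follows the same scheme:

#include <cmath>
#include <cstdint>

// real_multiplier == quantized_multiplier * 2^shift, with the multiplier
// normalised into [2^30, 2^31) to preserve precision in fixed-point math.
void quantize_multiplier(double real_multiplier, int32_t *quantized_multiplier, int *shift)
{
    if (real_multiplier == 0.)
    {
        *quantized_multiplier = 0;
        *shift                = 0;
        return;
    }
    const double q       = std::frexp(real_multiplier, shift); // q in [0.5, 1)
    int64_t      q_fixed = static_cast<int64_t>(std::round(q * (1LL << 31)));
    if (q_fixed == (1LL << 31)) // rounding can push q up to exactly 1.0
    {
        q_fixed /= 2;
        ++(*shift);
    }
    *quantized_multiplier = static_cast<int32_t>(q_fixed);
}
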
@@ -859,22 +1031,31 @@
 
     // Output quantization info of Sigmoid and Tanh activations
     const QuantizationInfo sigmoid_tanh_outqinfo(1.f / 32768.f, 0);
-    const TensorInfo       forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
+    const TensorInfo forget_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEActivationLayer::validate(&forget_outstage_info, &forget_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Modulation gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.cell_intermediate_scale() == 0);
-    const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
-    const float      input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.cell_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_cell_scale, &mm_out_info, &cell_outstage_info));
+    const TensorInfo cell_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                        QuantizationInfo(lstm_params.cell_intermediate_scale(), 0));
+    const float      input_to_cell_scale = input_to_cell_weights->quantization_info().uniform().scale * qinput.scale /
+                                      lstm_params.cell_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_cell_scale, &mm_out_info, &cell_outstage_info));
 
-    const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_cell_scale, &mm_out_info, &cell_outstage_info));
+    const float recurrent_to_cell_scale = recurrent_to_cell_weights->quantization_info().uniform().scale *
+                                          qoutput_state_in.scale / lstm_params.cell_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_cell_scale, &mm_out_info,
+                                            &cell_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info, &cell_outstage_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&cell_outstage_info, &cell_outstage_info,
+                                                               &cell_outstage_info, ConvertPolicy::SATURATE));
 
-    if(has_layer_norm)
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.cell_layer_norm_weights();
         const ITensorInfo *b_info = cell_bias;
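The sigmoid_tanh_outqinfo scale of 1/32768 above is deliberate: QSYMM16 at 2^-15 maps the int16 range onto roughly [-1.0, 1.0), exactly the codomain of sigmoid and tanh, so no representable values are wasted. A worked illustration (hypothetical helper, not the library's quantize_qsymm16):

#include <algorithm>
#include <cmath>
#include <cstdint>

int16_t to_qsymm16(float v)
{
    const float q = std::round(v * 32768.f); // scale 2^-15: 0.5f -> 16384
    return static_cast<int16_t>(std::min(32767.f, std::max(-32768.f, q))); // 1.0f saturates to 32767
}
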
@@ -882,94 +1063,134 @@
     }
     const TensorInfo cell_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEActivationLayer::validate(&cell_outstage_info, &cell_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
 
     // Input gate.
     const TensorInfo input_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    if(lstm_params.has_cifg_opt())
+    if (lstm_params.has_cifg_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr, "Input gate bias must not be present when CIFG is used");
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info, &forget_gate_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(lstm_params.input_gate_bias() != nullptr,
+                                        "Input gate bias must not be present when CIFG is used");
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticSubtraction::validate(&input_gate_info, &forget_gate_info,
+                                                                      &forget_gate_info, ConvertPolicy::SATURATE));
     }
     else
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
+        ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.input_to_input_weights(),
+                                            lstm_params.recurrent_to_input_weights(), lstm_params.input_gate_bias());
 
         // If the input_to_forget_weights data type is DataType::QSYMM8 then it can never match the other weights as they are all DataType::QASYMM8_SIGNED
         if (input_to_forget_weights->data_type() == DataType::QSYMM8)
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lstm_params.input_to_input_weights(),
+                                                               lstm_params.recurrent_to_input_weights());
         }
         else
         {
-            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights, lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights());
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input_to_forget_weights,
+                                                               lstm_params.input_to_input_weights(),
+                                                               lstm_params.recurrent_to_input_weights());
         }
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input_to_forget_weights, lstm_params.input_to_input_weights());
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights, lstm_params.recurrent_to_input_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(recurrent_to_forget_weights,
+                                                       lstm_params.recurrent_to_input_weights());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(forget_gate_bias, lstm_params.input_gate_bias());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(forget_gate_bias, lstm_params.input_gate_bias());
 
         ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_intermediate_scale() == 0);
-        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
-        const float      input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale * qinput.scale / lstm_params.input_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_input_scale, &mm_out_info, &input_outstage_info));
+        const TensorInfo input_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                             QuantizationInfo(lstm_params.input_intermediate_scale(), 0));
+        const float input_to_input_scale = lstm_params.input_to_input_weights()->quantization_info().uniform().scale *
+                                           qinput.scale / lstm_params.input_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                                input_to_input_scale, &mm_out_info, &input_outstage_info));
 
-        const float recurrent_to_input_scale = lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.input_intermediate_scale();
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_input_scale, &mm_out_info, &input_outstage_info));
+        const float recurrent_to_input_scale =
+            lstm_params.recurrent_to_input_weights()->quantization_info().uniform().scale * qoutput_state_in.scale /
+            lstm_params.input_intermediate_scale();
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                                &eff_bias_info, recurrent_to_input_scale, &mm_out_info,
+                                                &input_outstage_info));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+                                                                   &input_outstage_info, ConvertPolicy::SATURATE));
 
-        if(lstm_params.has_peephole_opt())
+        if (lstm_params.has_peephole_opt())
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info, 1.f, ConvertPolicy::SATURATE,
-                                                                            RoundingPolicy::TO_ZERO));
-            const float cell_to_input_scale = std::pow(2, cell_shift) * lstm_params.cell_to_input_weights()->quantization_info().uniform().scale / lstm_params.input_intermediate_scale();
-            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
-            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info, &input_outstage_info, ConvertPolicy::SATURATE));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                NEPixelWiseMultiplication::validate(cell_state_in, lstm_params.cell_to_input_weights(), &mm_out_info,
+                                                    1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+            const float cell_to_input_scale = std::pow(2, cell_shift) *
+                                              lstm_params.cell_to_input_weights()->quantization_info().uniform().scale /
+                                              lstm_params.input_intermediate_scale();
+            ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+                cell_to_input_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                NEGEMMLowpOutputStage::validate(&mm_out_info, &eff_bias_info, &input_outstage_info, gemmlowp_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&input_outstage_info, &input_outstage_info,
+                                                                       &input_outstage_info, ConvertPolicy::SATURATE));
         }
 
-        if(has_layer_norm)
+        if (has_layer_norm)
         {
             const ITensorInfo *w_info = lstm_params.input_layer_norm_weights();
             const ITensorInfo *b_info = lstm_params.input_gate_bias();
             ARM_COMPUTE_RETURN_ON_ERROR(validate_layer_norm(input_outstage_info, *w_info, *b_info));
         }
 
-        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&input_outstage_info, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEActivationLayer::validate(&input_outstage_info, &input_gate_info,
+                                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
     }
     // Cell.
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
-    if(quantized_cell_clip > 0)
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+        &forget_gate_info, cell_state_in, &forget_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+        &input_gate_info, cell_state_in, &cell_gate_info, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEArithmeticAddition::validate(&forget_gate_info, &cell_gate_info, cell_state_out, ConvertPolicy::SATURATE));
+    if (quantized_cell_clip > 0)
     {
-        ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_cell_clip,
-                                                                                                             quantized_cell_clip)));
+        ARM_COMPUTE_RETURN_ON_ERROR(
+            NEActivationLayer::validate(cell_state_out, nullptr,
+                                        ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                                            -quantized_cell_clip, quantized_cell_clip)));
     }
     // Output gate.
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.output_intermediate_scale() == 0);
-    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
-    const float      input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale / lstm_params.output_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info, input_to_output_scale, &mm_out_info, &output_outstage_info));
+    const TensorInfo output_outstage_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16,
+                                          QuantizationInfo(lstm_params.output_intermediate_scale(), 0));
+    const float input_to_output_scale = input_to_output_weights->quantization_info().uniform().scale * qinput.scale /
+                                        lstm_params.output_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, input, &input_weights_transposed, &eff_bias_info,
+                                            input_to_output_scale, &mm_out_info, &output_outstage_info));
 
-    const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale * qoutput_state_in.scale / lstm_params.output_intermediate_scale();
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed, &eff_bias_info, recurrent_to_output_scale, &mm_out_info, &output_outstage_info));
+    const float recurrent_to_output_scale = recurrent_to_output_weights->quantization_info().uniform().scale *
+                                            qoutput_state_in.scale / lstm_params.output_intermediate_scale();
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, output_state_in, &recurrent_weights_transposed,
+                                            &eff_bias_info, recurrent_to_output_scale, &mm_out_info,
+                                            &output_outstage_info));
 
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
-    if(lstm_params.has_peephole_opt())
+    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+                                                               &output_outstage_info, ConvertPolicy::SATURATE));
+    if (lstm_params.has_peephole_opt())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1, DataType::QSYMM16);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lstm_params.cell_to_output_weights(), 1,
+                                                             DataType::QSYMM16);
         // TODO(COMPMID-3395): Perform multiplication in the quantized domain in NEPixelWiseMultiplication
         // Here we are not using the output stage because all operations are done in float
         // const float cell_to_output_scale = std::pow(2, cell_shift) * lstm_params.cell_to_output_weights()->quantization_info().uniform().scale / lstm_params.output_intermediate_scale();
         // ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(cell_to_output_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
-                                                                        RoundingPolicy::TO_ZERO));
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info, &output_outstage_info, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+            cell_state_out, lstm_params.cell_to_output_weights(), &output_outstage_info, 1.f, ConvertPolicy::SATURATE,
+            RoundingPolicy::TO_ZERO));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&output_outstage_info, &output_outstage_info,
+                                                                   &output_outstage_info, ConvertPolicy::SATURATE));
     }
 
-    if(has_layer_norm)
+    if (has_layer_norm)
     {
         const ITensorInfo *w_info = lstm_params.output_layer_norm_weights();
         const ITensorInfo *b_info = output_gate_bias;
@@ -977,85 +1198,103 @@
     }
 
     const TensorInfo output_gate_info(TensorShape(num_units, batch_size), 1, DataType::QSYMM16, sigmoid_tanh_outqinfo);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&output_outstage_info, &output_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEActivationLayer::validate(&output_outstage_info, &output_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LOGISTIC)));
 
     // Hidden.
-    ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(cell_state_out, &input_gate_info, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEActivationLayer::validate(cell_state_out, &input_gate_info,
+                                    ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::TANH, 1.f, 1.f)));
     const TensorInfo hidden_mul_res(TensorShape(num_units, batch_size), 1, DataType::S32);
     const TensorInfo hidden_out_info(TensorShape(num_units, batch_size), 1, DataType::QASYMM8_SIGNED);
-    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(&output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
+    ARM_COMPUTE_RETURN_ON_ERROR(NEPixelWiseMultiplication::validate(
+        &output_gate_info, &input_gate_info, &hidden_mul_res, 1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_ZERO));
 
     ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.hidden_state_scale() == 0);
     const float hidden_state_scale = std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15);
-    ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        quantization::calculate_quantized_multiplier(hidden_state_scale, &gemmlowp_info.gemmlowp_multiplier,
+                                                     &gemmlowp_info.gemmlowp_shift, /* ignore_epsilon */ true));
     gemmlowp_info.gemmlowp_offset  = lstm_params.hidden_state_zero();
     gemmlowp_info.output_data_type = hidden_out_info.data_type();
-    ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEGEMMLowpOutputStage::validate(&hidden_mul_res, nullptr, &hidden_out_info, gemmlowp_info));
 
     const bool projection_tensor_copy_required = num_units != output_size;
 
     // Projection.
-    if(lstm_params.has_projection())
+    if (lstm_params.has_projection())
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights, lstm_params.projection_weights());
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(recurrent_to_forget_weights,
+                                                           lstm_params.projection_weights());
         ARM_COMPUTE_RETURN_ERROR_ON(qoutput_state_in.scale == 0);
 
-        const UniformQuantizationInfo qprojection      = lstm_params.projection_weights()->quantization_info().uniform();
-        const float                   projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
-        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
+        const UniformQuantizationInfo qprojection = lstm_params.projection_weights()->quantization_info().uniform();
+        const float projection_scale = qprojection.scale * lstm_params.hidden_state_scale() / qoutput_state_in.scale;
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(
+            projection_scale, &gemmlowp_info.gemmlowp_multiplier, &gemmlowp_info.gemmlowp_shift));
         gemmlowp_info.gemmlowp_offset    = qoutput_state_in.offset;
         gemmlowp_info.gemmlowp_min_bound = std::numeric_limits<int8_t>::lowest();
         gemmlowp_info.gemmlowp_max_bound = std::numeric_limits<int8_t>::max();
         gemmlowp_info.output_data_type   = DataType::QASYMM8_SIGNED;
 
         const TensorInfo projection_outstage_info(*output_state_out);
-        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1, lstm_params.projection_weights()->data_type(), lstm_params.projection_weights()->quantization_info());
+        const TensorInfo projection_weights_transposed(TensorShape(output_size, num_units), 1,
+                                                       lstm_params.projection_weights()->data_type(),
+                                                       lstm_params.projection_weights()->quantization_info());
 
-        TensorInfo projection_mm_out_info{ mm_out_info };
+        TensorInfo projection_mm_out_info{mm_out_info};
         projection_mm_out_info.set_tensor_shape(TensorShape(output_size, batch_size));
 
-        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed, &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
+        ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemmlowp_info, &hidden_out_info, &projection_weights_transposed,
+                                                &projection_eff_bias_info, projection_scale, &projection_mm_out_info,
                                                 &projection_outstage_info));
 
-        if(projection_tensor_copy_required)
+        if (projection_tensor_copy_required)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                NEQLSTMLayer::TensorCopyKernel::validate(*output_state_in, projection_outstage_info));
         }
 
-        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out, ConvertPolicy::SATURATE));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(output_state_out, output_state_out, output_state_out,
+                                                                   ConvertPolicy::SATURATE));
 
-        if(projection_tensor_copy_required)
+        if (projection_tensor_copy_required)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
+            ARM_COMPUTE_RETURN_ON_ERROR(
+                NEQLSTMLayer::TensorCopyKernel::validate(projection_outstage_info, *output_state_out));
         }
 
-        int8_t quantized_projection_clip{ 0 };
-        if(lstm_params.projection_clip() > 0.0f)
+        int8_t quantized_projection_clip{0};
+        if (lstm_params.projection_clip() > 0.0f)
         {
             quantized_projection_clip = quantize_qasymm8_signed(lstm_params.projection_clip(), qprojection);
         }
 
-        if(quantized_projection_clip > 0)
+        if (quantized_projection_clip > 0)
         {
-            ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(output_state_out, nullptr, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, -quantized_projection_clip,
-                                                                                                                   quantized_projection_clip)));
+            ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(
+                output_state_out, nullptr,
+                ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU,
+                                    -quantized_projection_clip, quantized_projection_clip)));
         }
     }
     else
     {
-        if(projection_tensor_copy_required)
+        if (projection_tensor_copy_required)
         {
             ARM_COMPUTE_RETURN_ON_ERROR(NEQLSTMLayer::TensorCopyKernel::validate(hidden_out_info, *output_state_out));
         }
     }
 
-    if(cell_state_out->total_size() > 0)
+    if (cell_state_out->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(cell_state_in, cell_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(cell_state_in, cell_state_out);
     }
 
-    if(output_state_out->total_size() > 0)
+    if (output_state_out->total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output_state_out);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output_state_in, output_state_out);
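One line in the hunk above merits a gloss: hidden_state_scale is computed as std::pow(2, -15) / lstm_params.hidden_state_scale() * std::pow(2, -15) because the hidden gate is the product of two QSYMM16 tensors (the output gate and tanh of the cell state), each carrying a 2^-15 scale; the S32 product therefore sits at a combined 2^-30 and must be brought down to the requested hidden-state scale. Restated as a self-contained expression:

#include <cmath>

// Rescale factor for the S32 product of two QSYMM16 inputs, requantized to
// the target hidden-state scale; equivalent to the expression in validate().
float hidden_rescale(float hidden_state_scale)
{
    const float gate_scale = std::pow(2.f, -15.f);
    return gate_scale * gate_scale / hidden_state_scale; // == 2^-30 / hidden_state_scale
}
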
@@ -1080,14 +1319,14 @@
     _recurrent_to_forget_outstage.run();
     _accumulate_input_recurrent_forget.run();
 
-    if(_has_peephole)
+    if (_has_peephole)
     {
         _pixelwise_mul_cell_to_forget.run();
         _cell_to_forget_outstage.run();
         _accumulate_cell_forget.run();
     }
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Forget).get(), Window::DimY);
     }
@@ -1102,7 +1341,7 @@
     _recurrent_to_cell_outstage.run();
     _accumulate_input_recurrent_modulation.run();
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Cell).get(), Window::DimY);
     }
@@ -1110,7 +1349,7 @@
     _cell_gate_tanh.run();
 
     // Input gate
-    if(_has_cifg)
+    if (_has_cifg)
     {
         _input_gate_sub.run();
     }
@@ -1122,14 +1361,14 @@
         _recurrent_to_input_outstage.run();
         _accumulate_input_recurrent_input.run();
 
-        if(_has_peephole)
+        if (_has_peephole)
         {
             _pixelwise_mul_cell_to_input.run();
             _cell_to_input_outstage.run();
             _accumulate_cell_input.run();
         }
 
-        if(_has_layer_norm)
+        if (_has_layer_norm)
         {
             NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Input).get(), Window::DimY);
         }
@@ -1142,7 +1381,7 @@
     _pixelwise_mul_input_cell.run();
     _add_forget_cell.run();
 
-    if(_has_cell_clipping)
+    if (_has_cell_clipping)
     {
         _cell_clip.run();
     }
@@ -1153,14 +1392,14 @@
     _mm_recurrent_to_output.run();
     _recurrent_to_output_outstage.run();
     _accumulate_input_recurrent_output.run();
-    if(_has_peephole)
+    if (_has_peephole)
     {
         _pixelwise_mul_cell_to_output.run();
         _cell_to_output_outstage.run();
         _accumulate_cell_to_output.run();
     }
 
-    if(_has_layer_norm)
+    if (_has_layer_norm)
     {
         NEScheduler::get().schedule(get_layer_norm(LayerNormGate::Output).get(), Window::DimY);
     }
@@ -1173,31 +1412,31 @@
     _hidden_outstage.run();
 
     // Projection.
-    if(_has_projection)
+    if (_has_projection)
     {
         _mm_projection.run();
         _projection_outstage.run();
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _projection_output_to_accumulate_copy.run();
         }
 
         _accumulate_projection.run();
 
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _projection_accumulate_to_output_copy.run();
         }
 
-        if(_has_projection_clipping)
+        if (_has_projection_clipping)
         {
             _projection_clip.run();
         }
     }
     else
     {
-        if(_projection_tensor_copy_required)
+        if (_projection_tensor_copy_required)
         {
             _hidden_to_output_copy.run();
         }
@@ -1209,9 +1448,9 @@
 
 void NEQLSTMLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
-        if(_convert_input_to_forget_weights_to_qsymm8)
+        if (_convert_input_to_forget_weights_to_qsymm8)
         {
             _input_to_forget_weights_f32.allocator()->allocate();
             _input_to_forget_weights_symm8.allocator()->allocate();
@@ -1234,28 +1473,25 @@
         _transpose_recurrent_to_output_weights.run();
 
         // Precompute effective biases
-        if(_has_cifg)
+        if (_has_cifg)
         {
-            std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 32767);
+            std::fill_n(reinterpret_cast<int16_t *>(_ones.buffer()),
+                        _ones.info()->total_size() / _ones.info()->element_size(), 32767);
         }
         else
         {
             _input_to_input_eff_bias.allocator()->allocate();
             _recurrent_to_input_eff_bias.allocator()->allocate();
 
-            ITensorPack packII =
-            {
-                { TensorType::ACL_SRC, _input_to_input_weights },
-                { TensorType::ACL_DST, &_input_to_input_eff_bias }
-            };
-            NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY, _input_to_input_reduction->window(), packII);
+            ITensorPack packII = {{TensorType::ACL_SRC, _input_to_input_weights},
+                                  {TensorType::ACL_DST, &_input_to_input_eff_bias}};
+            NEScheduler::get().schedule_op(_input_to_input_reduction.get(), Window::DimY,
+                                           _input_to_input_reduction->window(), packII);
 
-            ITensorPack packRI =
-            {
-                { TensorType::ACL_SRC, _recurrent_to_input_weights },
-                { TensorType::ACL_DST, &_recurrent_to_input_eff_bias }
-            };
-            NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY, _recurrent_to_input_reduction->window(), packRI);
+            ITensorPack packRI = {{TensorType::ACL_SRC, _recurrent_to_input_weights},
+                                  {TensorType::ACL_DST, &_recurrent_to_input_eff_bias}};
+            NEScheduler::get().schedule_op(_recurrent_to_input_reduction.get(), Window::DimY,
+                                           _recurrent_to_input_reduction->window(), packRI);
 
             _input_to_input_weights_transposed.allocator()->allocate();
             _recurrent_to_input_weights_transposed.allocator()->allocate();
@@ -1271,58 +1507,44 @@
         _input_to_output_eff_bias.allocator()->allocate();
         _recurrent_to_output_eff_bias.allocator()->allocate();
 
-        ITensorPack packIF =
-        {
-            { TensorType::ACL_SRC, _input_to_forget_weights },
-            { TensorType::ACL_DST, &_input_to_forget_eff_bias }
-        };
-        NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY, _input_to_forget_reduction->window(), packIF);
+        ITensorPack packIF = {{TensorType::ACL_SRC, _input_to_forget_weights},
+                              {TensorType::ACL_DST, &_input_to_forget_eff_bias}};
+        NEScheduler::get().schedule_op(_input_to_forget_reduction.get(), Window::DimY,
+                                       _input_to_forget_reduction->window(), packIF);
 
-        ITensorPack packRF =
-        {
-            { TensorType::ACL_SRC, _recurrent_to_forget_weights },
-            { TensorType::ACL_DST, &_recurrent_to_forget_eff_bias }
-        };
-        NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY, _recurrent_to_forget_reduction->window(), packRF);
+        ITensorPack packRF = {{TensorType::ACL_SRC, _recurrent_to_forget_weights},
+                              {TensorType::ACL_DST, &_recurrent_to_forget_eff_bias}};
+        NEScheduler::get().schedule_op(_recurrent_to_forget_reduction.get(), Window::DimY,
+                                       _recurrent_to_forget_reduction->window(), packRF);
 
-        ITensorPack packIC =
-        {
-            { TensorType::ACL_SRC, _input_to_cell_weights },
-            { TensorType::ACL_DST, &_input_to_cell_eff_bias }
-        };
-        NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(), packIC);
+        ITensorPack packIC = {{TensorType::ACL_SRC, _input_to_cell_weights},
+                              {TensorType::ACL_DST, &_input_to_cell_eff_bias}};
+        NEScheduler::get().schedule_op(_input_to_cell_reduction.get(), Window::DimY, _input_to_cell_reduction->window(),
+                                       packIC);
 
-        ITensorPack packRC =
-        {
-            { TensorType::ACL_SRC, _recurrent_to_cell_weights },
-            { TensorType::ACL_DST, &_recurrent_to_cell_eff_bias }
-        };
-        NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY, _recurrent_to_cell_reduction->window(), packRC);
+        ITensorPack packRC = {{TensorType::ACL_SRC, _recurrent_to_cell_weights},
+                              {TensorType::ACL_DST, &_recurrent_to_cell_eff_bias}};
+        NEScheduler::get().schedule_op(_recurrent_to_cell_reduction.get(), Window::DimY,
+                                       _recurrent_to_cell_reduction->window(), packRC);
 
-        ITensorPack packIO =
-        {
-            { TensorType::ACL_SRC, _input_to_output_weights },
-            { TensorType::ACL_DST, &_input_to_output_eff_bias }
-        };
-        NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY, _input_to_output_reduction->window(), packIO);
+        ITensorPack packIO = {{TensorType::ACL_SRC, _input_to_output_weights},
+                              {TensorType::ACL_DST, &_input_to_output_eff_bias}};
+        NEScheduler::get().schedule_op(_input_to_output_reduction.get(), Window::DimY,
+                                       _input_to_output_reduction->window(), packIO);
 
-        ITensorPack packRO =
-        {
-            { TensorType::ACL_SRC, _recurrent_to_output_weights },
-            { TensorType::ACL_DST, &_recurrent_to_output_eff_bias }
-        };
-        NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY, _recurrent_to_output_reduction->window(), packRO);
+        ITensorPack packRO = {{TensorType::ACL_SRC, _recurrent_to_output_weights},
+                              {TensorType::ACL_DST, &_recurrent_to_output_eff_bias}};
+        NEScheduler::get().schedule_op(_recurrent_to_output_reduction.get(), Window::DimY,
+                                       _recurrent_to_output_reduction->window(), packRO);
 
-        if(_has_projection)
+        if (_has_projection)
         {
             _projection_eff_bias.allocator()->allocate();
-            ITensorPack pack =
-            {
-                { TensorType::ACL_SRC, _projection_weights },
-                { TensorType::ACL_DST, &_projection_eff_bias }
-            };
-            NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(), pack);
-            if(_projection_bias != nullptr)
+            ITensorPack pack = {{TensorType::ACL_SRC, _projection_weights},
+                                {TensorType::ACL_DST, &_projection_eff_bias}};
+            NEScheduler::get().schedule_op(_projection_reduction.get(), Window::DimY, _projection_reduction->window(),
+                                           pack);
+            if (_projection_bias != nullptr)
             {
                 _projection_bias_add.run();
                 _projection_bias->mark_as_unused();
@@ -1332,7 +1554,7 @@
             _transpose_projection_weights.run();
             _projection_weights->mark_as_unused();
 
-            if(!_projection_tensor_copy_required)
+            if (!_projection_tensor_copy_required)
             {
                 _hidden_gate.mark_as_unused();
                 _projection_accumulate_res.mark_as_unused();
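The row reductions scheduled throughout prepare() come from standard quantized-GEMM arithmetic: for y = (x - zp_x) * W with symmetric weights, the correction term -zp_x * sum_k W[k][n] is input-independent, so it can be folded into an "effective bias" once at prepare time and reused on every run. A self-contained sketch of that folding, assuming row-major K x N signed 8-bit weights (all names illustrative):

#include <cstdint>
#include <vector>

std::vector<int32_t> effective_bias(const std::vector<int32_t> &bias, // N entries
                                    const std::vector<int8_t>  &W,    // K x N, row-major
                                    int K, int N, int32_t zp_x)
{
    std::vector<int32_t> eff(bias);
    for (int n = 0; n < N; ++n)
    {
        int32_t col_sum = 0;
        for (int k = 0; k < K; ++k)
        {
            col_sum += W[k * N + n];
        }
        eff[n] -= zp_x * col_sum; // input-independent zero-point correction
    }
    return eff;
}
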
diff --git a/src/runtime/NEON/functions/NEQuantizationLayer.cpp b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
index dad246a..9b72783 100644
--- a/src/runtime/NEON/functions/NEQuantizationLayer.cpp
+++ b/src/runtime/NEON/functions/NEQuantizationLayer.cpp
@@ -26,19 +26,19 @@
 
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/cpu/operators/CpuQuantize.h"
 
 namespace arm_compute
 {
 struct NEQuantizationLayer::Impl
 {
-    const ITensor                    *src{ nullptr };
-    ITensor                          *dst{ nullptr };
-    std::unique_ptr<cpu::CpuQuantize> op{ nullptr };
+    const ITensor                    *src{nullptr};
+    ITensor                          *dst{nullptr};
+    std::unique_ptr<cpu::CpuQuantize> op{nullptr};
 };
 
-NEQuantizationLayer::NEQuantizationLayer()
-    : _impl(std::make_unique<Impl>())
+NEQuantizationLayer::NEQuantizationLayer() : _impl(std::make_unique<Impl>())
 {
 }
 NEQuantizationLayer::~NEQuantizationLayer() = default;
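The Impl struct above is the standard pimpl shim over a cpu:: operator. Its run() lies outside this hunk, but from the members shown it presumably just packs the cached pointers and forwards them; a hedged sketch of that pattern:

void NEQuantizationLayer::run()
{
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC, _impl->src);
    pack.add_tensor(TensorType::ACL_DST, _impl->dst);
    _impl->op->run(pack);
}
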
diff --git a/src/runtime/NEON/functions/NERNNLayer.cpp b/src/runtime/NEON/functions/NERNNLayer.cpp
index a66ef3d..2824693 100644
--- a/src/runtime/NEON/functions/NERNNLayer.cpp
+++ b/src/runtime/NEON/functions/NERNNLayer.cpp
@@ -27,9 +27,10 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 
 namespace arm_compute
@@ -37,13 +38,26 @@
 NERNNLayer::~NERNNLayer() = default;
 
 NERNNLayer::NERNNLayer(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _gemm_state_f(), _add_f(), _activation(), _fully_connected(memory_manager), _copy_f(), _fully_connected_out(), _gemm_output(), _add_output(),
+    : _memory_group(std::move(memory_manager)),
+      _gemm_state_f(),
+      _add_f(),
+      _activation(),
+      _fully_connected(memory_manager),
+      _copy_f(),
+      _fully_connected_out(),
+      _gemm_output(),
+      _add_output(),
       _is_prepared(false)
 {
 }
 
-Status NERNNLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *recurrent_weights, const ITensorInfo *bias, const ITensorInfo *hidden_state,
-                            const ITensorInfo *output, const ActivationLayerInfo &info)
+Status NERNNLayer::validate(const ITensorInfo         *input,
+                            const ITensorInfo         *weights,
+                            const ITensorInfo         *recurrent_weights,
+                            const ITensorInfo         *bias,
+                            const ITensorInfo         *hidden_state,
+                            const ITensorInfo         *output,
+                            const ActivationLayerInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F16, DataType::F32);
@@ -60,24 +74,34 @@
     ARM_COMPUTE_RETURN_ERROR_ON(hidden_state->dimension(idx_height) != input->dimension(idx_height));
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), hidden_state->tensor_shape());
 
-    auto shape_info = TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1, input->data_type());
+    auto shape_info =
+        TensorInfo(misc::shape_calculator::compute_rnn_shape(recurrent_weights, hidden_state->dimension(idx_height)), 1,
+                   input->data_type());
 
     ARM_COMPUTE_RETURN_ON_ERROR(NEFullyConnectedLayer::validate(input, weights, bias, &shape_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NEArithmeticAddition::validate(&shape_info, &shape_info, &shape_info, ConvertPolicy::SATURATE));
     ARM_COMPUTE_RETURN_ON_ERROR(NEActivationLayer::validate(&shape_info, &shape_info, info));
 
     return Status{};
 }
 
-void NERNNLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *recurrent_weights, const ITensor *bias, ITensor *hidden_state, ITensor *output,
+void NERNNLayer::configure(const ITensor       *input,
+                           const ITensor       *weights,
+                           const ITensor       *recurrent_weights,
+                           const ITensor       *bias,
+                           ITensor             *hidden_state,
+                           ITensor             *output,
                            ActivationLayerInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, recurrent_weights, bias, hidden_state, output);
-    ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(), bias->info(), hidden_state->info(), output->info(), info));
+    ARM_COMPUTE_ERROR_THROW_ON(NERNNLayer::validate(input->info(), weights->info(), recurrent_weights->info(),
+                                                    bias->info(), hidden_state->info(), output->info(), info));
     ARM_COMPUTE_LOG_PARAMS(input, weights, recurrent_weights, bias, hidden_state, output, info);
 
     const int   idx_height = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::HEIGHT);
-    TensorShape shape      = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(), hidden_state->info()->dimension(idx_height));
+    TensorShape shape      = misc::shape_calculator::compute_rnn_shape(recurrent_weights->info(),
+                                                                       hidden_state->info()->dimension(idx_height));
 
     _is_prepared = false;
 
@@ -125,7 +149,7 @@
 
 void NERNNLayer::prepare()
 {
-    if(!_is_prepared)
+    if (!_is_prepared)
     {
         _fully_connected.prepare();
         _gemm_state_f.prepare();
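configure() asserting on validate() via ARM_COMPUTE_ERROR_THROW_ON is the contract this file follows: callers can dry-run the tensor descriptors before allocating anything. A minimal usage sketch, assuming Status's explicit bool conversion reports success (the activation choice is illustrative):

#include "arm_compute/runtime/NEON/functions/NERNNLayer.h"

using namespace arm_compute;

bool rnn_config_is_valid(const ITensorInfo *in, const ITensorInfo *w, const ITensorInfo *rw,
                         const ITensorInfo *b, const ITensorInfo *hs, const ITensorInfo *out)
{
    const ActivationLayerInfo act(ActivationLayerInfo::ActivationFunction::TANH);
    return bool(NERNNLayer::validate(in, w, rw, b, hs, out, act)); // true when the config is supported
}
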
diff --git a/src/runtime/NEON/functions/NEROIAlignLayer.cpp b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
index a9bdb50..68bb5d5 100644
--- a/src/runtime/NEON/functions/NEROIAlignLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIAlignLayer.cpp
@@ -29,14 +29,20 @@
 
 namespace arm_compute
 {
-Status NEROIAlignLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIAlignLayer::validate(const ITensorInfo         *input,
+                                 const ITensorInfo         *rois,
+                                 ITensorInfo               *output,
+                                 const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(NEROIAlignLayerKernel::validate(input, rois, output, pool_info));
 
     return Status{};
 }
 
-void NEROIAlignLayer::configure(const ITensor *input, const ITensor *rois, ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIAlignLayer::configure(const ITensor             *input,
+                                const ITensor             *rois,
+                                ITensor                   *output,
+                                const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
 
diff --git a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
index a24f2aa..babec4a 100644
--- a/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
+++ b/src/runtime/NEON/functions/NEROIPoolingLayer.cpp
@@ -22,8 +22,10 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/NEON/functions/NEROIPoolingLayer.h"
+
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEROIPoolingLayerKernel.h"
 
@@ -31,17 +33,22 @@
 {
 NEROIPoolingLayer::~NEROIPoolingLayer() = default;
 
-NEROIPoolingLayer::NEROIPoolingLayer()
-    : _roi_kernel()
+NEROIPoolingLayer::NEROIPoolingLayer() : _roi_kernel()
 {
 }
 
-Status NEROIPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *rois, const ITensorInfo *output, const ROIPoolingLayerInfo &pool_info)
+Status NEROIPoolingLayer::validate(const ITensorInfo         *input,
+                                   const ITensorInfo         *rois,
+                                   const ITensorInfo         *output,
+                                   const ROIPoolingLayerInfo &pool_info)
 {
     return NEROIPoolingLayerKernel::validate(input, rois, output, pool_info);
 }
 
-void NEROIPoolingLayer::configure(const ITensor *input, const ITensor *rois, const ITensor *output, const ROIPoolingLayerInfo &pool_info)
+void NEROIPoolingLayer::configure(const ITensor             *input,
+                                  const ITensor             *rois,
+                                  const ITensor             *output,
+                                  const ROIPoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_LOG_PARAMS(input, rois, output, pool_info);
 
@@ -53,4 +60,4 @@
 {
     NEScheduler::get().schedule(_roi_kernel.get(), Window::DimX);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NERange.cpp b/src/runtime/NEON/functions/NERange.cpp
index a6f7be8..95492df 100644
--- a/src/runtime/NEON/functions/NERange.cpp
+++ b/src/runtime/NEON/functions/NERange.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NERange.h"
 
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NERangeKernel.h"
 
@@ -31,8 +32,7 @@
 {
 NERange::~NERange() = default;
 
-NERange::NERange()
-    : _kernel()
+NERange::NERange() : _kernel()
 {
 }
 
@@ -52,4 +52,4 @@
 {
     NEScheduler::get().schedule(_kernel.get(), Window::DimX);
 }
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEReduceMean.cpp b/src/runtime/NEON/functions/NEReduceMean.cpp
index 9f96479..d37cf4a 100644
--- a/src/runtime/NEON/functions/NEReduceMean.cpp
+++ b/src/runtime/NEON/functions/NEReduceMean.cpp
@@ -25,21 +25,24 @@
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/CPP/Validate.h"
-#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
 
 namespace arm_compute
 {
 namespace
 {
-Status validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status
+validate_config(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
 {
     ARM_COMPUTE_UNUSED(keep_dims);
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8,
+                                                         DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() < 1);
     ARM_COMPUTE_RETURN_ERROR_ON(reduction_axis.num_dimensions() > input->num_dimensions());
 
@@ -47,29 +50,29 @@
     const int          input_dims    = input->num_dimensions();
     Coordinates        axis_local    = reduction_axis;
 
-    for(unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
+    for (unsigned int i = 0; i < axis_local.num_dimensions(); ++i)
     {
         //axis: The dimensions to reduce. Must be in the range [-rank(input_tensor), rank(input_tensor)).
         ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] < (-static_cast<int>(input->num_dimensions())));
         ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] >= static_cast<int>(input->num_dimensions()));
     }
 
-    if(output->tensor_shape().total_size() != 0)
+    if (output->tensor_shape().total_size() != 0)
     {
         // Only validate if not using auto_init for the output tensor
         TensorShape out_shape = input->tensor_shape();
         // Validate output_shape only if not using auto_init
         convert_negative_axis(axis_local, input_dims);
         std::sort(axis_local.begin(), axis_local.begin() + reduction_ops);
-        for(unsigned int i = 0; i < reduction_ops; ++i)
+        for (unsigned int i = 0; i < reduction_ops; ++i)
         {
             ARM_COMPUTE_RETURN_ERROR_ON(axis_local[i] > 3);
             ARM_COMPUTE_RETURN_ERROR_ON(static_cast<unsigned int>(axis_local[i]) > input->num_dimensions() - 1);
-            if(output->total_size() > 0 && keep_dims)
+            if (output->total_size() > 0 && keep_dims)
             {
                 ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(axis_local[i]) != 1);
             }
-            if(keep_dims)
+            if (keep_dims)
             {
                 out_shape.set(axis_local[i], 1);
             }
@@ -91,11 +94,19 @@
 NEReduceMean::~NEReduceMean() = default;
 
 NEReduceMean::NEReduceMean(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(std::move(memory_manager)), _reduction_kernels(), _reduced_outs(), _reshape(), _reduction_ops(), _keep_dims()
+    : _memory_group(std::move(memory_manager)),
+      _reduction_kernels(),
+      _reduced_outs(),
+      _reshape(),
+      _reduction_ops(),
+      _keep_dims()
 {
 }
 
-Status NEReduceMean::validate(const ITensorInfo *input, const Coordinates &reduction_axis, bool keep_dims, const ITensorInfo *output)
+Status NEReduceMean::validate(const ITensorInfo *input,
+                              const Coordinates &reduction_axis,
+                              bool               keep_dims,
+                              const ITensorInfo *output)
 {
     return validate_config(input, reduction_axis, keep_dims, output);
 }
@@ -107,7 +118,8 @@
     // Perform validate step
     ARM_COMPUTE_ERROR_THROW_ON(NEReduceMean::validate(input->info(), reduction_axis, keep_dims, output->info()));
     // Output auto initialization if not yet initialized
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
+    const TensorShape output_shape =
+        arm_compute::misc::shape_calculator::calculate_reduce_mean_shape(input->info(), reduction_axis, keep_dims);
     auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
 
     _reduction_ops = reduction_axis.num_dimensions();
@@ -124,37 +136,40 @@
     convert_negative_axis(axis_local, input_dims);
 
     // Perform reduction for every axis
-    for(int i = 0; i < _reduction_ops; ++i)
+    for (int i = 0; i < _reduction_ops; ++i)
     {
-        TensorShape out_shape = i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
+        TensorShape out_shape =
+            i == 0 ? tmp_input->info()->tensor_shape() : (&_reduced_outs[i - 1])->info()->tensor_shape();
         out_shape.set(axis_local[i], 1);
         auto in = (i == 0) ? tmp_input : (&_reduced_outs[i - 1]);
 
-        if(i == _reduction_ops - 1 && keep_dims)
+        if (i == _reduction_ops - 1 && keep_dims)
         {
             _reduction_kernels[i].configure(in, tmp_output, axis_local[i], ReductionOperation::MEAN_SUM);
         }
         else
         {
-            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(), tmp_output->info()->data_type(), tmp_output->info()->quantization_info()));
+            _reduced_outs[i].allocator()->init(TensorInfo(out_shape, tmp_output->info()->num_channels(),
+                                                          tmp_output->info()->data_type(),
+                                                          tmp_output->info()->quantization_info()));
             _memory_group.manage(&_reduced_outs[i]);
             _reduction_kernels[i].configure(in, &_reduced_outs[i], axis_local[i], ReductionOperation::MEAN_SUM);
         }
     }
 
     // Allocate intermediate tensors
-    for(int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
+    for (int i = 0; i < _reduction_ops - (keep_dims ? 1 : 0); ++i)
     {
         _reduced_outs[i].allocator()->allocate();
     }
     // Configure reshape layer if we want to drop the dimensions
-    if(!keep_dims)
+    if (!keep_dims)
     {
         TensorShape out_shape = tmp_input->info()->tensor_shape();
         // We have to sort the reduction axis vectors in order for remove_dimension
         // to work properly
         std::sort(axis_local.begin(), axis_local.begin() + _reduction_ops);
-        for(int i = 0; i < _reduction_ops; ++i)
+        for (int i = 0; i < _reduction_ops; ++i)
         {
             out_shape.remove_dimension(axis_local[i] - i, false);
         }
@@ -166,11 +181,11 @@
 void NEReduceMean::run()
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
-    for(auto &kernel : _reduction_kernels)
+    for (auto &kernel : _reduction_kernels)
     {
         kernel.run();
     }
-    if(!_keep_dims)
+    if (!_keep_dims)
     {
         _reshape.run();
     }
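The swap of the two src/core includes in the NEReduceMean header block above is what case-insensitive include sorting produces. A sketch, assuming SortIncludes: CaseInsensitive in the undisclosed config:

    // Case-sensitive ASCII ordering puts 'N' (0x4E) before 'h' (0x68):
    //   src/core/NEON/kernels/...  then  src/core/helpers/...
    // Case-insensitive ordering compares 'h' < 'n', giving the order seen in this patch:
    #include "src/core/helpers/AutoConfiguration.h"
    #include "src/core/NEON/kernels/NEReductionOperationKernel.h"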
diff --git a/src/runtime/NEON/functions/NEReductionOperation.cpp b/src/runtime/NEON/functions/NEReductionOperation.cpp
index 9660347..8540d75 100644
--- a/src/runtime/NEON/functions/NEReductionOperation.cpp
+++ b/src/runtime/NEON/functions/NEReductionOperation.cpp
@@ -26,9 +26,10 @@
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
-#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
 #include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/NEON/kernels/NEReductionOperationKernel.h"
 
 namespace arm_compute
 {
@@ -42,7 +43,7 @@
  */
 size_t reduction_window_split_dimension(unsigned int axis)
 {
-    switch(axis)
+    switch (axis)
     {
         case 0:
             return Window::DimY;
@@ -59,13 +60,21 @@
 NEReductionOperation::~NEReductionOperation() = default;
 
 NEReductionOperation::NEReductionOperation(std::shared_ptr<IMemoryManager> memory_manager)
-    : _memory_group(memory_manager), _reduction_kernel(), _reshape(), _output_internal(), _window_split(0), _reduction_axis(), _is_reshape_required(false)
+    : _memory_group(memory_manager),
+      _reduction_kernel(),
+      _reshape(),
+      _output_internal(),
+      _window_split(0),
+      _reduction_axis(),
+      _is_reshape_required(false)
 {
 }
 
-Status NEReductionOperation::validate(const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+Status NEReductionOperation::validate(
+    const ITensorInfo *input, const ITensorInfo *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions, "Reduction axis greater than max number of dimensions");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis >= TensorShape::num_max_dimensions,
+                                    "Reduction axis greater than max number of dimensions");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(axis > 3, "Unsupported reduction axis");
 
     const auto is_reshape_required = !keep_dims;
@@ -74,9 +83,10 @@
 
     TensorInfo info_before_reshape;
 
-    if(is_reshape_required)
+    if (is_reshape_required)
     {
-        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
+        const TensorInfo expected_output_shape = output->clone()->set_tensor_shape(
+            arm_compute::misc::shape_calculator::compute_reduced_shape(input->tensor_shape(), axis, keep_dims));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&expected_output_shape, output);
 
         auto shape_before_reshape = input->tensor_shape();
@@ -84,17 +94,20 @@
 
         const auto input_num_channles = input->num_channels();
         const auto input_qinfo        = input->quantization_info();
-        const auto is_arg_min_max     = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
-        const auto output_data_type   = is_arg_min_max ? DataType::S32 : output->data_type();
+        const auto is_arg_min_max = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
+        const auto output_data_type = is_arg_min_max ? DataType::S32 : output->data_type();
 
-        info_before_reshape.set_data_type(output_data_type).set_tensor_shape(shape_before_reshape).set_num_channels(input_num_channles).set_quantization_info(input_qinfo);
+        info_before_reshape.set_data_type(output_data_type)
+            .set_tensor_shape(shape_before_reshape)
+            .set_num_channels(input_num_channles)
+            .set_quantization_info(input_qinfo);
 
         output_internal = &info_before_reshape;
     }
 
     ARM_COMPUTE_RETURN_ON_ERROR(NEReductionOperationKernel::validate(input, output_internal, axis, op));
 
-    if(is_reshape_required)
+    if (is_reshape_required)
     {
         ARM_COMPUTE_RETURN_ON_ERROR(NEReshapeLayer::validate(output_internal, output));
     }
@@ -102,7 +115,8 @@
     return Status{};
 }
 
-void NEReductionOperation::configure(ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
+void NEReductionOperation::configure(
+    ITensor *input, ITensor *output, unsigned int axis, ReductionOperation op, bool keep_dims)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_LOG_PARAMS(input, output, axis, op, keep_dims);
@@ -112,19 +126,32 @@
     auto      *output_internal = output;
     const auto is_arg_min_max  = (op == ReductionOperation::ARG_IDX_MAX) || (op == ReductionOperation::ARG_IDX_MIN);
 
-    if(_is_reshape_required)
+    if (_is_reshape_required)
     {
-        const auto output_internal_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
-        const auto output_external_shape = arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
-        const auto output_data_type      = is_arg_min_max ? DataType::S32 : input->info()->data_type();
-        const auto num_channels          = input->info()->num_channels();
-        const auto qinfo                 = input->info()->quantization_info();
+        const auto output_internal_shape =
+            arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis);
+        const auto output_external_shape =
+            arm_compute::misc::shape_calculator::compute_reduced_shape(input->info()->tensor_shape(), axis, false);
+        const auto output_data_type = is_arg_min_max ? DataType::S32 : input->info()->data_type();
+        const auto num_channels     = input->info()->num_channels();
+        const auto qinfo            = input->info()->quantization_info();
 
-        _output_internal.allocator()->init(input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_internal_shape).reset_padding().set_is_resizable(true).set_num_channels(
-                                               num_channels).set_quantization_info(qinfo));
+        _output_internal.allocator()->init(input->info()
+                                               ->clone()
+                                               ->set_data_type(output_data_type)
+                                               .set_tensor_shape(output_internal_shape)
+                                               .reset_padding()
+                                               .set_is_resizable(true)
+                                               .set_num_channels(num_channels)
+                                               .set_quantization_info(qinfo));
         _memory_group.manage(&_output_internal);
         output_internal = &_output_internal;
-        auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_data_type).set_tensor_shape(output_external_shape).reset_padding().set_is_resizable(true));
+        auto_init_if_empty(*output->info(), input->info()
+                                                ->clone()
+                                                ->set_data_type(output_data_type)
+                                                .set_tensor_shape(output_external_shape)
+                                                .reset_padding()
+                                                .set_is_resizable(true));
     }
 
     ARM_COMPUTE_ERROR_THROW_ON(NEReductionOperation::validate(input->info(), output->info(), axis, op, keep_dims));
@@ -135,7 +162,7 @@
     _window_split   = reduction_window_split_dimension(axis);
     _reduction_axis = axis;
 
-    if(_is_reshape_required)
+    if (_is_reshape_required)
     {
         _reshape.configure(output_internal, output);
         _output_internal.allocator()->allocate();
@@ -146,7 +173,7 @@
 {
     MemoryGroupResourceScope scope_mg(_memory_group);
     NEScheduler::get().schedule(_reduction_kernel.get(), _window_split);
-    if(_is_reshape_required)
+    if (_is_reshape_required)
     {
         _reshape.run();
     }
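The info_before_reshape and _output_internal hunks above show the new treatment of fluent setter chains: a chain that exceeds the column limit is broken with one member call per line, continuation-indented under the object. A self-contained sketch of the pattern (the 120-column limit is an assumption; clang-format keeps a chain on one line while it fits):

    struct Builder
    {
        Builder &set_data_type(int) { return *this; }
        Builder &set_tensor_shape(int) { return *this; }
        Builder &set_quantization_info(int) { return *this; }
    };

    void example(Builder &info)
    {
        // Broken one call per line once the joined form would exceed the limit:
        info.set_data_type(0)
            .set_tensor_shape(1)
            .set_quantization_info(2);
    }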
diff --git a/src/runtime/NEON/functions/NEReorderLayer.cpp b/src/runtime/NEON/functions/NEReorderLayer.cpp
index 427bf8c..89cf575 100644
--- a/src/runtime/NEON/functions/NEReorderLayer.cpp
+++ b/src/runtime/NEON/functions/NEReorderLayer.cpp
@@ -23,20 +23,24 @@
  */
 #if defined(__aarch64__)
 
-#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NEReorderLayer.h"
+
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/core/NEON/kernels/NEReorderKernel.h"
 
 namespace arm_compute
 {
 NEReorderLayer::~NEReorderLayer() = default;
 
-NEReorderLayer::NEReorderLayer()
-    : _reorder_kernel(std::make_unique<NEReorderKernel>())
+NEReorderLayer::NEReorderLayer() : _reorder_kernel(std::make_unique<NEReorderKernel>())
 {
 }
 
-void NEReorderLayer::configure(const ITensor *input, ITensor *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+void NEReorderLayer::configure(const ITensor            *input,
+                               ITensor                  *output,
+                               arm_compute::WeightFormat input_wf,
+                               arm_compute::WeightFormat output_wf)
 {
     auto k = std::make_unique<NEReorderKernel>();
     k->configure(input, output, input_wf, output_wf);
@@ -49,11 +53,14 @@
     NEScheduler::get().schedule(_reorder_kernel.get(), Window::DimX);
 }
 
-Status NEReorderLayer::validate(const ITensorInfo *input, const ITensorInfo *output, arm_compute::WeightFormat input_wf, arm_compute::WeightFormat output_wf)
+Status NEReorderLayer::validate(const ITensorInfo        *input,
+                                const ITensorInfo        *output,
+                                arm_compute::WeightFormat input_wf,
+                                arm_compute::WeightFormat output_wf)
 {
     return NEReorderKernel::validate(input, output, input_wf, output_wf);
 }
 
 } // namespace arm_compute
 
-#endif  // defined(__aarch64__)
\ No newline at end of file
+#endif // defined(__aarch64__)
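The blank lines inserted between the include groups above (the file's own header, public arm_compute/ headers, internal src/ headers) are consistent with regrouped include categories. A sketch of the assumed effect; the category ranking below is a guess, not the actual configuration:

    // Assumed: IncludeBlocks: Regroup with categories ranked roughly as follows.
    #include "arm_compute/runtime/NEON/functions/NEReorderLayer.h" // 1) matching header first

    #include "arm_compute/runtime/NEON/NEScheduler.h" // 2) public arm_compute headers

    #include "src/core/NEON/kernels/NEReorderKernel.h" // 3) internal src headers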
diff --git a/src/runtime/NEON/functions/NEReorgLayer.cpp b/src/runtime/NEON/functions/NEReorgLayer.cpp
index 8ee73d7..14e41d6 100644
--- a/src/runtime/NEON/functions/NEReorgLayer.cpp
+++ b/src/runtime/NEON/functions/NEReorgLayer.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEReorgLayer.h"
 
-#include "src/core/NEON/kernels/NEReorgLayerKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEReorgLayerKernel.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NEReshapeLayer.cpp b/src/runtime/NEON/functions/NEReshapeLayer.cpp
index 3ccb423..bed70ff 100644
--- a/src/runtime/NEON/functions/NEReshapeLayer.cpp
+++ b/src/runtime/NEON/functions/NEReshapeLayer.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
 
 #include "arm_compute/core/Validate.h"
+
 #include "src/cpu/operators/CpuReshape.h"
 
 #include <utility>
@@ -32,16 +33,15 @@
 {
 struct NEReshapeLayer::Impl
 {
-    const ITensor                   *src{ nullptr };
-    ITensor                         *dst{ nullptr };
-    std::unique_ptr<cpu::CpuReshape> op{ nullptr };
+    const ITensor                   *src{nullptr};
+    ITensor                         *dst{nullptr};
+    std::unique_ptr<cpu::CpuReshape> op{nullptr};
 };
 
-NEReshapeLayer::NEReshapeLayer()
-    : _impl(std::make_unique<Impl>())
+NEReshapeLayer::NEReshapeLayer() : _impl(std::make_unique<Impl>())
 {
 }
-NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&) = default;
+NEReshapeLayer::NEReshapeLayer(NEReshapeLayer &&)            = default;
 NEReshapeLayer &NEReshapeLayer::operator=(NEReshapeLayer &&) = default;
 NEReshapeLayer::~NEReshapeLayer()                            = default;
 
diff --git a/src/runtime/NEON/functions/NEReverse.cpp b/src/runtime/NEON/functions/NEReverse.cpp
index e1988f2..a90f8d2 100644
--- a/src/runtime/NEON/functions/NEReverse.cpp
+++ b/src/runtime/NEON/functions/NEReverse.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NEReverse.h"
 
-#include "src/core/NEON/kernels/NEReverseKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NEReverseKernel.h"
 
 namespace arm_compute
 {
@@ -38,7 +37,10 @@
     _kernel = std::move(k);
 }
 
-Status NEReverse::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *axis, bool use_inverted_axis)
+Status NEReverse::validate(const ITensorInfo *input,
+                           const ITensorInfo *output,
+                           const ITensorInfo *axis,
+                           bool               use_inverted_axis)
 {
     return NEReverseKernel::validate(input, output, axis, use_inverted_axis);
 }
diff --git a/src/runtime/NEON/functions/NEScale.cpp b/src/runtime/NEON/functions/NEScale.cpp
index 09f0373..0d01106 100644
--- a/src/runtime/NEON/functions/NEScale.cpp
+++ b/src/runtime/NEON/functions/NEScale.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NEScale.h"
 
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/utils/ScaleUtils.h"
 #include "src/cpu/operators/CpuScale.h"
@@ -32,16 +33,16 @@
 {
 struct NEScale::Impl
 {
-    const ITensor                 *src{ nullptr };
-    ITensor                       *dst{ nullptr };
-    Tensor                         dx{ nullptr };      /**< Element's distance between the X real coordinate and the smallest X following integer */
-    Tensor                         dy{ nullptr };      /**< Element's distance between the Y real coordinate and the smallest Y following integer */
-    Tensor                         offsets{ nullptr }; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
-    std::unique_ptr<cpu::CpuScale> op{ nullptr };
+    const ITensor *src{nullptr};
+    ITensor       *dst{nullptr};
+    Tensor dx{nullptr}; /**< Element's distance between the X real coordinate and the smallest X following integer */
+    Tensor dy{nullptr}; /**< Element's distance between the Y real coordinate and the smallest Y following integer */
+    Tensor offsets{
+        nullptr}; /**< Offset to access the element with NEAREST interpolation or the top-left element with BILINEAR interpolation in the input tensor */
+    std::unique_ptr<cpu::CpuScale> op{nullptr};
 };
 
-NEScale::NEScale()
-    : _impl(std::make_unique<Impl>())
+NEScale::NEScale() : _impl(std::make_unique<Impl>())
 {
 }
 NEScale::~NEScale() = default;
@@ -57,25 +58,33 @@
 
     // Configure for size of allocation of internal tensors
     // Get data layout and width/height indices
-    const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout;
-    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const DataLayout data_layout =
+        info.data_layout == DataLayout::UNKNOWN ? input->info()->data_layout() : info.data_layout;
+    const int idx_width  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
 
     // Compute the ratio between source width/height and destination width/height
-    const bool is_align_corners_used = info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
-    const auto wr                    = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used);
-    const auto hr                    = arm_compute::scale_utils::calculate_resize_ratio(input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used);
+    const bool is_align_corners_used =
+        info.align_corners && arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy);
+    const auto wr = arm_compute::scale_utils::calculate_resize_ratio(
+        input->info()->dimension(idx_width), output->info()->dimension(idx_width), is_align_corners_used);
+    const auto hr = arm_compute::scale_utils::calculate_resize_ratio(
+        input->info()->dimension(idx_height), output->info()->dimension(idx_height), is_align_corners_used);
 
     // Area interpolation behaves as Nearest Neighbour in case of up-sampling
-    InterpolationPolicy policy_to_use = (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f) ? InterpolationPolicy::NEAREST_NEIGHBOR : info.interpolation_policy;
+    InterpolationPolicy policy_to_use =
+        (info.interpolation_policy == InterpolationPolicy::AREA && wr <= 1.f && hr <= 1.f)
+            ? InterpolationPolicy::NEAREST_NEIGHBOR
+            : info.interpolation_policy;
 
     // Get the tensor shape
     TensorShape shape(output->info()->dimension(idx_width));
     shape.set(1, output->info()->dimension(idx_height), false);
 
-    bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(data_layout, input->info()->data_type(), policy_to_use, info.border_mode);
+    bool precompute_indices_weights = arm_compute::scale_utils::is_precomputation_required(
+        data_layout, input->info()->data_type(), policy_to_use, info.border_mode);
 
-    if(precompute_indices_weights)
+    if (precompute_indices_weights)
     {
         const TensorInfo tensor_info_dxdy(shape, Format::F32);
         const TensorInfo tensor_info_offsets(shape, Format::S32);
@@ -83,7 +92,7 @@
         _impl->dx.allocator()->init(tensor_info_dxdy);
         _impl->dy.allocator()->init(tensor_info_dxdy);
         _impl->offsets.allocator()->init(tensor_info_offsets);
-        switch(policy_to_use)
+        switch (policy_to_use)
         {
             case InterpolationPolicy::NEAREST_NEIGHBOR:
             {
@@ -109,7 +118,8 @@
     }
     else
     {
-        if(policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR && policy_to_use != InterpolationPolicy::AREA)
+        if (policy_to_use != InterpolationPolicy::NEAREST_NEIGHBOR && policy_to_use != InterpolationPolicy::BILINEAR &&
+            policy_to_use != InterpolationPolicy::AREA)
         {
             ARM_COMPUTE_ERROR("Unsupported interpolation mode");
         }
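The policy_to_use hunk above illustrates how an over-long conditional expression is now wrapped: the formatter breaks after the `=`, then before `?` and `:`, indenting the two branches. A compilable sketch (the break only triggers past the assumed 120-column limit; names here are illustrative, not from the library):

    enum class InterpPolicy { AREA, NEAREST_NEIGHBOR, BILINEAR };

    InterpPolicy pick(InterpPolicy requested, float width_ratio, float height_ratio)
    {
        const InterpPolicy policy_to_use =
            (requested == InterpPolicy::AREA && width_ratio <= 1.f && height_ratio <= 1.f)
                ? InterpPolicy::NEAREST_NEIGHBOR
                : requested;
        return policy_to_use;
    }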
diff --git a/src/runtime/NEON/functions/NESelect.cpp b/src/runtime/NEON/functions/NESelect.cpp
index 26c2eb8..55cad22 100644
--- a/src/runtime/NEON/functions/NESelect.cpp
+++ b/src/runtime/NEON/functions/NESelect.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NESelect.h"
 
 #include "arm_compute/core/Types.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NESelectKernel.h"
 
diff --git a/src/runtime/NEON/functions/NESlice.cpp b/src/runtime/NEON/functions/NESlice.cpp
index 4a8912b..12d43ad 100644
--- a/src/runtime/NEON/functions/NESlice.cpp
+++ b/src/runtime/NEON/functions/NESlice.cpp
@@ -25,8 +25,9 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/helpers/tensor_transform.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEStridedSliceKernel.h"
 
@@ -34,7 +35,10 @@
 {
 namespace experimental
 {
-void NESlice::configure(const ITensorInfo *input, ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+void NESlice::configure(const ITensorInfo *input,
+                        ITensorInfo       *output,
+                        const Coordinates &starts,
+                        const Coordinates &ends)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends);
@@ -47,15 +51,16 @@
     _kernel = std::move(k);
 }
 
-Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status NESlice::validate(const ITensorInfo *input,
+                         const ITensorInfo *output,
+                         const Coordinates &starts,
+                         const Coordinates &ends)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
 
     // Check start dimensions for being non-negative
-    ARM_COMPUTE_RETURN_ERROR_ON(std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i)
-    {
-        return i < 0;
-    }));
+    ARM_COMPUTE_RETURN_ERROR_ON(
+        std::any_of(starts.cbegin(), starts.cbegin() + starts.num_dimensions(), [](int i) { return i < 0; }));
 
     // Get absolute end coordinates
     const int32_t slice_end_mask = arm_compute::helpers::tensor_transform::construct_slice_end_mask(ends);
@@ -66,20 +71,22 @@
 
 struct NESlice::Impl
 {
-    const ITensor                         *src{ nullptr };
-    ITensor                               *dst{ nullptr };
-    std::unique_ptr<experimental::NESlice> op{ nullptr };
+    const ITensor                         *src{nullptr};
+    ITensor                               *dst{nullptr};
+    std::unique_ptr<experimental::NESlice> op{nullptr};
 };
 
-NESlice::NESlice()
-    : _impl(std::make_unique<Impl>())
+NESlice::NESlice() : _impl(std::make_unique<Impl>())
 {
 }
-NESlice::NESlice(NESlice &&) = default;
+NESlice::NESlice(NESlice &&)            = default;
 NESlice &NESlice::operator=(NESlice &&) = default;
 NESlice::~NESlice()                     = default;
 
-Status NESlice::validate(const ITensorInfo *input, const ITensorInfo *output, const Coordinates &starts, const Coordinates &ends)
+Status NESlice::validate(const ITensorInfo *input,
+                         const ITensorInfo *output,
+                         const Coordinates &starts,
+                         const Coordinates &ends)
 {
     return experimental::NESlice::validate(input, output, starts, ends);
 }
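The std::any_of hunk above collapses a trivial lambda onto a single line instead of the old three-line body. A sketch, assuming something like AllowShortLambdasOnASingleLine: All in the revised config:

    #include <algorithm>
    #include <vector>

    bool any_negative(const std::vector<int> &v)
    {
        // Body short enough to stay inline after reformatting:
        return std::any_of(v.cbegin(), v.cend(), [](int i) { return i < 0; });
    }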
diff --git a/src/runtime/NEON/functions/NESoftmaxLayer.cpp b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
index 0947ff9..e3c2012 100644
--- a/src/runtime/NEON/functions/NESoftmaxLayer.cpp
+++ b/src/runtime/NEON/functions/NESoftmaxLayer.cpp
@@ -22,9 +22,11 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/NEON/functions/NESoftmaxLayer.h"
+
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/MemoryGroup.h"
 #include "arm_compute/runtime/Tensor.h"
+
 #include "src/core/helpers/MemoryHelpers.h"
 #include "src/core/helpers/SoftmaxHelpers.h"
 #include "src/cpu/kernels/CpuSoftmaxKernel.h"
@@ -35,10 +37,10 @@
 template <bool IS_LOG>
 struct NESoftmaxLayerGeneric<IS_LOG>::Impl
 {
-    const ITensor                                  *src{ nullptr };
-    ITensor                                        *dst{ nullptr };
-    Tensor                                          max{ nullptr };
-    std::unique_ptr<cpu::CpuSoftmaxGeneric<IS_LOG>> op{ nullptr };
+    const ITensor                                  *src{nullptr};
+    ITensor                                        *dst{nullptr};
+    Tensor                                          max{nullptr};
+    std::unique_ptr<cpu::CpuSoftmaxGeneric<IS_LOG>> op{nullptr};
     MemoryGroup                                     memory_group{};
     ITensorPack                                     run_pack{};
     WorkspaceData<Tensor>                           workspace_tensors{};
@@ -53,9 +55,9 @@
 
 template <bool IS_LOG>
 NESoftmaxLayerGeneric<IS_LOG>::NESoftmaxLayerGeneric(NESoftmaxLayerGeneric &&) = default;
-template <bool                 IS_LOG>
+template <bool IS_LOG>
 NESoftmaxLayerGeneric<IS_LOG> &NESoftmaxLayerGeneric<IS_LOG>::operator=(NESoftmaxLayerGeneric &&) = default;
-template <bool                 IS_LOG>
+template <bool IS_LOG>
 NESoftmaxLayerGeneric<IS_LOG>::~NESoftmaxLayerGeneric() = default;
 
 template <bool IS_LOG>
@@ -68,12 +70,13 @@
     _impl->op  = std::make_unique<cpu::CpuSoftmaxGeneric<IS_LOG>>();
     _impl->op->configure(input->info(), output->info(), beta, axis);
 
-    _impl->run_pack          = { { TensorType::ACL_SRC, _impl->src }, { TensorType::ACL_DST, _impl->dst } };
+    _impl->run_pack          = {{TensorType::ACL_SRC, _impl->src}, {TensorType::ACL_DST, _impl->dst}};
     _impl->workspace_tensors = manage_workspace<Tensor>(_impl->op->workspace(), _impl->memory_group, _impl->run_pack);
 }
 
 template <bool IS_LOG>
-Status NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
+Status
+NESoftmaxLayerGeneric<IS_LOG>::validate(const ITensorInfo *input, const ITensorInfo *output, float beta, int32_t axis)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
     ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuSoftmaxGeneric<IS_LOG>::validate(input, output, beta, axis));
@@ -81,7 +84,7 @@
 }
 
 template <bool IS_LOG>
-void           NESoftmaxLayerGeneric<IS_LOG>::run()
+void NESoftmaxLayerGeneric<IS_LOG>::run()
 {
     // Acquire all the temporaries
     MemoryGroupResourceScope scope_mg(_impl->memory_group);
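The template hunks above undo an artefact of the old configuration, which let declaration alignment leak into template heads (template <bool                 IS_LOG>); the padding now applies only to genuinely consecutive assignments, such as the defaulted special members seen in this and the NESlice hunks. A sketch of that alignment, assuming AlignConsecutiveAssignments is enabled:

    struct Movable
    {
        Movable()                      = default;
        Movable(Movable &&)            = default;
        Movable &operator=(Movable &&) = default;
        ~Movable()                     = default;
    };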
diff --git a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
index c450951..556ebdd 100644
--- a/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToBatchLayer.cpp
@@ -28,8 +28,9 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
 #include "arm_compute/runtime/NEON/functions/NEFill.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NESpaceToBatchLayerKernel.h"
 
@@ -37,17 +38,19 @@
 {
 NESpaceToBatchLayer::~NESpaceToBatchLayer() = default;
 
-NESpaceToBatchLayer::NESpaceToBatchLayer()
-    : _space_to_batch_kernel(), _fill_f(), _has_padding(false)
+NESpaceToBatchLayer::NESpaceToBatchLayer() : _space_to_batch_kernel(), _fill_f(), _has_padding(false)
 {
 }
 
-void NESpaceToBatchLayer::configure(const ITensor *input, const ITensor *block_shape, const ITensor *paddings, ITensor *output)
+void NESpaceToBatchLayer::configure(const ITensor *input,
+                                    const ITensor *block_shape,
+                                    const ITensor *paddings,
+                                    ITensor       *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
     ARM_COMPUTE_LOG_PARAMS(input, block_shape, paddings, output);
 
-    if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+    if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
         _fill_f      = std::make_unique<NEFill>();
@@ -57,11 +60,16 @@
     _space_to_batch_kernel->configure(input, block_shape, paddings, output);
 }
 
-void NESpaceToBatchLayer::configure(const ITensor *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right, ITensor *output)
+void NESpaceToBatchLayer::configure(const ITensor *input,
+                                    const int      block_shape_x,
+                                    const int      block_shape_y,
+                                    const Size2D  &padding_left,
+                                    const Size2D  &padding_right,
+                                    ITensor       *output)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
-    if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
+    if (input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
         _fill_f      = std::make_unique<NEFill>();
@@ -71,17 +79,25 @@
     _space_to_batch_kernel->configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
 }
 
-Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input,
+                                     const ITensorInfo *block_shape,
+                                     const ITensorInfo *paddings,
+                                     const ITensorInfo *output)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
 
     return Status{};
 }
 
-Status NESpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
+Status NESpaceToBatchLayer::validate(const ITensorInfo *input,
+                                     const int          block_shape_x,
+                                     const int          block_shape_y,
+                                     const Size2D      &padding_left,
+                                     const Size2D      &padding_right,
                                      const ITensorInfo *output)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+    ARM_COMPUTE_RETURN_ON_ERROR(
+        NESpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
 
     return Status{};
 }
@@ -89,7 +105,7 @@
 void NESpaceToBatchLayer::run()
 {
     // Zero out output only if we have paddings
-    if(_has_padding)
+    if (_has_padding)
     {
         _fill_f->run();
     }
diff --git a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
index b37bf0d..846b619 100644
--- a/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
+++ b/src/runtime/NEON/functions/NESpaceToDepthLayer.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NESpaceToDepthLayerKernel.h"
 
@@ -36,8 +37,7 @@
 {
 NESpaceToDepthLayer::~NESpaceToDepthLayer() = default;
 
-NESpaceToDepthLayer::NESpaceToDepthLayer()
-    : _space_to_depth_kernel()
+NESpaceToDepthLayer::NESpaceToDepthLayer() : _space_to_depth_kernel()
 {
 }
 
diff --git a/src/runtime/NEON/functions/NESplit.cpp b/src/runtime/NEON/functions/NESplit.cpp
index db19bbb..53b09e9 100644
--- a/src/runtime/NEON/functions/NESplit.cpp
+++ b/src/runtime/NEON/functions/NESplit.cpp
@@ -34,7 +34,7 @@
 {
 void NESplit::run()
 {
-    for(unsigned i = 0; i < _num_outputs; ++i)
+    for (unsigned i = 0; i < _num_outputs; ++i)
     {
         _slice_functions[i].run();
     }
diff --git a/src/runtime/NEON/functions/NEStackLayer.cpp b/src/runtime/NEON/functions/NEStackLayer.cpp
index 68554e0..03e7026 100644
--- a/src/runtime/NEON/functions/NEStackLayer.cpp
+++ b/src/runtime/NEON/functions/NEStackLayer.cpp
@@ -30,6 +30,7 @@
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/NEON/NEScheduler.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEStackLayerKernel.h"
 
@@ -38,9 +39,7 @@
 NEStackLayer::~NEStackLayer() = default;
 
 NEStackLayer::NEStackLayer() // NOLINT
-    : _input(),
-      _stack_kernels(),
-      _num_inputs(0)
+    : _input(), _stack_kernels(), _num_inputs(0)
 {
 }
 
@@ -54,7 +53,7 @@
     // Wrap around negative values
     const unsigned int axis_u = wrap_around(axis, static_cast<int>(input[0]->info()->num_dimensions() + 1));
 
-    for(unsigned int i = 0; i < _num_inputs; i++)
+    for (unsigned int i = 0; i < _num_inputs; i++)
     {
         _stack_kernels[i] = std::make_unique<NEStackLayerKernel>();
         _stack_kernels[i]->configure(input[i], axis_u, i, _num_inputs, output);
@@ -72,7 +71,7 @@
 
     const unsigned int num_inputs = input.size();
 
-    for(unsigned int i = 0; i < num_inputs; i++)
+    for (unsigned int i = 0; i < num_inputs; i++)
     {
         // All the tensors must have the same rank
         ARM_COMPUTE_RETURN_ERROR_ON(input[i]->num_dimensions() != rank);
@@ -85,7 +84,7 @@
 
 void NEStackLayer::run()
 {
-    for(unsigned i = 0; i < _num_inputs; i++)
+    for (unsigned i = 0; i < _num_inputs; i++)
     {
         NEScheduler::get().schedule(_stack_kernels[i].get(), Window::DimY);
     }
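The constructor hunks throughout this patch follow one rule: the initializer list is pulled onto the signature line when it fits, kept whole on the next line when that fits (as in NEStackLayer above), and otherwise broken one member per line (as in NEReduceMean earlier). This matches PackConstructorInitializers: NextLine, which is an assumption about the undisclosed config. A sketch:

    struct Small
    {
        int a;
        Small() : a(0) {} // initializer fits on the signature line
    };

    struct Wide
    {
        int a, b, c;
        Wide() // signature line too long: all initializers move to the next line
            : a(0), b(1), c(2)
        {
        }
    };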
diff --git a/src/runtime/NEON/functions/NEStridedSlice.cpp b/src/runtime/NEON/functions/NEStridedSlice.cpp
index 4f50749..6a3ac8b 100644
--- a/src/runtime/NEON/functions/NEStridedSlice.cpp
+++ b/src/runtime/NEON/functions/NEStridedSlice.cpp
@@ -25,6 +25,7 @@
 
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/common/utils/Log.h"
 #include "src/core/NEON/kernels/NEStridedSliceKernel.h"
 
@@ -32,9 +33,14 @@
 {
 namespace experimental
 {
-void NEStridedSlice::configure(const ITensorInfo *input, ITensorInfo *output,
-                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void NEStridedSlice::configure(const ITensorInfo *input,
+                               ITensorInfo       *output,
+                               const Coordinates &starts,
+                               const Coordinates &ends,
+                               const BiStrides   &strides,
+                               int32_t            begin_mask,
+                               int32_t            end_mask,
+                               int32_t            shrink_axis_mask)
 {
     ARM_COMPUTE_LOG_PARAMS(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
 
@@ -43,9 +49,14 @@
     _kernel = std::move(k);
 }
 
-Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status NEStridedSlice::validate(const ITensorInfo *input,
+                                const ITensorInfo *output,
+                                const Coordinates &starts,
+                                const Coordinates &ends,
+                                const BiStrides   &strides,
+                                int32_t            begin_mask,
+                                int32_t            end_mask,
+                                int32_t            shrink_axis_mask)
 {
     return NEStridedSliceKernel::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
 }
@@ -53,22 +64,26 @@
 
 struct NEStridedSlice::Impl
 {
-    const ITensor                                *src{ nullptr };
-    ITensor                                      *dst{ nullptr };
-    std::unique_ptr<experimental::NEStridedSlice> op{ nullptr };
+    const ITensor                                *src{nullptr};
+    ITensor                                      *dst{nullptr};
+    std::unique_ptr<experimental::NEStridedSlice> op{nullptr};
 };
 
-NEStridedSlice::NEStridedSlice()
-    : _impl(std::make_unique<Impl>())
+NEStridedSlice::NEStridedSlice() : _impl(std::make_unique<Impl>())
 {
 }
-NEStridedSlice::NEStridedSlice(NEStridedSlice &&) = default;
+NEStridedSlice::NEStridedSlice(NEStridedSlice &&)            = default;
 NEStridedSlice &NEStridedSlice::operator=(NEStridedSlice &&) = default;
 NEStridedSlice::~NEStridedSlice()                            = default;
 
-void NEStridedSlice::configure(const ITensor *input, ITensor *output,
-                               const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                               int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+void NEStridedSlice::configure(const ITensor     *input,
+                               ITensor           *output,
+                               const Coordinates &starts,
+                               const Coordinates &ends,
+                               const BiStrides   &strides,
+                               int32_t            begin_mask,
+                               int32_t            end_mask,
+                               int32_t            shrink_axis_mask)
 {
     _impl->src = input;
     _impl->dst = output;
@@ -84,10 +99,16 @@
     _impl->op->run(pack);
 }
 
-Status NEStridedSlice::validate(const ITensorInfo *input, const ITensorInfo *output,
-                                const Coordinates &starts, const Coordinates &ends, const BiStrides &strides,
-                                int32_t begin_mask, int32_t end_mask, int32_t shrink_axis_mask)
+Status NEStridedSlice::validate(const ITensorInfo *input,
+                                const ITensorInfo *output,
+                                const Coordinates &starts,
+                                const Coordinates &ends,
+                                const BiStrides   &strides,
+                                int32_t            begin_mask,
+                                int32_t            end_mask,
+                                int32_t            shrink_axis_mask)
 {
-    return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask, shrink_axis_mask);
+    return experimental::NEStridedSlice::validate(input, output, starts, ends, strides, begin_mask, end_mask,
+                                                  shrink_axis_mask);
 }
 } // namespace arm_compute
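The { nullptr } to {nullptr} changes across the Impl structs in this patch come from dropping the interior spaces of braced initializer lists, consistent with Cpp11BracedListStyle: true (assumed). A sketch:

    struct Impl
    {
        const int *src{nullptr}; // previously written as src{ nullptr }
        int       *dst{nullptr};
    };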
diff --git a/src/runtime/NEON/functions/NETile.cpp b/src/runtime/NEON/functions/NETile.cpp
index 526603f..d10b1c8 100644
--- a/src/runtime/NEON/functions/NETile.cpp
+++ b/src/runtime/NEON/functions/NETile.cpp
@@ -23,9 +23,8 @@
  */
 #include "arm_compute/runtime/NEON/functions/NETile.h"
 
-#include "src/core/NEON/kernels/NETileKernel.h"
-
 #include "src/common/utils/Log.h"
+#include "src/core/NEON/kernels/NETileKernel.h"
 
 namespace arm_compute
 {
diff --git a/src/runtime/NEON/functions/NETranspose.cpp b/src/runtime/NEON/functions/NETranspose.cpp
index 78c7ea2..0144a85 100644
--- a/src/runtime/NEON/functions/NETranspose.cpp
+++ b/src/runtime/NEON/functions/NETranspose.cpp
@@ -24,6 +24,7 @@
 #include "arm_compute/runtime/NEON/functions/NETranspose.h"
 
 #include "arm_compute/core/Validate.h"
+
 #include "src/common/utils/Log.h"
 #include "src/cpu/operators/CpuTranspose.h"
 
@@ -31,13 +32,12 @@
 {
 struct NETranspose::Impl
 {
-    const ITensor                     *src{ nullptr };
-    ITensor                           *dst{ nullptr };
-    std::unique_ptr<cpu::CpuTranspose> op{ nullptr };
+    const ITensor                     *src{nullptr};
+    ITensor                           *dst{nullptr};
+    std::unique_ptr<cpu::CpuTranspose> op{nullptr};
 };
 
-NETranspose::NETranspose()
-    : _impl(std::make_unique<Impl>())
+NETranspose::NETranspose() : _impl(std::make_unique<Impl>())
 {
 }
 
diff --git a/src/runtime/NEON/functions/NEUnstack.cpp b/src/runtime/NEON/functions/NEUnstack.cpp
index 0ffab5e..2f7ed2b 100644
--- a/src/runtime/NEON/functions/NEUnstack.cpp
+++ b/src/runtime/NEON/functions/NEUnstack.cpp
@@ -28,6 +28,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include "src/common/utils/Log.h"
 
 namespace arm_compute
@@ -39,13 +40,15 @@
     return wrap_around(axis, static_cast<int>(tensor->num_dimensions()));
 }
 
-inline void setup_slice_coordinates_and_mask(Coordinates &slice_start, int32_t &slice_end_mask, const unsigned int input_num_dimensions)
+inline void setup_slice_coordinates_and_mask(Coordinates       &slice_start,
+                                             int32_t           &slice_end_mask,
+                                             const unsigned int input_num_dimensions)
 {
     // Sets up coordinates to slice the input tensor: start coordinates to all 0s and the unstacking axis of both Start/End to slice just one 2d tensor at a time.
     Coordinates slice_end;
     slice_start.set_num_dimensions(input_num_dimensions);
     slice_end.set_num_dimensions(input_num_dimensions);
-    for(size_t k = 0; k < input_num_dimensions; ++k)
+    for (size_t k = 0; k < input_num_dimensions; ++k)
     {
         slice_start.set(k, 0);
         slice_end.set(k, -1);
@@ -55,19 +58,19 @@
 } // namespace
 
 NEUnstack::NEUnstack() // NOLINT
-    : _num_slices(0),
-      _strided_slice_vector()
+    : _num_slices(0), _strided_slice_vector()
 {
 }
 
 void NEUnstack::configure(const ITensor *input, const std::vector<ITensor *> &output_vector, int axis)
 {
     std::vector<ITensorInfo *> outputs_vector_info(output_vector.size());
-    std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(), [](ITensor * t)
-    {
-        ARM_COMPUTE_ERROR_ON_NULLPTR(t);
-        return t->info();
-    });
+    std::transform(output_vector.begin(), output_vector.end(), outputs_vector_info.begin(),
+                   [](ITensor *t)
+                   {
+                       ARM_COMPUTE_ERROR_ON_NULLPTR(t);
+                       return t->info();
+                   });
 
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_ERROR_THROW_ON(NEUnstack::validate(input->info(), outputs_vector_info, axis));
@@ -81,11 +84,12 @@
     Coordinates slice_start;
     int32_t     slice_end_mask;
     setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->info()->tensor_shape().num_dimensions());
-    for(unsigned int slice = 0; slice < _num_slices; ++slice)
+    for (unsigned int slice = 0; slice < _num_slices; ++slice)
     {
         // Adjusts start and end coordinates to take a 2D slice at a time
         slice_start.set(axis_u, slice);
-        _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << axis_u));
+        _strided_slice_vector[slice].configure(input, output_vector[slice], slice_start, Coordinates(), BiStrides(), 0,
+                                               slice_end_mask, (1 << axis_u));
     }
 }
 
@@ -102,18 +106,20 @@
 
     Coordinates slice_start;
     int32_t     slice_end_mask;
-    for(size_t k = 0; k < num_slices; ++k)
+    for (size_t k = 0; k < num_slices; ++k)
     {
         slice_start.set(wrap_axis(axis, input), k);
         setup_slice_coordinates_and_mask(slice_start, slice_end_mask, input->tensor_shape().num_dimensions());
-        ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(), BiStrides(), 0, slice_end_mask, (1 << wrap_axis(axis, input))));
+        ARM_COMPUTE_RETURN_ON_ERROR(NEStridedSlice::validate(input, output_vector[k], slice_start, Coordinates(),
+                                                             BiStrides(), 0, slice_end_mask,
+                                                             (1 << wrap_axis(axis, input))));
     }
     return Status{};
 }
 
 void NEUnstack::run()
 {
-    for(unsigned i = 0; i < _num_slices; ++i)
+    for (unsigned i = 0; i < _num_slices; ++i)
     {
         _strided_slice_vector[i].run();
     }
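The std::transform hunk above shows the new treatment of lambdas that cannot be inlined: the lambda starts on its own line, with the capture list, parameters and body indented as a block and the opening brace on its own line. A self-contained sketch (exact brace-wrapping options assumed):

    #include <algorithm>
    #include <vector>

    std::vector<int> doubled(const std::vector<int> &in)
    {
        std::vector<int> out(in.size());
        std::transform(in.begin(), in.end(), out.begin(),
                       [](int v)
                       {
                           // Multi-line body: brace wrapped, aligned under the capture list.
                           const int twice = v * 2;
                           return twice;
                       });
        return out;
    }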
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index a8eded2..8d77abc 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -26,15 +26,15 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/ITensorPack.h"
 #include "arm_compute/core/Utils.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/Validate.h"
+
 #include "src/core/CPP/Validate.h"
 #include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/NEON/kernels/convolution/common/utils.hpp"
 #include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
 #include "src/cpu/operators/CpuWinogradConv2d.h"
 
-#include "src/core/NEON/kernels/convolution/common/utils.hpp"
-
 namespace arm_compute
 {
 using namespace arm_compute::experimental;
@@ -42,14 +42,14 @@
 struct NEWinogradConvolutionLayer::Impl
 {
     MemoryGroup                             memory_group{};
-    std::unique_ptr<cpu::CpuWinogradConv2d> op{ nullptr };
+    std::unique_ptr<cpu::CpuWinogradConv2d> op{nullptr};
     ITensorPack                             run_pack{};
     ITensorPack                             prep_pack{};
     WorkspaceData<Tensor>                   workspace{};
     experimental::MemoryRequirements        aux_mem_req{};
-    const ITensor                          *original_weights{ nullptr };
-    bool                                    is_prepared{ false };
-    bool                                    is_activationlayer_enabled{ false };
+    const ITensor                          *original_weights{nullptr};
+    bool                                    is_prepared{false};
+    bool                                    is_activationlayer_enabled{false};
     DataLayout                              data_layout{};
 };
 
@@ -61,17 +61,24 @@
 
 NEWinogradConvolutionLayer::~NEWinogradConvolutionLayer() = default;
 
-void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
-                                           bool enable_fast_math)
+void NEWinogradConvolutionLayer::configure(const ITensor             *input,
+                                           const ITensor             *weights,
+                                           const ITensor             *biases,
+                                           ITensor                   *output,
+                                           const PadStrideInfo       &conv_info,
+                                           const ActivationLayerInfo &act_info,
+                                           bool                       enable_fast_math)
 {
     _impl->original_weights = weights;
     _impl->op               = std::make_unique<cpu::CpuWinogradConv2d>();
-    _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(), conv_info, act_info, enable_fast_math);
+    _impl->op->configure(input->info(), weights->info(), biases != nullptr ? biases->info() : nullptr, output->info(),
+                         conv_info, act_info, enable_fast_math);
 
     _impl->aux_mem_req = _impl->op->workspace();
-    _impl->run_pack    = { { ACL_SRC_0, input }, { ACL_SRC_1, weights }, { ACL_SRC_2, biases }, { ACL_DST, output } };
-    _impl->prep_pack   = { { ACL_SRC_1, weights }, { ACL_SRC_2, biases } };
-    _impl->workspace   = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
+    _impl->run_pack    = {{ACL_SRC_0, input}, {ACL_SRC_1, weights}, {ACL_SRC_2, biases}, {ACL_DST, output}};
+    _impl->prep_pack   = {{ACL_SRC_1, weights}, {ACL_SRC_2, biases}};
+    _impl->workspace =
+        manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack);
 }
 
 void NEWinogradConvolutionLayer::run()
@@ -82,15 +89,20 @@
     _impl->op->run(_impl->run_pack);
 }
 
-Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
-                                            const ActivationLayerInfo &act_info, bool enable_fast_math)
+Status NEWinogradConvolutionLayer::validate(const ITensorInfo         *input,
+                                            const ITensorInfo         *weights,
+                                            const ITensorInfo         *biases,
+                                            const ITensorInfo         *output,
+                                            const PadStrideInfo       &conv_info,
+                                            const ActivationLayerInfo &act_info,
+                                            bool                       enable_fast_math)
 {
     return cpu::CpuWinogradConv2d::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math);
 }
 
 void NEWinogradConvolutionLayer::prepare()
 {
-    if(!_impl->is_prepared)
+    if (!_impl->is_prepared)
     {
         _impl->op->prepare(_impl->prep_pack);
         _impl->original_weights->mark_as_unused();
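
For context on the reflowed configure() above: the function-level layer keeps a pimpl holding a cpu operator plus two tensor packs, one used on every run (run_pack) and one used once to prepare constant inputs (prep_pack). Below is a minimal, self-contained sketch of that pattern; 'Tensor', 'Pack', 'Operator' and the slot ids are simplified stand-ins for illustration, not the library's ITensor/ITensorPack API.

    // Illustrative sketch of the run_pack/prep_pack operator pattern.
    // All names here are hypothetical stand-ins, not Compute Library types.
    #include <iostream>
    #include <map>

    using Tensor = float;
    enum Slot { SRC0, SRC1, DST };
    using Pack = std::map<Slot, Tensor *>;

    struct Operator
    {
        bool weights_ready{false};
        // One-off work on constant inputs (e.g. weight transforms)
        void prepare(const Pack &prep) { weights_ready = prep.count(SRC1) != 0; }
        // Per-invocation work on the full pack
        void run(const Pack &pack) { *pack.at(DST) = *pack.at(SRC0) + *pack.at(SRC1); }
    };

    int main()
    {
        Tensor in = 3.f, w = 4.f, out = 0.f;
        Pack run_pack  = {{SRC0, &in}, {SRC1, &w}, {DST, &out}};
        Pack prep_pack = {{SRC1, &w}};

        Operator op;
        op.prepare(prep_pack); // mirrors prepare(): transform constants once
        op.run(run_pack);      // mirrors run(): execute on the full pack
        std::cout << out << '\n'; // prints 7
    }
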
diff --git a/src/runtime/OMP/OMPScheduler.cpp b/src/runtime/OMP/OMPScheduler.cpp
index b0a5532..d4d6193 100644
--- a/src/runtime/OMP/OMPScheduler.cpp
+++ b/src/runtime/OMP/OMPScheduler.cpp
@@ -27,6 +27,7 @@
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/Utils.h"
+
 #include <omp.h>
 
 namespace arm_compute
@@ -63,7 +64,7 @@
     const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
     const unsigned int num_threads    = std::min(num_iterations, _num_threads);
 
-    if(!kernel->is_parallelisable() || num_threads == 1)
+    if (!kernel->is_parallelisable() || num_threads == 1)
     {
         ThreadInfo info;
         info.cpu_info = &cpu_info();
@@ -73,10 +74,10 @@
     {
         const unsigned int                num_windows = num_threads;
         std::vector<IScheduler::Workload> workloads(num_windows);
-        for(unsigned int t = 0; t < num_windows; t++)
+        for (unsigned int t = 0; t < num_windows; t++)
         {
             //Capture 't' by copy, all the other variables by reference:
-            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo & info)
+            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel, &tensors](const ThreadInfo &info)
             {
                 Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
                 win.validate();
@@ -92,7 +93,7 @@
     const unsigned int amount_of_work     = static_cast<unsigned int>(workloads.size());
     const unsigned int num_threads_to_use = std::min(_num_threads, amount_of_work);
 
-    if(num_threads_to_use < 1)
+    if (num_threads_to_use < 1)
     {
         return;
     }
@@ -100,8 +101,9 @@
     ThreadInfo info;
     info.cpu_info    = &cpu_info();
     info.num_threads = num_threads_to_use;
-    #pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) schedule(static, 1)
-    for(unsigned int wid = 0; wid < amount_of_work; ++wid)
+#pragma omp parallel for firstprivate(info) num_threads(num_threads_to_use) default(shared) proc_bind(close) \
+    schedule(static, 1)
+    for (unsigned int wid = 0; wid < amount_of_work; ++wid)
     {
         const int tid = omp_get_thread_num();
 
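
As an aside on the #pragma reflow above: the scheduler builds one workload lambda per window split and fans them out with an OpenMP parallel-for, one iteration per workload. A stripped-down, self-contained sketch of that dispatch follows; ThreadInfo and Workload are simplified stand-ins, not the library's types.

    // Illustrative sketch of the OMPScheduler dispatch pattern; simplified types.
    #include <cstdio>
    #include <functional>
    #include <vector>
    #include <omp.h>

    struct ThreadInfo { int thread_id{0}; int num_threads{1}; };
    using Workload = std::function<void(const ThreadInfo &)>;

    int main()
    {
        const unsigned num_windows = 4;
        std::vector<Workload> workloads(num_windows);
        for (unsigned t = 0; t < num_windows; ++t)
        {
            // Capture 't' by copy so each workload owns its window index
            workloads[t] = [t](const ThreadInfo &info)
            { std::printf("window %u on thread %d\n", t, info.thread_id); };
        }

        ThreadInfo info;
        info.num_threads = static_cast<int>(num_windows);
    #pragma omp parallel for firstprivate(info) num_threads(num_windows) schedule(static, 1)
        for (unsigned wid = 0; wid < num_windows; ++wid)
        {
            info.thread_id = omp_get_thread_num(); // each thread labels its own copy
            workloads[wid](info);
        }
    }
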
diff --git a/src/runtime/OffsetLifetimeManager.cpp b/src/runtime/OffsetLifetimeManager.cpp
index a47fa18..d746f61 100644
--- a/src/runtime/OffsetLifetimeManager.cpp
+++ b/src/runtime/OffsetLifetimeManager.cpp
@@ -43,8 +43,7 @@
     return (remainder != 0U) ? offset + (alignment - remainder) : offset;
 }
 } // namespace
-OffsetLifetimeManager::OffsetLifetimeManager()
-    : _blob(0)
+OffsetLifetimeManager::OffsetLifetimeManager() : _blob(0)
 {
 }
 
@@ -71,21 +70,22 @@
 
     // Update blob size
     size_t max_aggregated_size = 0;
-    std::for_each(std::begin(_free_blobs), std::end(_free_blobs), [&](const Blob & b)
-    {
-        max_aggregated_size += b.max_size;
-        _blob.alignment = std::max(_blob.alignment, b.max_alignment);
-    });
+    std::for_each(std::begin(_free_blobs), std::end(_free_blobs),
+                  [&](const Blob &b)
+                  {
+                      max_aggregated_size += b.max_size;
+                      _blob.alignment = std::max(_blob.alignment, b.max_alignment);
+                  });
     max_aggregated_size += _free_blobs.size() * _blob.alignment;
     _blob.owners = std::max(_blob.owners, _free_blobs.size());
     _blob.size   = std::max(_blob.size, max_aggregated_size);
 
     // Calculate group mappings
-    auto &group_mappings = _active_group->mappings();
+    auto  &group_mappings = _active_group->mappings();
     size_t offset         = 0;
-    for(auto &free_blob : _free_blobs)
+    for (auto &free_blob : _free_blobs)
     {
-        for(auto &bound_element_id : free_blob.bound_elements)
+        for (auto &bound_element_id : free_blob.bound_elements)
         {
             ARM_COMPUTE_ERROR_ON(_active_elements.find(bound_element_id) == std::end(_active_elements));
             Element &bound_element               = _active_elements[bound_element_id];
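
For readers of the truncated hunk above: OffsetLifetimeManager sizes one backing blob as the sum of per-blob maxima plus worst-case alignment padding, then assigns aligned offsets inside it. The following is an illustrative re-statement of that accounting with a simplified Blob in place of the library's internal type; align_offset follows the helper shown at the top of this file's diff.

    // Illustrative offset-pool accounting; 'Blob' is a simplified stand-in.
    #include <algorithm>
    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct Blob { size_t max_size; size_t max_alignment; };

    size_t align_offset(size_t offset, size_t alignment)
    {
        const size_t remainder = offset % alignment;
        return (remainder != 0U) ? offset + (alignment - remainder) : offset;
    }

    int main()
    {
        std::vector<Blob> free_blobs = {{100, 16}, {30, 64}, {200, 32}};

        size_t max_aggregated_size = 0;
        size_t blob_alignment      = 1;
        for (const Blob &b : free_blobs)
        {
            max_aggregated_size += b.max_size;
            blob_alignment = std::max(blob_alignment, b.max_alignment);
        }
        // Reserve worst-case padding: one alignment gap per blob
        max_aggregated_size += free_blobs.size() * blob_alignment;

        // Hand out aligned offsets inside the single backing allocation
        size_t offset = 0;
        for (const Blob &b : free_blobs)
        {
            std::cout << "blob of " << b.max_size << " bytes at offset " << offset << '\n';
            offset = align_offset(offset + b.max_size, blob_alignment);
        }
        std::cout << "total blob size: " << max_aggregated_size << " bytes\n";
    }
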
diff --git a/src/runtime/OffsetMemoryPool.cpp b/src/runtime/OffsetMemoryPool.cpp
index ffedf55..8f3c1a8 100644
--- a/src/runtime/OffsetMemoryPool.cpp
+++ b/src/runtime/OffsetMemoryPool.cpp
@@ -21,8 +21,6 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include <algorithm>
-
 #include "arm_compute/runtime/OffsetMemoryPool.h"
 
 #include "arm_compute/core/Error.h"
@@ -31,6 +29,8 @@
 #include "arm_compute/runtime/MemoryRegion.h"
 #include "arm_compute/runtime/Types.h"
 
+#include <algorithm>
+
 namespace arm_compute
 {
 OffsetMemoryPool::OffsetMemoryPool(IAllocator *allocator, BlobInfo blob_info)
@@ -50,7 +50,7 @@
     ARM_COMPUTE_ERROR_ON(_blob == nullptr);
 
     // Set memory to handlers
-    for(auto &handle : handles)
+    for (auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
         handle.first->set_owned_region(_blob->extract_subregion(handle.second, _blob_info.size - handle.second));
@@ -59,7 +59,7 @@
 
 void OffsetMemoryPool::release(MemoryMappings &handles)
 {
-    for(auto &handle : handles)
+    for (auto &handle : handles)
     {
         ARM_COMPUTE_ERROR_ON(handle.first == nullptr);
         handle.first->set_region(nullptr);
diff --git a/src/runtime/OperatorTensor.cpp b/src/runtime/OperatorTensor.cpp
index a8ad53d..19415b3 100644
--- a/src/runtime/OperatorTensor.cpp
+++ b/src/runtime/OperatorTensor.cpp
@@ -22,6 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/runtime/OperatorTensor.h"
+
 #include "arm_compute/runtime/MemoryRegion.h"
 
 #include "support/Cast.h"
@@ -47,7 +48,7 @@
 
 uint8_t *OperatorTensor::buffer() const
 {
-    switch(_mem_type)
+    switch (_mem_type)
     {
         case MemoryType::CPU:
             return (uint8_t *)utils::cast::polymorphic_downcast<MemoryRegion *>(_memory->region())->buffer();
diff --git a/src/runtime/PoolManager.cpp b/src/runtime/PoolManager.cpp
index 87376a7..7fb9bd8 100644
--- a/src/runtime/PoolManager.cpp
+++ b/src/runtime/PoolManager.cpp
@@ -31,8 +31,7 @@
 
 using namespace arm_compute;
 
-PoolManager::PoolManager()
-    : _free_pools(), _occupied_pools(), _sem(), _mtx()
+PoolManager::PoolManager() : _free_pools(), _occupied_pools(), _sem(), _mtx()
 {
 }
 
@@ -52,10 +51,8 @@
     ARM_COMPUTE_ERROR_ON_MSG(_free_pools.empty() && _occupied_pools.empty(), "Haven't setup any pools!");
 
     arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
-    auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools), [pool](const std::unique_ptr<IMemoryPool> &pool_it)
-    {
-        return pool_it.get() == pool;
-    });
+    auto it = std::find_if(std::begin(_occupied_pools), std::end(_occupied_pools),
+                           [pool](const std::unique_ptr<IMemoryPool> &pool_it) { return pool_it.get() == pool; });
     ARM_COMPUTE_ERROR_ON_MSG(it == std::end(_occupied_pools), "Pool to be unlocked couldn't be found!");
     _free_pools.splice(std::begin(_free_pools), _occupied_pools, it);
     _sem->signal();
@@ -78,7 +75,7 @@
     arm_compute::lock_guard<arm_compute::Mutex> lock(_mtx);
     ARM_COMPUTE_ERROR_ON_MSG(!_occupied_pools.empty(), "All pools should be free in order to release one!");
 
-    if(!_free_pools.empty())
+    if (!_free_pools.empty())
     {
         std::unique_ptr<IMemoryPool> pool = std::move(_free_pools.front());
         ARM_COMPUTE_ERROR_ON(_free_pools.front() != nullptr);
diff --git a/src/runtime/RuntimeContext.cpp b/src/runtime/RuntimeContext.cpp
index d1dea06..1de8d2a 100644
--- a/src/runtime/RuntimeContext.cpp
+++ b/src/runtime/RuntimeContext.cpp
@@ -28,8 +28,7 @@
 
 namespace arm_compute
 {
-RuntimeContext::RuntimeContext()
-    : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get())
+RuntimeContext::RuntimeContext() : _owned_scheduler(SchedulerFactory::create()), _scheduler(_owned_scheduler.get())
 {
 }
 
diff --git a/src/runtime/Scheduler.cpp b/src/runtime/Scheduler.cpp
index 0713b9a..e52fb59 100644
--- a/src/runtime/Scheduler.cpp
+++ b/src/runtime/Scheduler.cpp
@@ -76,7 +76,7 @@
 
 bool Scheduler::is_available(Type t)
 {
-    if(t == Type::CUSTOM)
+    if (t == Type::CUSTOM)
     {
         return _custom_scheduler != nullptr;
     }
@@ -93,11 +93,12 @@
 
 IScheduler &Scheduler::get()
 {
-    if(_scheduler_type == Type::CUSTOM)
+    if (_scheduler_type == Type::CUSTOM)
     {
-        if(_custom_scheduler == nullptr)
+        if (_custom_scheduler == nullptr)
         {
-            ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) before Scheduler::get()");
+            ARM_COMPUTE_ERROR("No custom scheduler has been setup. Call set(std::shared_ptr<IScheduler> &scheduler) "
+                              "before Scheduler::get()");
         }
         else
         {
@@ -106,13 +107,13 @@
     }
     else
     {
-        if(_schedulers.empty())
+        if (_schedulers.empty())
         {
             _schedulers = init();
         }
 
         auto it = _schedulers.find(_scheduler_type);
-        if(it != _schedulers.end())
+        if (it != _schedulers.end())
         {
             return *it->second;
         }
diff --git a/src/runtime/SchedulerFactory.cpp b/src/runtime/SchedulerFactory.cpp
index cc21d62..4fb08d7 100644
--- a/src/runtime/SchedulerFactory.cpp
+++ b/src/runtime/SchedulerFactory.cpp
@@ -48,7 +48,7 @@
 
 std::unique_ptr<IScheduler> SchedulerFactory::create(Type type)
 {
-    switch(type)
+    switch (type)
     {
         case Type::ST:
         {
diff --git a/src/runtime/SchedulerUtils.cpp b/src/runtime/SchedulerUtils.cpp
index 6f9a32c..74ee539 100644
--- a/src/runtime/SchedulerUtils.cpp
+++ b/src/runtime/SchedulerUtils.cpp
@@ -47,35 +47,34 @@
     double ratio = m / static_cast<double>(n);
 
     // nt = sqrt(max_threads * (m / n) )
-    const unsigned adjusted = std::round(
-                                  std::sqrt(max_threads * ratio));
+    const unsigned adjusted = std::round(std::sqrt(max_threads * ratio));
 
     //find the nearest factor of max_threads
-    for(unsigned i = 0; i != adjusted; ++i)
+    for (unsigned i = 0; i != adjusted; ++i)
     {
         //try down
         const unsigned adj_down = adjusted - i;
-        if(max_threads % adj_down == 0)
+        if (max_threads % adj_down == 0)
         {
-            return { adj_down, max_threads / adj_down };
+            return {adj_down, max_threads / adj_down};
         }
 
         //try up
         const unsigned adj_up = adjusted + i;
-        if(max_threads % adj_up == 0)
+        if (max_threads % adj_up == 0)
         {
-            return { adj_up, max_threads / adj_up };
+            return {adj_up, max_threads / adj_up};
         }
     }
 
     //we didn't find anything so let's bail out with maxes biased to the largest dimension
-    if(m > n)
+    if (m > n)
     {
-        return { std::min<unsigned>(m, max_threads), 1 };
+        return {std::min<unsigned>(m, max_threads), 1};
     }
     else
     {
-        return { 1, std::min<unsigned>(n, max_threads) };
+        return {1, std::min<unsigned>(n, max_threads)};
     }
 }
 #endif /* #ifndef BARE_METAL */
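
A worked example may help with the heuristic reformatted above: for max_threads = 8 and an m:n ratio of 2, adjusted = round(sqrt(8 * 2)) = 4, and since 8 % 4 == 0 the split is {4, 2}. The sketch below restates that search as a standalone program, mirroring the logic shown in the hunk (illustrative only).

    // Illustrative re-statement of the nearest-factor split shown above.
    #include <algorithm>
    #include <cmath>
    #include <cstdio>
    #include <utility>

    std::pair<unsigned, unsigned> split_2d(unsigned max_threads, size_t m, size_t n)
    {
        // nt = sqrt(max_threads * (m / n))
        const double   ratio    = m / static_cast<double>(n);
        const unsigned adjusted = static_cast<unsigned>(std::round(std::sqrt(max_threads * ratio)));

        // Walk outwards from 'adjusted' until a factor of max_threads is found
        for (unsigned i = 0; i != adjusted; ++i)
        {
            const unsigned adj_down = adjusted - i;
            if (max_threads % adj_down == 0)
                return {adj_down, max_threads / adj_down};

            const unsigned adj_up = adjusted + i;
            if (max_threads % adj_up == 0)
                return {adj_up, max_threads / adj_up};
        }
        // No factor found: bias the split towards the larger dimension
        return (m > n) ? std::pair<unsigned, unsigned>{std::min<unsigned>(m, max_threads), 1}
                       : std::pair<unsigned, unsigned>{1, std::min<unsigned>(n, max_threads)};
    }

    int main()
    {
        auto s = split_2d(8, 600, 300); // ratio 2 -> adjusted 4 -> {4, 2}
        std::printf("{%u, %u}\n", s.first, s.second);
    }
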
diff --git a/src/runtime/SubTensor.cpp b/src/runtime/SubTensor.cpp
index ae16c8b..f87256a 100644
--- a/src/runtime/SubTensor.cpp
+++ b/src/runtime/SubTensor.cpp
@@ -27,8 +27,7 @@
 
 using namespace arm_compute;
 
-SubTensor::SubTensor()
-    : _parent(nullptr), _info()
+SubTensor::SubTensor() : _parent(nullptr), _info()
 {
 }
 
diff --git a/src/runtime/Tensor.cpp b/src/runtime/Tensor.cpp
index 6dcef9f..f17e323 100644
--- a/src/runtime/Tensor.cpp
+++ b/src/runtime/Tensor.cpp
@@ -25,8 +25,7 @@
 
 namespace arm_compute
 {
-Tensor::Tensor(IRuntimeContext *)
-    : _allocator(this)
+Tensor::Tensor(IRuntimeContext *) : _allocator(this)
 {
 }
 
diff --git a/src/runtime/TensorAllocator.cpp b/src/runtime/TensorAllocator.cpp
index 4ae27c5..372852b 100644
--- a/src/runtime/TensorAllocator.cpp
+++ b/src/runtime/TensorAllocator.cpp
@@ -43,13 +43,13 @@
     const size_t       parent_dims  = parent_info.num_dimensions();
     const size_t       child_dims   = child_info.num_dimensions();
 
-    if(child_dims <= parent_dims)
+    if (child_dims <= parent_dims)
     {
-        for(size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions)
+        for (size_t num_dimensions = child_dims; num_dimensions > 0; --num_dimensions)
         {
             const size_t child_dim_size = coords[num_dimensions - 1] + child_shape[num_dimensions - 1];
 
-            if((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1]))
+            if ((coords[num_dimensions - 1] < 0) || (child_dim_size > parent_shape[num_dimensions - 1]))
             {
                 is_valid = false;
                 break;
@@ -65,8 +65,7 @@
 }
 } // namespace
 
-TensorAllocator::TensorAllocator(IMemoryManageable *owner)
-    : _owner(owner), _associated_memory_group(nullptr), _memory()
+TensorAllocator::TensorAllocator(IMemoryManageable *owner) : _owner(owner), _associated_memory_group(nullptr), _memory()
 {
 }
 
@@ -88,7 +87,7 @@
 
 TensorAllocator &TensorAllocator::operator=(TensorAllocator &&o) noexcept
 {
-    if(&o != this)
+    if (&o != this)
     {
         _owner   = o._owner;
         o._owner = nullptr;
@@ -117,8 +116,10 @@
     _memory = Memory(allocator._memory.region());
 
     // Init tensor info with new dimensions
-    size_t total_size = parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes();
-    sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(), parent_info.offset_element_in_bytes(coords), total_size);
+    size_t total_size =
+        parent_info.offset_element_in_bytes(coords) + sub_info.total_size() - sub_info.offset_first_element_in_bytes();
+    sub_info.init(sub_info.tensor_shape(), sub_info.format(), parent_info.strides_in_bytes(),
+                  parent_info.offset_element_in_bytes(coords), total_size);
 
     // Set TensorInfo
     init(sub_info);
@@ -133,7 +134,7 @@
 {
     // Align to 64-byte boundaries by default if alignment is not specified
     const size_t alignment_to_use = (alignment() != 0) ? alignment() : 64;
-    if(_associated_memory_group == nullptr)
+    if (_associated_memory_group == nullptr)
     {
         _memory.set_owned_region(std::make_unique<MemoryRegion>(info().total_size(), alignment_to_use));
     }
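
On the hunk above: when no memory group is associated, the allocator backs the tensor with its own region aligned to 64 bytes by default, a cache-line-friendly choice. A minimal sketch of such an aligned allocation follows, using standard C++17 facilities rather than the library's MemoryRegion (names and sizes here are illustrative).

    // Illustrative 64-byte-aligned allocation; not the library's MemoryRegion.
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    int main()
    {
        const size_t alignment = 64;   // default alignment used above
        const size_t size      = 1000; // requested tensor size (example)
        // std::aligned_alloc requires size to be a multiple of alignment
        const size_t padded = (size + alignment - 1) / alignment * alignment;

        void *region = std::aligned_alloc(alignment, padded);
        if (region == nullptr)
            return 1;

        std::printf("allocated %zu bytes, base %% 64 == %zu\n", padded,
                    static_cast<size_t>(reinterpret_cast<std::uintptr_t>(region) % alignment));
        std::free(region);
    }
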
diff --git a/src/runtime/Utils.cpp b/src/runtime/Utils.cpp
index 15e9d43..a7f7b5f 100644
--- a/src/runtime/Utils.cpp
+++ b/src/runtime/Utils.cpp
@@ -41,20 +41,17 @@
 
 const std::string &string_from_scheduler_type(Scheduler::Type t)
 {
-    static std::map<Scheduler::Type, const std::string> scheduler_type_map =
-    {
-        { Scheduler::Type::ST, "Single Thread" },
-        { Scheduler::Type::CPP, "C++11 Threads" },
-        { Scheduler::Type::OMP, "OpenMP Threads" },
-        { Scheduler::Type::CUSTOM, "Custom" }
-    };
+    static std::map<Scheduler::Type, const std::string> scheduler_type_map = {{Scheduler::Type::ST, "Single Thread"},
+                                                                              {Scheduler::Type::CPP, "C++11 Threads"},
+                                                                              {Scheduler::Type::OMP, "OpenMP Threads"},
+                                                                              {Scheduler::Type::CUSTOM, "Custom"}};
 
     return scheduler_type_map[t];
 }
 
 void schedule_kernel_on_ctx(IRuntimeContext *ctx, ICPPKernel *kernel, const IScheduler::Hints &hints)
 {
-    if(ctx)
+    if (ctx)
     {
         ARM_COMPUTE_ERROR_ON(ctx->scheduler() == nullptr);
         ctx->scheduler()->schedule(kernel, hints);
@@ -68,7 +65,7 @@
 unsigned int calculate_number_of_stages_only_x_axis(size_t input_x_dimension, unsigned int axis)
 {
     // We need only 1 stage for all axis except x-axis
-    if(axis != 0)
+    if (axis != 0)
     {
         return 1;
     }
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp
index 1bfb812..aba3287 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include <utility>
 
 namespace arm_compute
@@ -37,25 +38,27 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 
-ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu)
-    : IClDirectConvKernelConfig(gpu)
+ClDirectConvDefaultConfigBifrost::ClDirectConvDefaultConfigBifrost(GPUTarget gpu) : IClDirectConvKernelConfig(gpu)
 {
 }
 
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure(const ITensorInfo   *src,
+                                                                        const ITensorInfo   *wei,
+                                                                        const PadStrideInfo &conv_info)
 {
-    using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)(const ITensorInfo * src, const ITensorInfo * wei, const PadStrideInfo & conv_info);
+    using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigBifrost::*)(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
 
-    ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClDirectConvDefaultConfigBifrost::configure_G71_f32,
-                                                                          &ClDirectConvDefaultConfigBifrost::configure_G71_f16,
-                                                                          &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
+    ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(
+        &ClDirectConvDefaultConfigBifrost::configure_G71_f32, &ClDirectConvDefaultConfigBifrost::configure_G71_f16,
+        &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
 
-    ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_default(&ClDirectConvDefaultConfigBifrost::configure_default_f32,
-                                                                              &ClDirectConvDefaultConfigBifrost::configure_default_f16,
-                                                                              &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
+    ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_default(
+        &ClDirectConvDefaultConfigBifrost::configure_default_f32,
+        &ClDirectConvDefaultConfigBifrost::configure_default_f16, &ClDirectConvDefaultConfigBifrost::configure_G71_u8);
 
     ConfigurationFunctionExecutorPtr func = nullptr;
-    switch(_target)
+    switch (_target)
     {
         case GPUTarget::G71:
             func = configs_G71.get_function(src->data_type());
@@ -69,18 +72,20 @@
     return (this->*func)(src, wei, conv_info);
 }
 
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f32(const ITensorInfo   *src,
+                                                                                const ITensorInfo   *wei,
+                                                                                const PadStrideInfo &conv_info)
 {
     DirectConvComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         // Get the output shape
         TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
 
         desc.n0 = 4;
 
-        if(output_shape[0] > 16)
+        if (output_shape[0] > 16)
         {
             desc.m0 = 2;
         }
@@ -93,18 +98,20 @@
     return desc;
 }
 
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_f16(const ITensorInfo   *src,
+                                                                                const ITensorInfo   *wei,
+                                                                                const PadStrideInfo &conv_info)
 {
     DirectConvComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         // Get the output shape
         TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
 
         desc.n0 = 4;
 
-        if(output_shape[0] > 16)
+        if (output_shape[0] > 16)
         {
             desc.m0 = 4;
         }
@@ -117,18 +124,20 @@
     return desc;
 }
 
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_G71_u8(const ITensorInfo   *src,
+                                                                               const ITensorInfo   *wei,
+                                                                               const PadStrideInfo &conv_info)
 {
     DirectConvComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         // Get the output shape
         TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
 
         desc.n0 = 4;
 
-        if(output_shape[0] > 16)
+        if (output_shape[0] > 16)
         {
             desc.m0 = 4;
         }
@@ -141,18 +150,20 @@
     return desc;
 }
 
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f32(const ITensorInfo   *src,
+                                                                                    const ITensorInfo   *wei,
+                                                                                    const PadStrideInfo &conv_info)
 {
     DirectConvComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         // Get the output shape
         TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
 
         desc.n0 = 4;
 
-        if(output_shape[0] > 16)
+        if (output_shape[0] > 16)
         {
             desc.m0 = 2;
         }
@@ -165,18 +176,20 @@
     return desc;
 }
 
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigBifrost::configure_default_f16(const ITensorInfo   *src,
+                                                                                    const ITensorInfo   *wei,
+                                                                                    const PadStrideInfo &conv_info)
 {
     DirectConvComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         // Get the output shape
         TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
 
         desc.n0 = 4;
 
-        if(output_shape[0] > 16)
+        if (output_shape[0] > 16)
         {
             desc.m0 = 4;
         }
@@ -188,5 +201,5 @@
 
     return desc;
 }
-} // namespace opencl
+} // namespace cl_direct_conv
 } // namespace arm_compute
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h
index 6b60b2c..ed6a4c3 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigBifrost.h
@@ -41,15 +41,21 @@
     ClDirectConvDefaultConfigBifrost(GPUTarget gpu);
 
     // Inherited overridden method
-    DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+    DirectConvComputeKernelInfo
+    configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
 
 private:
-    DirectConvComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
-    DirectConvComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
-    DirectConvComputeKernelInfo configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
-    DirectConvComputeKernelInfo configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
-    DirectConvComputeKernelInfo configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo
+    configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo
+    configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo
+    configure_G71_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo
+    configure_default_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo
+    configure_default_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
 };
-} // namespace opencl
+} // namespace cl_direct_conv
 } // namespace arm_compute
 #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGBIFROST */
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp
index 8f2fd82..4b7666d 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.cpp
@@ -29,6 +29,7 @@
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
+
 #include <utility>
 
 namespace arm_compute
@@ -37,25 +38,27 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 
-ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu)
-    : IClDirectConvKernelConfig(gpu)
+ClDirectConvDefaultConfigValhall::ClDirectConvDefaultConfigValhall(GPUTarget gpu) : IClDirectConvKernelConfig(gpu)
 {
 }
 
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure(const ITensorInfo   *src,
+                                                                        const ITensorInfo   *wei,
+                                                                        const PadStrideInfo &conv_info)
 {
-    using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)(const ITensorInfo * src, const ITensorInfo * wei, const PadStrideInfo & conv_info);
+    using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClDirectConvDefaultConfigValhall::*)(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
 
-    ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClDirectConvDefaultConfigValhall::configure_G78_f32,
-                                                                          &ClDirectConvDefaultConfigValhall::configure_G78_f16,
-                                                                          &ClDirectConvDefaultConfigValhall::configure_G78_u8);
+    ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+        &ClDirectConvDefaultConfigValhall::configure_G78_f32, &ClDirectConvDefaultConfigValhall::configure_G78_f16,
+        &ClDirectConvDefaultConfigValhall::configure_G78_u8);
 
-    ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G57(&ClDirectConvDefaultConfigValhall::configure_G57_f32,
-                                                                          &ClDirectConvDefaultConfigValhall::configure_G57_f16,
-                                                                          &ClDirectConvDefaultConfigValhall::configure_G78_u8);
+    ClDirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G57(
+        &ClDirectConvDefaultConfigValhall::configure_G57_f32, &ClDirectConvDefaultConfigValhall::configure_G57_f16,
+        &ClDirectConvDefaultConfigValhall::configure_G78_u8);
 
     ConfigurationFunctionExecutorPtr func = nullptr;
-    switch(_target)
+    switch (_target)
     {
         case GPUTarget::G57:
             func = configs_G57.get_function(src->data_type());
@@ -70,15 +73,17 @@
     return (this->*func)(src, wei, conv_info);
 }
 
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f32(const ITensorInfo   *src,
+                                                                                const ITensorInfo   *wei,
+                                                                                const PadStrideInfo &conv_info)
 {
     DirectConvComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         // Get the output shape
-        const TensorShape wei_shape                  = wei->tensor_shape();
-        const TensorShape dst_shape                  = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+        const TensorShape wei_shape = wei->tensor_shape();
+        const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
         const bool        export_weights_to_cl_image = export_to_cl_image(wei);
 
         const int32_t ofm          = dst_shape[0];
@@ -87,11 +92,11 @@
 
         desc.export_weights_to_cl_image = export_weights_to_cl_image;
 
-        if(dst_shape[0] <= 4)
+        if (dst_shape[0] <= 4)
         {
-            if(is_pointwise)
+            if (is_pointwise)
             {
-                if(ofm == 4)
+                if (ofm == 4)
                 {
                     desc.m0 = 1;
                     desc.n0 = 4;
@@ -113,7 +118,7 @@
         }
         else
         {
-            if(m < 64)
+            if (m < 64)
             {
                 desc.m0 = 1;
                 desc.n0 = 1;
@@ -131,15 +136,17 @@
     return desc;
 }
 
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_f16(const ITensorInfo   *src,
+                                                                                const ITensorInfo   *wei,
+                                                                                const PadStrideInfo &conv_info)
 {
     DirectConvComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         // Get the output shape
-        const TensorShape wei_shape                  = wei->tensor_shape();
-        const TensorShape dst_shape                  = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+        const TensorShape wei_shape = wei->tensor_shape();
+        const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
         const bool        export_weights_to_cl_image = export_to_cl_image(wei);
 
         const int32_t ofm          = dst_shape[0];
@@ -149,15 +156,15 @@
 
         desc.export_weights_to_cl_image = export_weights_to_cl_image;
 
-        if(dst_shape[0] <= 4)
+        if (dst_shape[0] <= 4)
         {
             // k0 should be as large as possible. However, we should avoid
             // having left-over for loops that make the implementation slower.
-            if((k % 16) == 0)
+            if ((k % 16) == 0)
             {
                 desc.k0 = 16;
             }
-            else if((k % 8) == 0)
+            else if ((k % 8) == 0)
             {
                 desc.k0 = 8;
             }
@@ -166,9 +173,9 @@
                 desc.k0 = 4;
             }
 
-            if(is_pointwise)
+            if (is_pointwise)
             {
-                if(ofm == 4)
+                if (ofm == 4)
                 {
                     desc.m0 = 1;
                     desc.n0 = 4;
@@ -187,15 +194,15 @@
         }
         else
         {
-            if(m < 64)
+            if (m < 64)
             {
                 desc.m0 = 1;
                 desc.n0 = 1;
-                if((k % 16) == 0)
+                if ((k % 16) == 0)
                 {
                     desc.k0 = 16;
                 }
-                else if((k % 8) == 0)
+                else if ((k % 8) == 0)
                 {
                     desc.k0 = 8;
                 }
@@ -206,9 +213,9 @@
             }
             else
             {
-                if(ofm >= 16)
+                if (ofm >= 16)
                 {
-                    if(m / 6 > 24000)
+                    if (m / 6 > 24000)
                     {
                         desc.m0 = 6;
                     }
@@ -223,11 +230,11 @@
                 {
                     desc.m0 = 2;
                     desc.n0 = 8;
-                    if((k % 16) == 0)
+                    if ((k % 16) == 0)
                     {
                         desc.k0 = 16;
                     }
-                    else if((k % 8) == 0)
+                    else if ((k % 8) == 0)
                     {
                         desc.k0 = 8;
                     }
@@ -243,18 +250,20 @@
     return desc;
 }
 
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G78_u8(const ITensorInfo   *src,
+                                                                               const ITensorInfo   *wei,
+                                                                               const PadStrideInfo &conv_info)
 {
     DirectConvComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         // Get the output shape
         TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
 
         desc.n0 = 4;
 
-        if(output_shape[0] > 16)
+        if (output_shape[0] > 16)
         {
             desc.m0 = 4;
         }
@@ -267,15 +276,17 @@
     return desc;
 }
 
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f32(const ITensorInfo   *src,
+                                                                                const ITensorInfo   *wei,
+                                                                                const PadStrideInfo &conv_info)
 {
     DirectConvComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         // Get the output shape
-        const TensorShape wei_shape                  = wei->tensor_shape();
-        const TensorShape dst_shape                  = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+        const TensorShape wei_shape = wei->tensor_shape();
+        const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
         const bool        export_weights_to_cl_image = export_to_cl_image(wei);
 
         const int32_t m            = dst_shape[1] * dst_shape[2];
@@ -283,9 +294,9 @@
 
         desc.export_weights_to_cl_image = export_weights_to_cl_image;
 
-        if(dst_shape[0] <= 4)
+        if (dst_shape[0] <= 4)
         {
-            if(is_pointwise)
+            if (is_pointwise)
             {
                 desc.m0 = 1;
                 desc.n0 = 1;
@@ -300,9 +311,9 @@
         }
         else
         {
-            if(m < 64)
+            if (m < 64)
             {
-                if(m == 1)
+                if (m == 1)
                 {
                     desc.m0 = 1;
                     desc.n0 = 1;
@@ -327,15 +338,17 @@
     return desc;
 }
 
-DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClDirectConvDefaultConfigValhall::configure_G57_f16(const ITensorInfo   *src,
+                                                                                const ITensorInfo   *wei,
+                                                                                const PadStrideInfo &conv_info)
 {
     DirectConvComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         // Get the output shape
-        const TensorShape wei_shape                  = wei->tensor_shape();
-        const TensorShape dst_shape                  = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+        const TensorShape wei_shape = wei->tensor_shape();
+        const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
         const bool        export_weights_to_cl_image = export_to_cl_image(wei);
 
         const int32_t ofm          = dst_shape[0];
@@ -344,9 +357,9 @@
 
         desc.export_weights_to_cl_image = export_weights_to_cl_image;
 
-        if(dst_shape[0] <= 4)
+        if (dst_shape[0] <= 4)
         {
-            if(is_pointwise)
+            if (is_pointwise)
             {
                 desc.m0 = 2;
                 desc.n0 = 1;
@@ -361,9 +374,9 @@
         }
         else
         {
-            if(m < 64)
+            if (m < 64)
             {
-                if(m == 1)
+                if (m == 1)
                 {
                     desc.m0 = 1;
                     desc.n0 = 1;
@@ -378,7 +391,7 @@
             }
             else
             {
-                if(ofm > 16)
+                if (ofm > 16)
                 {
                     desc.m0 = 4;
                     desc.n0 = 8;
@@ -396,5 +409,5 @@
 
     return desc;
 }
-} // namespace opencl
+} // namespace cl_direct_conv
 } // namespace arm_compute
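
One heuristic in the hunks above deserves a gloss: for f16 on G78, k0 is chosen as the largest of 16, 8, 4 that divides the channel count k, so the inner loop runs without leftover iterations. A tiny illustrative helper restating that pick (pick_k0 is a hypothetical name, not library code):

    // Illustrative: pick the largest vector width in {16, 8, 4} dividing k,
    // as in configure_G78_f16 above, to avoid leftover loop iterations.
    #include <cstdio>

    unsigned pick_k0(unsigned k)
    {
        if ((k % 16) == 0)
            return 16;
        if ((k % 8) == 0)
            return 8;
        return 4; // fallback; may leave a remainder, as the source accepts
    }

    int main()
    {
        std::printf("k=48 -> k0=%u, k=24 -> k0=%u, k=6 -> k0=%u\n",
                    pick_k0(48), pick_k0(24), pick_k0(6)); // 16, 8, 4
    }
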
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h
index f9d5c52..efd879a 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvDefaultConfigValhall.h
@@ -41,15 +41,21 @@
     ClDirectConvDefaultConfigValhall(GPUTarget gpu);
 
     // Inherited overridden method
-    DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+    DirectConvComputeKernelInfo
+    configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
 
 private:
-    DirectConvComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
-    DirectConvComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
-    DirectConvComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
-    DirectConvComputeKernelInfo configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
-    DirectConvComputeKernelInfo configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo
+    configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo
+    configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo
+    configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo
+    configure_G57_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo
+    configure_G57_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
 };
-} // namespace opencl
+} // namespace cl_direct_conv
 } // namespace arm_compute
 #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVDEFAULTCONFIGVALHALL */
diff --git a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h
index 232167f..2c2509f 100644
--- a/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h
+++ b/src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h
@@ -46,7 +46,7 @@
      */
     static std::unique_ptr<IClDirectConvKernelConfig> create(GPUTarget gpu)
     {
-        switch(get_arch_from_target(gpu))
+        switch (get_arch_from_target(gpu))
         {
             case GPUTarget::MIDGARD:
                 return std::make_unique<ClDirectConvDefaultConfigBifrost>(GPUTarget::G71);
@@ -59,6 +59,6 @@
         }
     }
 };
-} // namespace opencl
+} // namespace cl_direct_conv
 } // namespace arm_compute
 #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_CLDIRECTCONVKERNELCONFIG */
diff --git a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
index 6104d73..e5b270c 100644
--- a/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
+++ b/src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/core/common/Macros.h"
 
 namespace arm_compute
@@ -52,8 +53,7 @@
      * @param[in] func_int8 Function to call for direct convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
      *
      */
-    ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8)
-        : _configs{ func_f32, func_f16, func_int8 }
+    ClDirectConvConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
     {
     }
 
@@ -65,7 +65,7 @@
      */
     T get_function(DataType data_type)
     {
-        switch(data_type)
+        switch (data_type)
         {
             case DataType::F32:
                 return _configs.at(DT_F32);
@@ -92,8 +92,7 @@
      *
      * @param[in] arch GPU target
      */
-    IClDirectConvKernelConfig(GPUTarget arch)
-        : _target(arch)
+    IClDirectConvKernelConfig(GPUTarget arch) : _target(arch)
     {
     }
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDirectConvKernelConfig);
@@ -105,11 +104,12 @@
      * @param[in] wei       Weights tensor
      * @param[in] conv_info Convolution info
      */
-    virtual DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
+    virtual DirectConvComputeKernelInfo
+    configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
 
 protected:
     GPUTarget _target;
 };
-} // namespace opencl
+} // namespace cl_direct_conv
 } // namespace arm_compute
 #endif /* SRC_RUNTIME_HEURISTICS_DIRECT_CONV_ICLDIRECTCONVKERNELCONFIG */
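
Since the reflowed declarations above center on a pointer-to-member-function type, a small self-contained sketch of that dispatch pattern may help; the names below are hypothetical, while the real ClDirectConvConfigArray maps F32/F16/Int8 entries to per-target member functions in the same way.

    // Illustrative pointer-to-member dispatch, echoing ClDirectConvConfigArray.
    #include <array>
    #include <cstddef>
    #include <iostream>

    enum class DataType { F32, F16, Int8 };

    class Config
    {
    public:
        // The reflowed 'using' declarations above name exactly this kind of type:
        using Fn = int (Config::*)(int) const;

        int configure(DataType dt, int x) const
        {
            const Fn func = table_.at(static_cast<std::size_t>(dt));
            return (this->*func)(x); // same call syntax as in the hunks above
        }

    private:
        int f32(int x) const { return x * 4; }
        int f16(int x) const { return x * 2; }
        int i8(int x) const { return x; }

        std::array<Fn, 3> table_{&Config::f32, &Config::f16, &Config::i8};
    };

    int main()
    {
        Config cfg;
        std::cout << cfg.configure(DataType::F16, 10) << '\n'; // prints 20
    }
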
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
index 5311fdc..98ebf3e 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.cpp
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h"
-#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
@@ -30,28 +29,34 @@
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
 namespace arm_compute
 {
 namespace cl_dwc
 {
 namespace
 {
-DWCComputeKernelInfo configure_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                   unsigned int depth_multiplier, bool is_g71)
+DWCComputeKernelInfo configure_f32(const ITensorInfo   *src,
+                                   const ITensorInfo   *wei,
+                                   const PadStrideInfo &conv_info,
+                                   const Size2D        &dilation,
+                                   unsigned int         depth_multiplier,
+                                   bool                 is_g71)
 {
     DWCComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
-        const size_t idx_c          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
-        const size_t idx_w          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+        const size_t      idx_c     = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t      idx_w     = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
         const TensorShape wei_shape = wei->tensor_shape();
         const size_t      kernel_c  = wei_shape[idx_c];
         const size_t      kernel_w  = wei_shape[idx_w];
 
         desc.export_input_to_cl_image = false;
 
-        if(is_g71)
+        if (is_g71)
         {
             desc.export_weights_to_cl_image = false;
         }
@@ -60,17 +65,17 @@
             desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
         }
 
-        if(depth_multiplier == 1)
+        if (depth_multiplier == 1)
         {
             desc.n0 = 4;
         }
         else
         {
-            if((depth_multiplier % 4) == 0)
+            if ((depth_multiplier % 4) == 0)
             {
                 desc.n0 = 4;
             }
-            else if((depth_multiplier % 2) == 0)
+            else if ((depth_multiplier % 2) == 0)
             {
                 desc.n0 = 2;
             }
@@ -81,14 +86,15 @@
         }
 
         // Note: If we reduce n0, export to cl_image must be false
-        ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+        ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+                             (desc.export_weights_to_cl_image == true));
 
         desc.n0 = adjust_vec_size(desc.n0, kernel_c);
 
         // Set m0 only if stride_x == 1 and dilation_x == 1
-        if(conv_info.stride().first == 1 && dilation.x() == 1)
+        if (conv_info.stride().first == 1 && dilation.x() == 1)
         {
-            if((kernel_w >= 9) || (kernel_w == 1))
+            if ((kernel_w >= 9) || (kernel_w == 1))
             {
                 desc.m0 = 1;
             }
@@ -106,16 +112,20 @@
     return desc;
 }
 
-DWCComputeKernelInfo configure_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                   unsigned int depth_multiplier, bool is_g71)
+DWCComputeKernelInfo configure_f16(const ITensorInfo   *src,
+                                   const ITensorInfo   *wei,
+                                   const PadStrideInfo &conv_info,
+                                   const Size2D        &dilation,
+                                   unsigned int         depth_multiplier,
+                                   bool                 is_g71)
 {
     DWCComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         // Src and weights have the same dimension indices
-        const size_t idx_c          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
-        const size_t idx_w          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+        const size_t      idx_c     = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t      idx_w     = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
         const TensorShape src_shape = src->tensor_shape();
         const TensorShape wei_shape = wei->tensor_shape();
         const size_t      src_w     = src_shape[idx_w];
@@ -124,7 +134,7 @@
 
         desc.export_input_to_cl_image = false;
 
-        if(is_g71)
+        if (is_g71)
         {
             desc.export_weights_to_cl_image = false;
         }
@@ -133,9 +143,9 @@
             desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
         }
 
-        if(depth_multiplier == 1)
+        if (depth_multiplier == 1)
         {
-            if(desc.export_weights_to_cl_image == false)
+            if (desc.export_weights_to_cl_image == false)
             {
                 desc.n0 = 8;
             }
@@ -146,11 +156,11 @@
         }
         else
         {
-            if((depth_multiplier % 4) == 0)
+            if ((depth_multiplier % 4) == 0)
             {
                 desc.n0 = 4;
             }
-            else if((depth_multiplier % 2) == 0)
+            else if ((depth_multiplier % 2) == 0)
             {
                 desc.n0 = 2;
             }
@@ -161,20 +171,21 @@
         }
 
         // Note: If we reduce n0, export to cl_image must be false
-        ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+        ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+                             (desc.export_weights_to_cl_image == true));
 
         desc.n0 = adjust_vec_size(desc.n0, kernel_c);
 
         // Set m0 only if stride_x == 1 and dilation_x == 1
-        if(conv_info.stride().first == 1 && dilation.x() == 1)
+        if (conv_info.stride().first == 1 && dilation.x() == 1)
         {
-            if((kernel_w >= 9) || (kernel_w == 1))
+            if ((kernel_w >= 9) || (kernel_w == 1))
             {
                 desc.m0 = 1;
             }
             else
             {
-                if((src_w % 5) == 0)
+                if ((src_w % 5) == 0)
                 {
                     desc.m0 = 5;
                 }
@@ -194,27 +205,30 @@
 }
 } // namespace
 
-ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu)
-    : IClDWCNativeKernelConfig(gpu)
+ClDWCNativeDefaultConfigBifrost::ClDWCNativeDefaultConfigBifrost(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu)
 {
 }
 
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                                                unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure(const ITensorInfo   *src,
+                                                                const ITensorInfo   *wei,
+                                                                const PadStrideInfo &conv_info,
+                                                                const Size2D        &dilation,
+                                                                unsigned int         depth_multiplier)
 {
-    using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                                                   unsigned int depth_multiplier);
+    using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigBifrost::*)(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+        unsigned int depth_multiplier);
 
-    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClDWCNativeDefaultConfigBifrost::configure_G71_f32,
-                                                                         &ClDWCNativeDefaultConfigBifrost::configure_G71_f16,
-                                                                         &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(
+        &ClDWCNativeDefaultConfigBifrost::configure_G71_f32, &ClDWCNativeDefaultConfigBifrost::configure_G71_f16,
+        &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
 
-    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClDWCNativeDefaultConfigBifrost::configure_G7x_f32,
-                                                                         &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16,
-                                                                         &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(
+        &ClDWCNativeDefaultConfigBifrost::configure_G7x_f32, &ClDWCNativeDefaultConfigBifrost::configure_G7x_f16,
+        &ClDWCNativeDefaultConfigBifrost::configure_G7x_u8);
 
     ConfigurationFunctionExecutorPtr func = nullptr;
-    switch(_target)
+    switch (_target)
     {
         case GPUTarget::G71:
             func = configs_G71.get_function(src->data_type());
@@ -228,43 +242,58 @@
     return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
 }
 
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                                                        unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f32(const ITensorInfo   *src,
+                                                                        const ITensorInfo   *wei,
+                                                                        const PadStrideInfo &conv_info,
+                                                                        const Size2D        &dilation,
+                                                                        unsigned int         depth_multiplier)
 {
     return configure_f32(src, wei, conv_info, dilation, depth_multiplier, true);
 }
 
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                                                        unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G71_f16(const ITensorInfo   *src,
+                                                                        const ITensorInfo   *wei,
+                                                                        const PadStrideInfo &conv_info,
+                                                                        const Size2D        &dilation,
+                                                                        unsigned int         depth_multiplier)
 {
     return configure_f16(src, wei, conv_info, dilation, depth_multiplier, true);
 }
 
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                                                        unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f32(const ITensorInfo   *src,
+                                                                        const ITensorInfo   *wei,
+                                                                        const PadStrideInfo &conv_info,
+                                                                        const Size2D        &dilation,
+                                                                        unsigned int         depth_multiplier)
 {
     return configure_f32(src, wei, conv_info, dilation, depth_multiplier, false);
 }
 
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                                                        unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_f16(const ITensorInfo   *src,
+                                                                        const ITensorInfo   *wei,
+                                                                        const PadStrideInfo &conv_info,
+                                                                        const Size2D        &dilation,
+                                                                        unsigned int         depth_multiplier)
 {
     return configure_f16(src, wei, conv_info, dilation, depth_multiplier, false);
 }
 
-DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                                                        unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigBifrost::configure_G7x_u8(const ITensorInfo   *src,
+                                                                       const ITensorInfo   *wei,
+                                                                       const PadStrideInfo &conv_info,
+                                                                       const Size2D        &dilation,
+                                                                       unsigned int         depth_multiplier)
 {
     ARM_COMPUTE_UNUSED(wei);
 
     DWCComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         desc.export_input_to_cl_image   = false;
         desc.export_weights_to_cl_image = false;
         desc.n0                         = (depth_multiplier == 1) ? 4 : 1;
-        if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
+        if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
         {
             desc.m0 = 2;
         }
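
Note on the dispatch pattern in ClDWCNativeDefaultConfigBifrost::configure above: the heuristic keeps per-target arrays of pointer-to-member configuration functions, selects one entry by data type, and invokes it through (this->*func). A minimal stand-alone sketch of the same idiom, with purely illustrative names that are not part of the library:

#include <array>
#include <cstdio>

class Heuristic
{
public:
    int configure(bool is_f16)
    {
        // Pointer-to-member-function type, mirroring ConfigurationFunctionExecutorPtr.
        using FnPtr = int (Heuristic::*)();

        // One candidate configuration function per data type.
        const std::array<FnPtr, 2> configs = {&Heuristic::configure_f32, &Heuristic::configure_f16};

        FnPtr func = configs[is_f16 ? 1 : 0];

        // Dispatch through the selected member-function pointer.
        return (this->*func)();
    }

private:
    int configure_f32() { return 32; }
    int configure_f16() { return 16; }
};

int main()
{
    Heuristic h;
    std::printf("%d\n", h.configure(true)); // prints 16
}
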
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
index cec2cae..41d86c9 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigBifrost.h
@@ -41,20 +41,38 @@
     ClDWCNativeDefaultConfigBifrost(GPUTarget gpu);
 
     // Inherited overridden method
-    DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                   unsigned int depth_multiplier) override;
+    DWCComputeKernelInfo configure(const ITensorInfo   *src,
+                                   const ITensorInfo   *wei,
+                                   const PadStrideInfo &conv_info,
+                                   const Size2D        &dilation,
+                                   unsigned int         depth_multiplier) override;
 
 private:
-    DWCComputeKernelInfo configure_G71_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                           unsigned int depth_multiplier);
-    DWCComputeKernelInfo configure_G71_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                           unsigned int depth_multiplier);
-    DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                           unsigned int depth_multiplier);
-    DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                           unsigned int depth_multiplier);
-    DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                          unsigned int depth_multiplier);
+    DWCComputeKernelInfo configure_G71_f32(const ITensorInfo   *src,
+                                           const ITensorInfo   *wei,
+                                           const PadStrideInfo &conv_info,
+                                           const Size2D        &dilation,
+                                           unsigned int         depth_multiplier);
+    DWCComputeKernelInfo configure_G71_f16(const ITensorInfo   *src,
+                                           const ITensorInfo   *wei,
+                                           const PadStrideInfo &conv_info,
+                                           const Size2D        &dilation,
+                                           unsigned int         depth_multiplier);
+    DWCComputeKernelInfo configure_G7x_f32(const ITensorInfo   *src,
+                                           const ITensorInfo   *wei,
+                                           const PadStrideInfo &conv_info,
+                                           const Size2D        &dilation,
+                                           unsigned int         depth_multiplier);
+    DWCComputeKernelInfo configure_G7x_f16(const ITensorInfo   *src,
+                                           const ITensorInfo   *wei,
+                                           const PadStrideInfo &conv_info,
+                                           const Size2D        &dilation,
+                                           unsigned int         depth_multiplier);
+    DWCComputeKernelInfo configure_G7x_u8(const ITensorInfo   *src,
+                                          const ITensorInfo   *wei,
+                                          const PadStrideInfo &conv_info,
+                                          const Size2D        &dilation,
+                                          unsigned int         depth_multiplier);
 };
 } // namespace cl_dwc
 } // namespace arm_compute
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
index 51f3787..ef1bb38 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.cpp
@@ -22,7 +22,6 @@
  * SOFTWARE.
  */
 #include "src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h"
-#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
 
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/GPUTarget.h"
@@ -30,31 +29,36 @@
 #include "arm_compute/core/TensorShape.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 
+#include "src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.h"
+
 namespace arm_compute
 {
 namespace cl_dwc
 {
-ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu)
-    : IClDWCNativeKernelConfig(gpu)
+ClDWCNativeDefaultConfigValhall::ClDWCNativeDefaultConfigValhall(GPUTarget gpu) : IClDWCNativeKernelConfig(gpu)
 {
 }
 
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                                                unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure(const ITensorInfo   *src,
+                                                                const ITensorInfo   *wei,
+                                                                const PadStrideInfo &conv_info,
+                                                                const Size2D        &dilation,
+                                                                unsigned int         depth_multiplier)
 {
-    using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                                                   unsigned int depth_multiplier);
+    using ConfigurationFunctionExecutorPtr = DWCComputeKernelInfo (ClDWCNativeDefaultConfigValhall::*)(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
+        unsigned int depth_multiplier);
 
-    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClDWCNativeDefaultConfigValhall::configure_G78_f32,
-                                                                         &ClDWCNativeDefaultConfigValhall::configure_G78_f16,
-                                                                         &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(
+        &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G78_f16,
+        &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
 
-    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClDWCNativeDefaultConfigValhall::configure_G78_f32,
-                                                                         &ClDWCNativeDefaultConfigValhall::configure_G77_f16,
-                                                                         &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
+    ClDWCNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+        &ClDWCNativeDefaultConfigValhall::configure_G78_f32, &ClDWCNativeDefaultConfigValhall::configure_G77_f16,
+        &ClDWCNativeDefaultConfigValhall::configure_G78_u8);
 
     ConfigurationFunctionExecutorPtr func = nullptr;
-    switch(_target)
+    switch (_target)
     {
         case GPUTarget::G77:
             func = configs_G77.get_function(src->data_type());
@@ -69,15 +73,18 @@
     return (this->*func)(src, wei, conv_info, dilation, depth_multiplier);
 }
 
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                                                        unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f32(const ITensorInfo   *src,
+                                                                        const ITensorInfo   *wei,
+                                                                        const PadStrideInfo &conv_info,
+                                                                        const Size2D        &dilation,
+                                                                        unsigned int         depth_multiplier)
 {
     DWCComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
-        const size_t idx_c          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
-        const size_t idx_w          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+        const size_t      idx_c     = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t      idx_w     = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
         const TensorShape wei_shape = wei->tensor_shape();
         const size_t      kernel_c  = wei_shape[idx_c];
         const size_t      kernel_w  = wei_shape[idx_w];
@@ -85,17 +92,17 @@
         desc.export_input_to_cl_image   = false;
         desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
 
-        if(depth_multiplier == 1)
+        if (depth_multiplier == 1)
         {
             desc.n0 = 4;
         }
         else
         {
-            if((depth_multiplier % 4) == 0)
+            if ((depth_multiplier % 4) == 0)
             {
                 desc.n0 = 4;
             }
-            else if((depth_multiplier % 2) == 0)
+            else if ((depth_multiplier % 2) == 0)
             {
                 desc.n0 = 2;
             }
@@ -106,14 +113,15 @@
         }
 
         // Note: If we reduce n0, export to cl_image must be false
-        ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+        ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+                             (desc.export_weights_to_cl_image == true));
 
         desc.n0 = adjust_vec_size(desc.n0, kernel_c);
 
         // Set m0 only if stride_x == 1 and dilation_x == 1
-        if(conv_info.stride().first == 1 && dilation.x() == 1)
+        if (conv_info.stride().first == 1 && dilation.x() == 1)
         {
-            if((kernel_w >= 9) || (kernel_w == 1))
+            if ((kernel_w >= 9) || (kernel_w == 1))
             {
                 desc.m0 = 1;
             }
@@ -131,16 +139,19 @@
     return desc;
 }
 
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                                                        unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_f16(const ITensorInfo   *src,
+                                                                        const ITensorInfo   *wei,
+                                                                        const PadStrideInfo &conv_info,
+                                                                        const Size2D        &dilation,
+                                                                        unsigned int         depth_multiplier)
 {
     DWCComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         // Src and weights have the same dimension indices
-        const size_t idx_c          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
-        const size_t idx_w          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+        const size_t      idx_c     = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t      idx_w     = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
         const TensorShape src_shape = src->tensor_shape();
         const TensorShape wei_shape = wei->tensor_shape();
         const size_t      src_w     = src_shape[idx_w];
@@ -150,9 +161,9 @@
         desc.export_input_to_cl_image   = false;
         desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
 
-        if(depth_multiplier == 1)
+        if (depth_multiplier == 1)
         {
-            if(desc.export_weights_to_cl_image == false)
+            if (desc.export_weights_to_cl_image == false)
             {
                 desc.n0 = 8;
             }
@@ -163,11 +174,11 @@
         }
         else
         {
-            if((depth_multiplier % 4) == 0)
+            if ((depth_multiplier % 4) == 0)
             {
                 desc.n0 = 4;
             }
-            else if((depth_multiplier % 2) == 0)
+            else if ((depth_multiplier % 2) == 0)
             {
                 desc.n0 = 2;
             }
@@ -178,20 +189,21 @@
         }
 
         // Note: If we reduce n0, export to cl_image must be false
-        ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+        ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+                             (desc.export_weights_to_cl_image == true));
 
         desc.n0 = adjust_vec_size(desc.n0, kernel_c);
 
         // Set m0 only if stride_x == 1 and dilation_x == 1
-        if(conv_info.stride().first == 1 && dilation.x() == 1)
+        if (conv_info.stride().first == 1 && dilation.x() == 1)
         {
-            if((kernel_w >= 9) || (kernel_w == 1))
+            if ((kernel_w >= 9) || (kernel_w == 1))
             {
                 desc.m0 = 1;
             }
             else
             {
-                if((src_w % 5) == 0)
+                if ((src_w % 5) == 0)
                 {
                     desc.m0 = 5;
                 }
@@ -210,19 +222,22 @@
     return desc;
 }
 
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                                                       unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G78_u8(const ITensorInfo   *src,
+                                                                       const ITensorInfo   *wei,
+                                                                       const PadStrideInfo &conv_info,
+                                                                       const Size2D        &dilation,
+                                                                       unsigned int         depth_multiplier)
 {
     ARM_COMPUTE_UNUSED(wei);
 
     DWCComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
         desc.export_input_to_cl_image   = false;
         desc.export_weights_to_cl_image = false;
         desc.n0                         = (depth_multiplier == 1) ? 4 : 1;
-        if(conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
+        if (conv_info.stride().first == 1 && dilation.x() == 1 && depth_multiplier == 1)
         {
             desc.m0 = 2;
         }
@@ -235,15 +250,18 @@
     return desc;
 }
 
-DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                                                        unsigned int depth_multiplier)
+DWCComputeKernelInfo ClDWCNativeDefaultConfigValhall::configure_G77_f16(const ITensorInfo   *src,
+                                                                        const ITensorInfo   *wei,
+                                                                        const PadStrideInfo &conv_info,
+                                                                        const Size2D        &dilation,
+                                                                        unsigned int         depth_multiplier)
 {
     DWCComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
-        const size_t idx_c          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
-        const size_t idx_w          = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
+        const size_t      idx_c     = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::CHANNEL);
+        const size_t      idx_w     = get_data_layout_dimension_index(wei->data_layout(), DataLayoutDimension::WIDTH);
         const TensorShape wei_shape = wei->tensor_shape();
         const size_t      kernel_c  = wei_shape[idx_c];
         const size_t      kernel_w  = wei_shape[idx_w];
@@ -251,9 +269,9 @@
         desc.export_input_to_cl_image   = false;
         desc.export_weights_to_cl_image = use_cl_image_for_weights(wei, depth_multiplier);
 
-        if(depth_multiplier == 1)
+        if (depth_multiplier == 1)
         {
-            if(desc.export_weights_to_cl_image == false)
+            if (desc.export_weights_to_cl_image == false)
             {
                 desc.n0 = 8;
             }
@@ -264,11 +282,11 @@
         }
         else
         {
-            if((depth_multiplier % 4) == 0)
+            if ((depth_multiplier % 4) == 0)
             {
                 desc.n0 = 4;
             }
-            else if((depth_multiplier % 2) == 0)
+            else if ((depth_multiplier % 2) == 0)
             {
                 desc.n0 = 2;
             }
@@ -279,14 +297,15 @@
         }
 
         // Note: If we reduce n0, export to cl_image must be false
-        ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) && (desc.export_weights_to_cl_image == true));
+        ARM_COMPUTE_ERROR_ON((adjust_vec_size(desc.n0, kernel_c) != desc.n0) &&
+                             (desc.export_weights_to_cl_image == true));
 
         desc.n0 = adjust_vec_size(desc.n0, kernel_c);
 
         // Set m0 only if stride_x == 1 and dilation_x == 1
-        if(conv_info.stride().first == 1 && dilation.x() == 1)
+        if (conv_info.stride().first == 1 && dilation.x() == 1)
         {
-            if((kernel_w >= 9) || (kernel_w == 1))
+            if ((kernel_w >= 9) || (kernel_w == 1))
             {
                 desc.m0 = 1;
             }
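
The n0 selection in configure_G78_f32/f16 and configure_G77_f16 above follows one rule: take the widest channel block whose width divides the depth multiplier, then clamp it to the actual channel count. A simplified sketch of that rule; pick_n0 is a hypothetical helper, and a plain std::min stands in for the library's adjust_vec_size:

#include <algorithm>

unsigned int pick_n0(unsigned int depth_multiplier, unsigned int kernel_c)
{
    unsigned int n0 = 1;
    if (depth_multiplier == 1)
    {
        n0 = 4; // one output channel per input channel: use the full block
    }
    else if ((depth_multiplier % 4) == 0)
    {
        n0 = 4;
    }
    else if ((depth_multiplier % 2) == 0)
    {
        n0 = 2;
    }
    // If n0 gets reduced here, export to cl_image must be disabled
    // (see the ARM_COMPUTE_ERROR_ON checks above).
    return std::min(n0, kernel_c);
}
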
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
index 4d51fa6..fabce77 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeDefaultConfigValhall.h
@@ -41,18 +41,33 @@
     ClDWCNativeDefaultConfigValhall(GPUTarget gpu);
 
     // Inherited overridden method
-    DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                   unsigned int depth_multiplier) override;
+    DWCComputeKernelInfo configure(const ITensorInfo   *src,
+                                   const ITensorInfo   *wei,
+                                   const PadStrideInfo &conv_info,
+                                   const Size2D        &dilation,
+                                   unsigned int         depth_multiplier) override;
 
 private:
-    DWCComputeKernelInfo configure_G78_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                           unsigned int depth_multiplier);
-    DWCComputeKernelInfo configure_G78_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                           unsigned int depth_multiplier);
-    DWCComputeKernelInfo configure_G78_u8(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                          unsigned int depth_multiplier);
-    DWCComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                           unsigned int depth_multiplier);
+    DWCComputeKernelInfo configure_G78_f32(const ITensorInfo   *src,
+                                           const ITensorInfo   *wei,
+                                           const PadStrideInfo &conv_info,
+                                           const Size2D        &dilation,
+                                           unsigned int         depth_multiplier);
+    DWCComputeKernelInfo configure_G78_f16(const ITensorInfo   *src,
+                                           const ITensorInfo   *wei,
+                                           const PadStrideInfo &conv_info,
+                                           const Size2D        &dilation,
+                                           unsigned int         depth_multiplier);
+    DWCComputeKernelInfo configure_G78_u8(const ITensorInfo   *src,
+                                          const ITensorInfo   *wei,
+                                          const PadStrideInfo &conv_info,
+                                          const Size2D        &dilation,
+                                          unsigned int         depth_multiplier);
+    DWCComputeKernelInfo configure_G77_f16(const ITensorInfo   *src,
+                                           const ITensorInfo   *wei,
+                                           const PadStrideInfo &conv_info,
+                                           const Size2D        &dilation,
+                                           unsigned int         depth_multiplier);
 };
 } // namespace cl_dwc
 } // namespace arm_compute
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
index 5593c6d..c8b006c 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeHeuristicsHelpers.cpp
@@ -32,7 +32,7 @@
 bool use_cl_image_for_weights(const ITensorInfo *weights, unsigned int depth_multiplier)
 {
     // Check whether we can use the cl image with the weights.
-    if(!export_to_cl_image(weights))
+    if (!export_to_cl_image(weights))
     {
         return false;
     }
@@ -45,12 +45,12 @@
     // If we can use the cl image storage with the weights, we prefer to use the cl buffer storage in the following cases for performance reasons:
     // 1- When the kernel size is 1x1
     // 2- When the depth multiplier is greater than 1 and not a multiple of 4.
-    if((kernel_w == 1) && (kernel_h == 1))
+    if ((kernel_w == 1) && (kernel_h == 1))
     {
         return false;
     }
 
-    if((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
+    if ((depth_multiplier > 1) && (depth_multiplier % 4) != 0)
     {
         return false;
     }
@@ -58,4 +58,4 @@
     return true;
 }
 } // namespace cl_dwc
-} // namespace arm_compute
\ No newline at end of file
+} // namespace arm_compute
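
The predicate above encodes two early-outs on top of the export_to_cl_image capability check. A stand-alone restatement with the cases spelled out; prefer_cl_buffer is a hypothetical name, and the capability check is assumed to have already passed:

#include <cassert>

bool prefer_cl_buffer(unsigned int kernel_w, unsigned int kernel_h, unsigned int depth_multiplier)
{
    // Case 1 from the comment above: the kernel size is 1x1.
    if ((kernel_w == 1) && (kernel_h == 1))
    {
        return true;
    }
    // Case 2: the depth multiplier is greater than 1 and not a multiple of 4.
    return (depth_multiplier > 1) && (depth_multiplier % 4) != 0;
}

int main()
{
    assert(prefer_cl_buffer(1, 1, 1));  // pointwise kernel
    assert(prefer_cl_buffer(3, 3, 6));  // 6 % 4 != 0
    assert(!prefer_cl_buffer(3, 3, 4)); // multiple of 4: keep cl_image
}
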
diff --git a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
index c08053d..49ce6ff 100644
--- a/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
+++ b/src/runtime/heuristics/dwc_native/ClDWCNativeKernelConfig.h
@@ -46,7 +46,7 @@
      */
     static std::unique_ptr<IClDWCNativeKernelConfig> create(GPUTarget gpu)
     {
-        switch(get_arch_from_target(gpu))
+        switch (get_arch_from_target(gpu))
         {
             case GPUTarget::MIDGARD:
                 // The heuristic for Midgard is the same as the one used for Arm Mali-G71
diff --git a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
index b5df132..614a662 100644
--- a/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
+++ b/src/runtime/heuristics/dwc_native/IClDWCNativeKernelConfig.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/core/common/Macros.h"
 
 namespace arm_compute
@@ -52,8 +53,7 @@
      * @param[in] func_int8 Function to call for depthwise convolution Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
      *
      */
-    ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8)
-        : _configs{ func_f32, func_f16, func_int8 }
+    ClDWCNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
     {
     }
 
@@ -65,7 +65,7 @@
      */
     T get_function(DataType data_type)
     {
-        switch(data_type)
+        switch (data_type)
         {
             case DataType::F32:
                 return _configs.at(DT_F32);
@@ -92,8 +92,7 @@
      *
      * @param[in] arch GPU target
      */
-    IClDWCNativeKernelConfig(GPUTarget arch)
-        : _target(arch)
+    IClDWCNativeKernelConfig(GPUTarget arch) : _target(arch)
     {
     }
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClDWCNativeKernelConfig);
@@ -107,8 +106,11 @@
      * @param[in] dilation         Kernel dilation
      * @param[in] depth_multiplier Output feature maps multiplier
      */
-    virtual DWCComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info, const Size2D &dilation,
-                                           unsigned int depth_multiplier) = 0;
+    virtual DWCComputeKernelInfo configure(const ITensorInfo   *src,
+                                           const ITensorInfo   *wei,
+                                           const PadStrideInfo &conv_info,
+                                           const Size2D        &dilation,
+                                           unsigned int         depth_multiplier) = 0;
 
 protected:
     GPUTarget _target;
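
IClDWCNativeKernelConfig above pairs a per-data-type function array with the architecture-keyed factory in ClDWCNativeKernelConfig.h. A compressed sketch of how the two pieces compose; the enum values, class names, and configure signature below are placeholders, not the library API:

#include <memory>
#include <stdexcept>

enum class Arch { Bifrost, Valhall };

struct IKernelConfig
{
    virtual ~IKernelConfig() = default;
    virtual int configure() = 0; // stand-in for the real configure(src, wei, ...) signature
};

struct BifrostConfig : IKernelConfig { int configure() override { return 71; } };
struct ValhallConfig : IKernelConfig { int configure() override { return 77; } };

// Factory keyed on GPU architecture, mirroring the create(GPUTarget)
// switch in ClDWCNativeKernelConfig.h.
std::unique_ptr<IKernelConfig> create(Arch arch)
{
    switch (arch)
    {
        case Arch::Bifrost:
            return std::make_unique<BifrostConfig>();
        case Arch::Valhall:
            return std::make_unique<ValhallConfig>();
    }
    throw std::runtime_error("unsupported target");
}
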
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
index 990f050..3380d8f 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.cpp
@@ -35,17 +35,19 @@
 {
 using namespace arm_compute::misc::shape_calculator;
 
-ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu)
-    : IClIndirectConvKernelConfig(gpu)
+ClIndirectConvDefaultConfigValhall::ClIndirectConvDefaultConfigValhall(GPUTarget gpu) : IClIndirectConvKernelConfig(gpu)
 {
 }
 
-DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure(const ITensorInfo   *src,
+                                                                          const ITensorInfo   *wei,
+                                                                          const PadStrideInfo &conv_info)
 {
-    using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    using ConfigurationFunctionExecutorPtr = DirectConvComputeKernelInfo (ClIndirectConvDefaultConfigValhall::*)(
+        const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
 
-    ClIndirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClIndirectConvDefaultConfigValhall::configure_G77_f32,
-                                                                            &ClIndirectConvDefaultConfigValhall::configure_G77_f16);
+    ClIndirectConvConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(
+        &ClIndirectConvDefaultConfigValhall::configure_G77_f32, &ClIndirectConvDefaultConfigValhall::configure_G77_f16);
 
     // Important note: Indirect convolution should not be used when the kernel size is 1x1 (pointwise), because the indirect buffer makes
     // indirect convolution less efficient than direct convolution or gemm. For this reason, the heuristic of indirect convolution has not been tuned
@@ -57,22 +59,24 @@
     return (this->*func)(src, wei, conv_info);
 }
 
-DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f32(const ITensorInfo   *src,
+                                                                                  const ITensorInfo   *wei,
+                                                                                  const PadStrideInfo &conv_info)
 {
     DirectConvComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
-        const TensorShape dst_shape                  = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+        const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
         const bool        export_weights_to_cl_image = export_to_cl_image(wei);
-        const int32_t stride_x     = conv_info.stride().first;
-        const int32_t stride_y     = conv_info.stride().second;
-        const int32_t ofm          = dst_shape[0];
-        const int32_t m            = (dst_shape[1]/ stride_x) * (dst_shape[2] / stride_y);
+        const int32_t     stride_x                   = conv_info.stride().first;
+        const int32_t     stride_y                   = conv_info.stride().second;
+        const int32_t     ofm                        = dst_shape[0];
+        const int32_t     m                          = (dst_shape[1] / stride_x) * (dst_shape[2] / stride_y);
 
         desc.export_weights_to_cl_image = export_weights_to_cl_image;
 
-        if(ofm <= 4)
+        if (ofm <= 4)
         {
             desc.m0 = 1;
             desc.n0 = 2;
@@ -82,7 +86,7 @@
         {
             // The 16000 threshold value has been identified as the right
             // one for using the biggest block size allowed on F32: 5x4x4
-            if(m < 16000)
+            if (m < 16000)
             {
                 desc.m0 = 4;
                 desc.n0 = 4;
@@ -100,31 +104,33 @@
     return desc;
 }
 
-DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info)
+DirectConvComputeKernelInfo ClIndirectConvDefaultConfigValhall::configure_G77_f16(const ITensorInfo   *src,
+                                                                                  const ITensorInfo   *wei,
+                                                                                  const PadStrideInfo &conv_info)
 {
     DirectConvComputeKernelInfo desc;
 
-    if(src->data_layout() == DataLayout::NHWC)
+    if (src->data_layout() == DataLayout::NHWC)
     {
-        const TensorShape wei_shape                  = wei->tensor_shape();
-        const TensorShape dst_shape                  = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
+        const TensorShape wei_shape = wei->tensor_shape();
+        const TensorShape dst_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *wei, conv_info);
         const bool        export_weights_to_cl_image = export_to_cl_image(wei);
 
-        const int32_t ofm          = dst_shape[0];
-        const int32_t m            = dst_shape[1] * dst_shape[2];
-        const int32_t k            = wei_shape[0];
+        const int32_t ofm = dst_shape[0];
+        const int32_t m   = dst_shape[1] * dst_shape[2];
+        const int32_t k   = wei_shape[0];
 
         desc.export_weights_to_cl_image = export_weights_to_cl_image;
 
-        if(ofm <= 4)
+        if (ofm <= 4)
         {
             // k0 should be as large as possible. However, we should avoid
             // having leftover for-loops that make the implementation slower.
-            if((k % 16) == 0)
+            if ((k % 16) == 0)
             {
                 desc.k0 = 16;
             }
-            else if((k % 8) == 0)
+            else if ((k % 8) == 0)
             {
                 desc.k0 = 8;
             }
@@ -140,11 +146,11 @@
         {
             // The 16000 threshold value has been identified as the right
             // one for using the biggest block size allowed on F16: 8x4
-            if(m >= 16000 && k < 4)
+            if (m >= 16000 && k < 4)
             {
                 desc.m0 = 8;
                 desc.n0 = 4;
-                desc.k0 = 4;    // k0 is clamped to k inside the kernel when k is less than 4
+                desc.k0 = 4; // k0 is clamped to k inside the kernel when k is less than 4
             }
             else
             {
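
The k0 selection in configure_G77_f16 above picks the largest block that divides k exactly, avoiding the leftover for-loops called out in the comment. Only the 16 and 8 branches are visible in this hunk; the sketch below assumes smaller fallbacks purely for illustration:

#include <initializer_list>

// Hypothetical restatement: choose the widest k0 that divides k exactly.
unsigned int pick_k0(unsigned int k)
{
    for (unsigned int candidate : {16u, 8u, 4u, 2u})
    {
        if ((k % candidate) == 0)
        {
            return candidate;
        }
    }
    return 1;
}
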
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
index 68dca91..bab808c 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvDefaultConfigValhall.h
@@ -41,11 +41,14 @@
     ClIndirectConvDefaultConfigValhall(GPUTarget gpu);
 
     // Inherited overridden method
-    DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
+    DirectConvComputeKernelInfo
+    configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) override;
 
 private:
-    DirectConvComputeKernelInfo configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
-    DirectConvComputeKernelInfo configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo
+    configure_G77_f32(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
+    DirectConvComputeKernelInfo
+    configure_G77_f16(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info);
 };
 } // namespace cl_indirect_conv
 } // namespace arm_compute
diff --git a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
index 73fbb87..dd614e1 100644
--- a/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
+++ b/src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h
@@ -45,7 +45,7 @@
      */
     static std::unique_ptr<IClIndirectConvKernelConfig> create(GPUTarget gpu)
     {
-        switch(get_arch_from_target(gpu))
+        switch (get_arch_from_target(gpu))
         {
             case GPUTarget::MIDGARD:
             case GPUTarget::BIFROST:
diff --git a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
index d2f4cde..d05da18 100644
--- a/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
+++ b/src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h
@@ -27,6 +27,7 @@
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
+
 #include "src/core/common/Macros.h"
 
 namespace arm_compute
@@ -49,8 +50,7 @@
      * @param[in] func_f16 Function to call for indirect convolution F16
      *
      */
-    ClIndirectConvConfigArray(T func_f32, T func_f16)
-        : _configs{ func_f32, func_f16}
+    ClIndirectConvConfigArray(T func_f32, T func_f16) : _configs{func_f32, func_f16}
     {
     }
 
@@ -62,7 +62,7 @@
      */
     T get_function(DataType data_type)
     {
-        switch(data_type)
+        switch (data_type)
         {
             case DataType::F32:
                 return _configs.at(DT_F32);
@@ -85,8 +85,7 @@
      *
      * @param[in] arch GPU target
      */
-    IClIndirectConvKernelConfig(GPUTarget arch)
-        : _target(arch)
+    IClIndirectConvKernelConfig(GPUTarget arch) : _target(arch)
     {
     }
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClIndirectConvKernelConfig);
@@ -98,7 +97,8 @@
      * @param[in] wei       Weights tensor
      * @param[in] conv_info Convolution info
      */
-    virtual DirectConvComputeKernelInfo configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
+    virtual DirectConvComputeKernelInfo
+    configure(const ITensorInfo *src, const ITensorInfo *wei, const PadStrideInfo &conv_info) = 0;
 
 protected:
     GPUTarget _target;
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
index 01102b3..b3c8d89 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.cpp
@@ -28,30 +28,33 @@
 #include "arm_compute/core/GPUTarget.h"
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
-#include <utility>
 
+#include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
 #include "src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h"
 
+#include <utility>
+
 namespace arm_compute
 {
 namespace cl_matmul
 {
-ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu)
-    : IClMatMulNativeKernelConfig(gpu)
+ClMatMulNativeDefaultConfigValhall::ClMatMulNativeDefaultConfigValhall(GPUTarget gpu) : IClMatMulNativeKernelConfig(gpu)
 {
 }
 
-MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info)
+MatMulKernelInfo
+ClMatMulNativeDefaultConfigValhall::configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info)
 {
-    using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo & info);
+    using ConfigurationFunctionExecutorPtr = MatMulKernelInfo (ClMatMulNativeDefaultConfigValhall::*)(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
 
-    ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(&ClMatMulNativeDefaultConfigValhall::configure_G710_f32,
-                                                                             &ClMatMulNativeDefaultConfigValhall::configure_G710_f16,
-                                                                             &ClMatMulNativeDefaultConfigValhall::configure_G710_u8);
+    ClMatMulNativeConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(
+        &ClMatMulNativeDefaultConfigValhall::configure_G710_f32,
+        &ClMatMulNativeDefaultConfigValhall::configure_G710_f16,
+        &ClMatMulNativeDefaultConfigValhall::configure_G710_u8);
 
     ConfigurationFunctionExecutorPtr func = nullptr;
-    switch(_target)
+    switch (_target)
     {
         case GPUTarget::G710:
         default:
@@ -67,7 +70,7 @@
 
     const bool is_batched = lhs_shape.num_dimensions() > 2;
 
-    if(is_batched == true)
+    if (is_batched == true)
     {
         lhs_shape.collapse_from(2);
     }
@@ -81,103 +84,48 @@
     return (this->*func)(m, n, k, b, rhs->lock_paddings(), info);
 }
 
-MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f32(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
 {
-    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt =
-    {
-        { 3136, 64, 64, 36, 4, 4, 16, 1 },
-        { 4096, 48, 32, 36, 4, 4, 4, 1 },
-        { 688, 92, 68, 32, 2, 8, 4, 1 },
-        { 24, 464, 412, 24, 2, 8, 4, 1 },
-        { 112, 184, 144, 28, 4, 4, 16, 1 },
-        { 5776, 64, 32, 36, 2, 4, 16, 1 },
-        { 1568, 64, 40, 36, 2, 8, 8, 1 },
-        { 2920, 64, 64, 24, 4, 4, 16, 1 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+        {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1},   {688, 92, 68, 32, 2, 8, 4, 1},
+        {24, 464, 412, 24, 2, 8, 4, 1},  {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 4, 16, 1},
+        {1568, 64, 40, 36, 2, 8, 8, 1},  {2920, 64, 64, 24, 4, 4, 16, 1}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt =
-    {
-        { 3136, 64, 64, 36, 4, 4, 8, 0 },
-        { 4096, 48, 32, 36, 4, 4, 8, 0 },
-        { 688, 92, 68, 32, 5, 4, 4, 0 },
-        { 24, 464, 412, 24, 6, 2, 8, 0 },
-        { 112, 184, 144, 28, 6, 4, 4, 0 },
-        { 5776, 64, 32, 36, 5, 4, 4, 0 },
-        { 1568, 64, 40, 36, 4, 4, 8, 0 },
-        { 2920, 64, 64, 24, 4, 4, 8, 0 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = {
+        {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0},  {688, 92, 68, 32, 5, 4, 4, 0},
+        {24, 464, 412, 24, 6, 2, 8, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0},
+        {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t =
-    {
-        { 3136, 64, 64, 36, 4, 4, 4, 1 },
-        { 4096, 48, 32, 36, 2, 2, 16, 1 },
-        { 688, 92, 68, 32, 4, 4, 4, 1 },
-        { 24, 464, 412, 24, 6, 2, 8, 1 },
-        { 112, 184, 144, 28, 4, 2, 16, 1 },
-        { 5776, 64, 32, 36, 4, 4, 4, 1 },
-        { 1568, 64, 40, 36, 4, 4, 8, 1 },
-        { 2920, 64, 64, 24, 4, 4, 4, 1 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+        {3136, 64, 64, 36, 4, 4, 4, 1}, {4096, 48, 32, 36, 2, 2, 16, 1},  {688, 92, 68, 32, 4, 4, 4, 1},
+        {24, 464, 412, 24, 6, 2, 8, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+        {1568, 64, 40, 36, 4, 4, 8, 1}, {2920, 64, 64, 24, 4, 4, 4, 1}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t =
-    {
-        { 3136, 64, 64, 36, 5, 4, 4, 0 },
-        { 4096, 48, 32, 36, 5, 4, 4, 0 },
-        { 688, 92, 68, 32, 5, 4, 4, 0 },
-        { 24, 464, 412, 24, 6, 2, 4, 0 },
-        { 112, 184, 144, 28, 5, 4, 4, 0 },
-        { 5776, 64, 32, 36, 5, 4, 4, 0 },
-        { 1568, 64, 40, 36, 5, 4, 4, 0 },
-        { 2920, 64, 64, 24, 6, 2, 4, 0 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = {
+        {3136, 64, 64, 36, 5, 4, 4, 0}, {4096, 48, 32, 36, 5, 4, 4, 0},  {688, 92, 68, 32, 5, 4, 4, 0},
+        {24, 464, 412, 24, 6, 2, 4, 0}, {112, 184, 144, 28, 5, 4, 4, 0}, {5776, 64, 32, 36, 5, 4, 4, 0},
+        {1568, 64, 40, 36, 5, 4, 4, 0}, {2920, 64, 64, 24, 6, 2, 4, 0}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt =
-    {
-        { 3136, 64, 64, 36, 4, 4, 16, 1 },
-        { 4096, 48, 32, 36, 4, 4, 4, 1 },
-        { 688, 92, 68, 32, 2, 8, 4, 1 },
-        { 24, 464, 412, 24, 2, 8, 4, 1 },
-        { 112, 184, 144, 28, 4, 4, 16, 1 },
-        { 5776, 64, 32, 36, 2, 8, 8, 1 },
-        { 1568, 64, 40, 36, 4, 4, 8, 1 },
-        { 2920, 64, 64, 24, 4, 4, 16, 1 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+        {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1},   {688, 92, 68, 32, 2, 8, 4, 1},
+        {24, 464, 412, 24, 2, 8, 4, 1},  {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 2, 8, 8, 1},
+        {1568, 64, 40, 36, 4, 4, 8, 1},  {2920, 64, 64, 24, 4, 4, 16, 1}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt =
-    {
-        { 3136, 64, 64, 36, 4, 4, 4, 0 },
-        { 4096, 48, 32, 36, 4, 4, 4, 0 },
-        { 688, 92, 68, 32, 4, 4, 4, 0 },
-        { 24, 464, 412, 24, 4, 4, 4, 0 },
-        { 112, 184, 144, 28, 4, 4, 4, 0 },
-        { 5776, 64, 32, 36, 4, 4, 8, 0 },
-        { 1568, 64, 40, 36, 4, 4, 4, 0 },
-        { 2920, 64, 64, 24, 4, 4, 4, 0 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = {
+        {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0},  {688, 92, 68, 32, 4, 4, 4, 0},
+        {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+        {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_best_t_t =
-    {
-        { 3136, 64, 64, 36, 4, 4, 4, 1 },
-        { 4096, 48, 32, 36, 4, 4, 4, 1 },
-        { 688, 92, 68, 32, 4, 4, 4, 1 },
-        { 24, 464, 412, 24, 2, 2, 16, 1 },
-        { 112, 184, 144, 28, 4, 4, 4, 1 },
-        { 5776, 64, 32, 36, 4, 4, 4, 1 },
-        { 1568, 64, 40, 36, 4, 4, 4, 1 },
-        { 2920, 64, 64, 24, 4, 4, 4, 1 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+        {3136, 64, 64, 36, 4, 4, 4, 1},  {4096, 48, 32, 36, 4, 4, 4, 1},  {688, 92, 68, 32, 4, 4, 4, 1},
+        {24, 464, 412, 24, 2, 2, 16, 1}, {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+        {1568, 64, 40, 36, 4, 4, 4, 1},  {2920, 64, 64, 24, 4, 4, 4, 1}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t =
-    {
-        { 3136, 64, 64, 36, 4, 4, 4, 0 },
-        { 4096, 48, 32, 36, 4, 4, 4, 0 },
-        { 688, 92, 68, 32, 4, 4, 4, 0 },
-        { 24, 464, 412, 24, 4, 2, 8, 0 },
-        { 112, 184, 144, 28, 4, 4, 4, 0 },
-        { 5776, 64, 32, 36, 4, 4, 4, 0 },
-        { 1568, 64, 40, 36, 4, 4, 4, 0 },
-        { 2920, 64, 64, 24, 4, 4, 4, 0 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = {
+        {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0},  {688, 92, 68, 32, 4, 4, 4, 0},
+        {24, 464, 412, 24, 4, 2, 8, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+        {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
 
     const bool adj_lhs = info.adj_lhs();
     const bool adj_rhs = info.adj_rhs();
@@ -185,17 +133,17 @@
     const MatMulNativeConfigsMatrix *configs_best_to_use     = nullptr;
     const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr;
 
-    if((adj_lhs == false) && (adj_rhs == false))
+    if ((adj_lhs == false) && (adj_rhs == false))
     {
         configs_best_to_use     = &configs_mnkb_best_nt_nt;
         configs_fallback_to_use = &configs_mnkb_fallback_nt_nt;
     }
-    else if((adj_lhs == false) && (adj_rhs == true))
+    else if ((adj_lhs == false) && (adj_rhs == true))
     {
         configs_best_to_use     = &configs_mnkb_best_nt_t;
         configs_fallback_to_use = &configs_mnkb_fallback_nt_t;
     }
-    else if((adj_lhs == true) && (adj_rhs == false))
+    else if ((adj_lhs == true) && (adj_rhs == false))
     {
         configs_best_to_use     = &configs_mnkb_best_t_nt;
         configs_fallback_to_use = &configs_mnkb_fallback_t_nt;
@@ -209,108 +157,51 @@
     MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b);
     MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b);
 
-    return select_info(desc0,
-                       desc1,
-                       m, n, k, b, DataType::F32, rhs_lock_padding);
+    return select_info(desc0, desc1, m, n, k, b, DataType::F32, rhs_lock_padding);
 }
 
-MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_f16(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
 {
-    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt =
-    {
-        { 3136, 64, 64, 36, 4, 4, 16, 1 },
-        { 4096, 48, 32, 36, 4, 4, 8, 1 },
-        { 688, 92, 68, 32, 4, 4, 16, 1 },
-        { 24, 464, 412, 24, 4, 4, 4, 1 },
-        { 112, 184, 144, 28, 4, 4, 16, 1 },
-        { 5776, 64, 32, 36, 4, 4, 8, 1 },
-        { 1568, 64, 40, 36, 4, 4, 8, 1 },
-        { 2920, 64, 64, 24, 4, 4, 16, 1 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+        {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1},   {688, 92, 68, 32, 4, 4, 16, 1},
+        {24, 464, 412, 24, 4, 4, 4, 1},  {112, 184, 144, 28, 4, 4, 16, 1}, {5776, 64, 32, 36, 4, 4, 8, 1},
+        {1568, 64, 40, 36, 4, 4, 8, 1},  {2920, 64, 64, 24, 4, 4, 16, 1}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt =
-    {
-        { 3136, 64, 64, 36, 6, 4, 8, 0 },
-        { 4096, 48, 32, 36, 6, 4, 8, 0 },
-        { 688, 92, 68, 32, 6, 4, 8, 0 },
-        { 24, 464, 412, 24, 4, 4, 8, 0 },
-        { 112, 184, 144, 28, 6, 4, 8, 0 },
-        { 5776, 64, 32, 36, 6, 4, 8, 0 },
-        { 1568, 64, 40, 36, 6, 4, 8, 0 },
-        { 2920, 64, 64, 24, 6, 4, 8, 0 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_nt = {
+        {3136, 64, 64, 36, 6, 4, 8, 0}, {4096, 48, 32, 36, 6, 4, 8, 0},  {688, 92, 68, 32, 6, 4, 8, 0},
+        {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 6, 4, 8, 0}, {5776, 64, 32, 36, 6, 4, 8, 0},
+        {1568, 64, 40, 36, 6, 4, 8, 0}, {2920, 64, 64, 24, 6, 4, 8, 0}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t =
-    {
-        { 3136, 64, 64, 36, 6, 4, 8, 1 },
-        { 4096, 48, 32, 36, 6, 4, 8, 1 },
-        { 688, 92, 68, 32, 4, 4, 4, 1 },
-        { 24, 464, 412, 24, 6, 2, 4, 1 },
-        { 112, 184, 144, 28, 4, 2, 16, 1 },
-        { 5776, 64, 32, 36, 6, 4, 8, 1 },
-        { 1568, 64, 40, 36, 6, 4, 8, 1 },
-        { 2920, 64, 64, 24, 6, 4, 8, 1 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+        {3136, 64, 64, 36, 6, 4, 8, 1}, {4096, 48, 32, 36, 6, 4, 8, 1},   {688, 92, 68, 32, 4, 4, 4, 1},
+        {24, 464, 412, 24, 6, 2, 4, 1}, {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 6, 4, 8, 1},
+        {1568, 64, 40, 36, 6, 4, 8, 1}, {2920, 64, 64, 24, 6, 4, 8, 1}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t =
-    {
-        { 3136, 64, 64, 36, 6, 2, 16, 0 },
-        { 4096, 48, 32, 36, 5, 4, 8, 0 },
-        { 688, 92, 68, 32, 6, 2, 16, 0 },
-        { 24, 464, 412, 24, 6, 2, 16, 0 },
-        { 112, 184, 144, 28, 6, 2, 16, 0 },
-        { 5776, 64, 32, 36, 5, 4, 8, 0 },
-        { 1568, 64, 40, 36, 5, 4, 8, 0 },
-        { 2920, 64, 64, 24, 6, 2, 16, 0 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_nt_t = {
+        {3136, 64, 64, 36, 6, 2, 16, 0}, {4096, 48, 32, 36, 5, 4, 8, 0},   {688, 92, 68, 32, 6, 2, 16, 0},
+        {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 6, 2, 16, 0}, {5776, 64, 32, 36, 5, 4, 8, 0},
+        {1568, 64, 40, 36, 5, 4, 8, 0},  {2920, 64, 64, 24, 6, 2, 16, 0}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt =
-    {
-        { 3136, 64, 64, 36, 4, 4, 16, 1 },
-        { 4096, 48, 32, 36, 4, 4, 4, 1 },
-        { 688, 92, 68, 32, 4, 4, 4, 1 },
-        { 24, 464, 412, 24, 4, 4, 4, 1 },
-        { 112, 184, 144, 28, 4, 4, 4, 1 },
-        { 5776, 64, 32, 36, 4, 4, 4, 1 },
-        { 1568, 64, 40, 36, 4, 4, 4, 1 },
-        { 2920, 64, 64, 24, 4, 4, 4, 1 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+        {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 4, 1},  {688, 92, 68, 32, 4, 4, 4, 1},
+        {24, 464, 412, 24, 4, 4, 4, 1},  {112, 184, 144, 28, 4, 4, 4, 1}, {5776, 64, 32, 36, 4, 4, 4, 1},
+        {1568, 64, 40, 36, 4, 4, 4, 1},  {2920, 64, 64, 24, 4, 4, 4, 1}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt =
-    {
-        { 3136, 64, 64, 36, 4, 4, 4, 0 },
-        { 4096, 48, 32, 36, 4, 4, 4, 0 },
-        { 688, 92, 68, 32, 4, 4, 4, 0 },
-        { 24, 464, 412, 24, 4, 4, 4, 0 },
-        { 112, 184, 144, 28, 4, 4, 4, 0 },
-        { 5776, 64, 32, 36, 4, 4, 4, 0 },
-        { 1568, 64, 40, 36, 4, 4, 4, 0 },
-        { 2920, 64, 64, 24, 4, 4, 4, 0 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_nt = {
+        {3136, 64, 64, 36, 4, 4, 4, 0}, {4096, 48, 32, 36, 4, 4, 4, 0},  {688, 92, 68, 32, 4, 4, 4, 0},
+        {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 4, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+        {1568, 64, 40, 36, 4, 4, 4, 0}, {2920, 64, 64, 24, 4, 4, 4, 0}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_best_t_t =
-    {
-        { 3136, 64, 64, 36, 4, 4, 16, 1 },
-        { 4096, 48, 32, 36, 4, 4, 8, 1 },
-        { 688, 92, 68, 32, 4, 4, 4, 1 },
-        { 24, 464, 412, 24, 4, 2, 8, 1 },
-        { 112, 184, 144, 28, 4, 2, 16, 1 },
-        { 5776, 64, 32, 36, 4, 4, 16, 1 },
-        { 1568, 64, 40, 36, 4, 4, 8, 1 },
-        { 2920, 64, 64, 24, 4, 4, 16, 1 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+        {3136, 64, 64, 36, 4, 4, 16, 1}, {4096, 48, 32, 36, 4, 4, 8, 1},   {688, 92, 68, 32, 4, 4, 4, 1},
+        {24, 464, 412, 24, 4, 2, 8, 1},  {112, 184, 144, 28, 4, 2, 16, 1}, {5776, 64, 32, 36, 4, 4, 16, 1},
+        {1568, 64, 40, 36, 4, 4, 8, 1},  {2920, 64, 64, 24, 4, 4, 16, 1}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t =
-    {
-        { 3136, 64, 64, 36, 4, 4, 8, 0 },
-        { 4096, 48, 32, 36, 4, 4, 8, 0 },
-        { 688, 92, 68, 32, 4, 4, 8, 0 },
-        { 24, 464, 412, 24, 4, 4, 8, 0 },
-        { 112, 184, 144, 28, 4, 4, 8, 0 },
-        { 5776, 64, 32, 36, 4, 4, 8, 0 },
-        { 1568, 64, 40, 36, 4, 4, 8, 0 },
-        { 2920, 64, 64, 24, 4, 4, 8, 0 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_fallback_t_t = {
+        {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0},  {688, 92, 68, 32, 4, 4, 8, 0},
+        {24, 464, 412, 24, 4, 4, 8, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+        {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
 
     const bool adj_lhs = info.adj_lhs();
     const bool adj_rhs = info.adj_rhs();
@@ -318,17 +209,17 @@
     const MatMulNativeConfigsMatrix *configs_best_to_use     = nullptr;
     const MatMulNativeConfigsMatrix *configs_fallback_to_use = nullptr;
 
-    if((adj_lhs == false) && (adj_rhs == false))
+    if ((adj_lhs == false) && (adj_rhs == false))
     {
         configs_best_to_use     = &configs_mnkb_best_nt_nt;
         configs_fallback_to_use = &configs_mnkb_fallback_nt_nt;
     }
-    else if((adj_lhs == false) && (adj_rhs == true))
+    else if ((adj_lhs == false) && (adj_rhs == true))
     {
         configs_best_to_use     = &configs_mnkb_best_nt_t;
         configs_fallback_to_use = &configs_mnkb_fallback_nt_t;
     }
-    else if((adj_lhs == true) && (adj_rhs == false))
+    else if ((adj_lhs == true) && (adj_rhs == false))
     {
         configs_best_to_use     = &configs_mnkb_best_t_nt;
         configs_fallback_to_use = &configs_mnkb_fallback_t_nt;
@@ -342,75 +233,46 @@
     MatMulKernelInfo desc0 = find_info(*configs_best_to_use, adj_lhs, adj_rhs, m, n, k, b);
     MatMulKernelInfo desc1 = find_info(*configs_fallback_to_use, adj_lhs, adj_rhs, m, n, k, b);
 
-    return select_info(desc0,
-                       desc1,
-                       m, n, k, b, DataType::F16, rhs_lock_padding);
+    return select_info(desc0, desc1, m, n, k, b, DataType::F16, rhs_lock_padding);
 }
 
-MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
+MatMulKernelInfo ClMatMulNativeDefaultConfigValhall::configure_G710_u8(
+    unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info)
 {
     ARM_COMPUTE_UNUSED(rhs_lock_padding);
 
-    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt =
-    {
-        { 3136, 64, 64, 36, 6, 4, 4, 0 },
-        { 4096, 48, 32, 36, 6, 4, 4, 0 },
-        { 688, 92, 68, 32, 2, 8, 4, 0 },
-        { 24, 464, 412, 24, 4, 4, 4, 0 },
-        { 112, 184, 144, 28, 6, 4, 4, 0 },
-        { 5776, 64, 32, 36, 6, 4, 4, 0 },
-        { 1568, 64, 40, 36, 6, 4, 4, 0 },
-        { 2920, 64, 64, 24, 5, 4, 4, 0 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_nt = {
+        {3136, 64, 64, 36, 6, 4, 4, 0}, {4096, 48, 32, 36, 6, 4, 4, 0},  {688, 92, 68, 32, 2, 8, 4, 0},
+        {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 6, 4, 4, 0}, {5776, 64, 32, 36, 6, 4, 4, 0},
+        {1568, 64, 40, 36, 6, 4, 4, 0}, {2920, 64, 64, 24, 5, 4, 4, 0}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t =
-    {
-        { 3136, 64, 64, 36, 4, 4, 16, 0 },
-        { 4096, 48, 32, 36, 4, 4, 16, 0 },
-        { 688, 92, 68, 32, 4, 4, 16, 0 },
-        { 24, 464, 412, 24, 6, 2, 16, 0 },
-        { 112, 184, 144, 28, 4, 4, 16, 0 },
-        { 5776, 64, 32, 36, 4, 4, 16, 0 },
-        { 1568, 64, 40, 36, 6, 4, 4, 0 },
-        { 2920, 64, 64, 24, 4, 4, 16, 0 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_best_nt_t = {
+        {3136, 64, 64, 36, 4, 4, 16, 0}, {4096, 48, 32, 36, 4, 4, 16, 0},  {688, 92, 68, 32, 4, 4, 16, 0},
+        {24, 464, 412, 24, 6, 2, 16, 0}, {112, 184, 144, 28, 4, 4, 16, 0}, {5776, 64, 32, 36, 4, 4, 16, 0},
+        {1568, 64, 40, 36, 6, 4, 4, 0},  {2920, 64, 64, 24, 4, 4, 16, 0}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt =
-    {
-        { 3136, 64, 64, 36, 4, 4, 8, 0 },
-        { 4096, 48, 32, 36, 4, 4, 8, 0 },
-        { 688, 92, 68, 32, 4, 4, 4, 0 },
-        { 24, 464, 412, 24, 4, 4, 4, 0 },
-        { 112, 184, 144, 28, 4, 4, 8, 0 },
-        { 5776, 64, 32, 36, 4, 4, 8, 0 },
-        { 1568, 64, 40, 36, 4, 4, 8, 0 },
-        { 2920, 64, 64, 24, 4, 4, 8, 0 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_best_t_nt = {
+        {3136, 64, 64, 36, 4, 4, 8, 0}, {4096, 48, 32, 36, 4, 4, 8, 0},  {688, 92, 68, 32, 4, 4, 4, 0},
+        {24, 464, 412, 24, 4, 4, 4, 0}, {112, 184, 144, 28, 4, 4, 8, 0}, {5776, 64, 32, 36, 4, 4, 8, 0},
+        {1568, 64, 40, 36, 4, 4, 8, 0}, {2920, 64, 64, 24, 4, 4, 8, 0}};
 
-    const MatMulNativeConfigsMatrix configs_mnkb_best_t_t =
-    {
-        { 3136, 64, 64, 36, 4, 2, 16, 0 },
-        { 4096, 48, 32, 36, 4, 4, 4, 0 },
-        { 688, 92, 68, 32, 4, 4, 8, 0 },
-        { 24, 464, 412, 24, 4, 2, 16, 0 },
-        { 112, 184, 144, 28, 4, 2, 16, 0 },
-        { 5776, 64, 32, 36, 4, 4, 4, 0 },
-        { 1568, 64, 40, 36, 4, 4, 8, 0 },
-        { 2920, 64, 64, 24, 4, 2, 16, 0 }
-    };
+    const MatMulNativeConfigsMatrix configs_mnkb_best_t_t = {
+        {3136, 64, 64, 36, 4, 2, 16, 0}, {4096, 48, 32, 36, 4, 4, 4, 0},   {688, 92, 68, 32, 4, 4, 8, 0},
+        {24, 464, 412, 24, 4, 2, 16, 0}, {112, 184, 144, 28, 4, 2, 16, 0}, {5776, 64, 32, 36, 4, 4, 4, 0},
+        {1568, 64, 40, 36, 4, 4, 8, 0},  {2920, 64, 64, 24, 4, 2, 16, 0}};
 
     const bool adj_lhs = info.adj_lhs();
     const bool adj_rhs = info.adj_rhs();
 
-    if((adj_lhs == false) && (adj_rhs == false))
+    if ((adj_lhs == false) && (adj_rhs == false))
     {
         return find_info(configs_mnkb_best_nt_nt, adj_lhs, adj_rhs, m, n, k, b);
     }
-    else if((adj_lhs == false) && (adj_rhs == true))
+    else if ((adj_lhs == false) && (adj_rhs == true))
     {
         return find_info(configs_mnkb_best_nt_t, adj_lhs, adj_rhs, m, n, k, b);
     }
-    else if((adj_lhs == true) && (adj_rhs == false))
+    else if ((adj_lhs == true) && (adj_rhs == false))
     {
         return find_info(configs_mnkb_best_t_nt, adj_lhs, adj_rhs, m, n, k, b);
     }
@@ -419,5 +281,5 @@
         return find_info(configs_mnkb_best_t_t, adj_lhs, adj_rhs, m, n, k, b);
     }
 }
-} // namespace opencl
+} // namespace cl_matmul
 } // namespace arm_compute
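
Note on the heuristics in this file: each "best" table pairs with a "fallback"
table because the best rows may set the trailing IMG_RHS flag (export the RHS
tensor to an OpenCL image), which is not supported for every device and shape.
select_info() (ClMatMulNativeHelpers.cpp, below) validates the best candidate
against the kernel and otherwise returns the fallback. A minimal sketch of that
decision; 'choose' and 'validate_ok' are illustrative names standing in for the
real ClMatMulNativeKernel::validate check, not part of the library:

    // Minimal sketch only, not the library's implementation.
    MatMulKernelInfo choose(const MatMulKernelInfo &best,
                            const MatMulKernelInfo &fallback,
                            bool rhs_lock_padding, bool validate_ok)
    {
        // Exporting the RHS to a cl_image can require padding, so the best
        // candidate is only considered when the RHS padding is not locked.
        if (!rhs_lock_padding && validate_ok)
        {
            return best;
        }
        return fallback;
    }
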
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
index fe167d1..6b39db6 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeDefaultConfigValhall.h
@@ -44,10 +44,13 @@
     MatMulKernelInfo configure(const ITensorInfo *lhs, const ITensorInfo *rhs, const MatMulInfo &info) override;
 
 private:
-    MatMulKernelInfo configure_G710_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
-    MatMulKernelInfo configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
-    MatMulKernelInfo configure_G710_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    MatMulKernelInfo configure_G710_f32(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    MatMulKernelInfo configure_G710_f16(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
+    MatMulKernelInfo configure_G710_u8(
+        unsigned int m, unsigned int n, unsigned int k, unsigned int b, bool rhs_lock_padding, const MatMulInfo &info);
 };
-} // namespace opencl
+} // namespace cl_matmul
 } // namespace arm_compute
 #endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEDEFAULTCONFIGVALHALL */
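
The three per-precision entry points declared above are selected at run time
from the tensor data type, following the ClMatMulNativeConfigArray pattern
further down in this patch. A standalone toy of that member-function-pointer
dispatch; every name in the sketch is illustrative, not the library's:

    #include <cstdio>

    enum class DataType { F32, F16, Int8 };

    struct ValhallConfig
    {
        // Stand-ins for configure_G710_f32 / configure_G710_f16 / configure_G710_u8.
        int cfg_f32(int m, int n)  { return m + n; }
        int cfg_f16(int m, int n)  { return m - n; }
        int cfg_int8(int m, int n) { return m * n; }

        // Route to the per-precision helper selected by the data type.
        int configure(DataType dt, int m, int n)
        {
            using Fn = int (ValhallConfig::*)(int, int);
            Fn fn = &ValhallConfig::cfg_int8;
            switch (dt)
            {
                case DataType::F32: fn = &ValhallConfig::cfg_f32; break;
                case DataType::F16: fn = &ValhallConfig::cfg_f16; break;
                default:            break;
            }
            return (this->*fn)(m, n);
        }
    };

    int main()
    {
        ValhallConfig cfg;
        std::printf("%d\n", cfg.configure(DataType::F32, 64, 32)); // prints 96
        return 0;
    }
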
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
index 1e06e84..89cad30 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/TensorInfo.h"
 #include "arm_compute/core/TensorShape.h"
+
 #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h"
 
 #include <limits>
@@ -37,22 +38,32 @@
 {
 MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
                              const MatMulKernelInfo &info1,
-                             unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool rhs_lock_padding)
+                             unsigned int            m,
+                             unsigned int            n,
+                             unsigned int            k,
+                             unsigned int            b,
+                             DataType                data_type,
+                             bool                    rhs_lock_padding)
 {
-    ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true, "The fallback MatMul configuration cannot have export_to_cl_image = true");
-    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs, "The MatMul configurations must have the same adj_lhs value");
-    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs, "The MatMul configurations must have the same adj_rhs value");
+    ARM_COMPUTE_ERROR_ON_MSG(info1.export_rhs_to_cl_image == true,
+                             "The fallback MatMul configuration cannot have export_rhs_to_cl_image = true");
+    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_lhs != info1.adj_lhs,
+                             "The MatMul configurations must have the same adj_lhs value");
+    ARM_COMPUTE_ERROR_ON_MSG(info0.adj_rhs != info1.adj_rhs,
+                             "The MatMul configurations must have the same adj_rhs value");
 
     const bool adj_lhs = info0.adj_lhs;
     const bool adj_rhs = info0.adj_rhs;
 
-    TensorInfo lhs_info = !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type);
-    TensorInfo rhs_info = !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type);
+    TensorInfo lhs_info =
+        !adj_lhs ? TensorInfo(TensorShape(k, m, b), 1, data_type) : TensorInfo(TensorShape(m, k, b), 1, data_type);
+    TensorInfo rhs_info =
+        !adj_rhs ? TensorInfo(TensorShape(n, k, b), 1, data_type) : TensorInfo(TensorShape(k, n, b), 1, data_type);
     TensorInfo dst_info;
 
-    if(rhs_lock_padding == false)
+    if (rhs_lock_padding == false)
     {
-        if(bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0)))
+        if (bool(opencl::kernels::ClMatMulNativeKernel::validate(&lhs_info, &rhs_info, nullptr, &dst_info, info0)))
         {
             return info0;
         }
@@ -67,7 +78,13 @@
     }
 }
 
-MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lhs, bool adj_rhs, unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+                           bool                             adj_lhs,
+                           bool                             adj_rhs,
+                           unsigned int                     m,
+                           unsigned int                     n,
+                           unsigned int                     k,
+                           unsigned int                     b)
 {
     size_t min_acc = std::numeric_limits<size_t>::max();
     size_t min_idx = 0;
@@ -76,12 +93,13 @@
     const size_t num_rows = configs.size();
     const size_t num_cols = configs[0].size();
 
-    ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U, "The entry should have 8 integer values representing: M, N, K, B, M0, N0. K0, IMG_RHS");
+    ARM_COMPUTE_ERROR_ON_MSG(num_cols != 8U,
+                             "The entry should have 8 integer values representing: M, N, K, B, M0, N0, K0, IMG_RHS");
     ARM_COMPUTE_UNUSED(num_cols);
 
     // Find the nearest GeMM workload
     // Note: all of M, N, K and B contribute to the distance computed below
-    for(size_t y = 0; y < num_rows; ++y)
+    for (size_t y = 0; y < num_rows; ++y)
     {
         size_t mc0 = static_cast<size_t>(configs[y][0]);
         size_t nc0 = static_cast<size_t>(configs[y][1]);
@@ -94,7 +112,7 @@
         acc += (k - kc0) * (k - kc0);
         acc += (b - bc0) * (b - bc0);
         acc = std::sqrt(acc);
-        if(acc < min_acc)
+        if (acc < min_acc)
         {
             min_acc = acc;
             min_idx = y;
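
For reference, the loop above is a nearest-neighbour lookup: the row whose
reference shape (M, N, K, B) minimises the Euclidean distance to the query
shape wins, and its M0/N0/K0/IMG_RHS entries configure the kernel. A
self-contained sketch of the same idea, with illustrative names and
double-precision accumulation in place of the size_t arithmetic above:

    #include <array>
    #include <cmath>
    #include <cstddef>
    #include <cstdint>
    #include <limits>
    #include <vector>

    // Each row: {M, N, K, B, M0, N0, K0, IMG_RHS}
    using ConfigRow = std::array<int32_t, 8>;

    // Return the row closest to the query shape; assumes 'rows' is non-empty.
    const ConfigRow &pick_nearest(const std::vector<ConfigRow> &rows,
                                  int32_t m, int32_t n, int32_t k, int32_t b)
    {
        std::size_t best_idx  = 0;
        double      best_dist = std::numeric_limits<double>::max();
        for (std::size_t y = 0; y < rows.size(); ++y)
        {
            const double dm   = m - rows[y][0];
            const double dn   = n - rows[y][1];
            const double dk   = k - rows[y][2];
            const double db   = b - rows[y][3];
            const double dist = std::sqrt(dm * dm + dn * dn + dk * dk + db * db);
            if (dist < best_dist)
            {
                best_dist = dist;
                best_idx  = y;
            }
        }
        return rows[best_idx];
    }
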
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
index 3881617..a114fff 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeHelpers.h
@@ -52,7 +52,12 @@
  */
 MatMulKernelInfo select_info(const MatMulKernelInfo &info0,
                              const MatMulKernelInfo &info1,
-                             unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type, bool rhs_lock_padding);
+                             unsigned int            m,
+                             unsigned int            n,
+                             unsigned int            k,
+                             unsigned int            b,
+                             DataType                data_type,
+                             bool                    rhs_lock_padding);
 
 /** Find the preferred configurations for the MatMul Native kernel using the MatMulNativeConfigsMatrix provided by the user
  *
@@ -66,7 +71,13 @@
  *
  * @return @ref MatMulKernelInfo
  */
-MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs, bool adj_lhs, bool adj_rhs, unsigned int m, unsigned int n, unsigned int k, unsigned int b);
+MatMulKernelInfo find_info(const MatMulNativeConfigsMatrix &configs,
+                           bool                             adj_lhs,
+                           bool                             adj_rhs,
+                           unsigned int                     m,
+                           unsigned int                     n,
+                           unsigned int                     k,
+                           unsigned int                     b);
 } // namespace cl_matmul
 } // namespace arm_compute
 #endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEHELPERS */
diff --git a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
index a2dbfc7..b10018a 100644
--- a/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
+++ b/src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h
@@ -45,7 +45,7 @@
      */
     static std::unique_ptr<IClMatMulNativeKernelConfig> create(GPUTarget gpu)
     {
-        switch(get_arch_from_target(gpu))
+        switch (get_arch_from_target(gpu))
         {
             case GPUTarget::MIDGARD:
             case GPUTarget::BIFROST:
@@ -56,6 +56,6 @@
         }
     }
 };
-} // namespace opencl
+} // namespace cl_matmul
 } // namespace arm_compute
 #endif /* SRC_RUNTIME_HEURISTICS_MATMUL_NATIVE_CLMATMULNATIVEKERNELCONFIG */
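
Hedged usage sketch for the factory above: 'gpu_target', 'lhs', 'rhs' and
'matmul_info' are assumed to exist in the caller (the GPU target would
typically be queried from the CL runtime), and in this patch MIDGARD and
BIFROST fall through to the same default configuration as VALHALL:

    // Illustrative call sequence only.
    std::unique_ptr<IClMatMulNativeKernelConfig> config =
        ClMatMulNativeKernelConfig::create(gpu_target);

    // configure() signature as declared in IClMatMulNativeKernelConfig.h.
    const MatMulKernelInfo kernel_info = config->configure(lhs, rhs, matmul_info);
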
diff --git a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
index 4f548bd..b9b0911 100644
--- a/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
+++ b/src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h
@@ -28,6 +28,7 @@
 #include "arm_compute/core/KernelDescriptors.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/function_info/MatMulInfo.h"
+
 #include "src/core/common/Macros.h"
 
 namespace arm_compute
@@ -53,8 +54,7 @@
      * @param[in] func_int8 Function to call for matmul native Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
      *
      */
-    ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8)
-        : _configs{ func_f32, func_f16, func_int8 }
+    ClMatMulNativeConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8}
     {
     }
 
@@ -66,7 +66,7 @@
      */
     T get_function(DataType data_type)
     {
-        switch(data_type)
+        switch (data_type)
         {
             case DataType::F32:
                 return _configs.at(DT_F32);
@@ -93,8 +93,7 @@
      *
      * @param[in] arch GPU target
      */
-    IClMatMulNativeKernelConfig(GPUTarget arch)
-        : _target(arch)
+    IClMatMulNativeKernelConfig(GPUTarget arch) : _target(arch)
     {
     }
     ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClMatMulNativeKernelConfig);